In [2]:
import pandas as pd
import pyodbc
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [8]:
def fetch_data_from_sql(server, database, query):
    conn_str = (f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server};DATABASE={database};Trusted_Connection=yes;')
    conn = pyodbc.connect(conn_str)

    df = pd.read_sql_query(query, conn)
    conn.close()
    return df
   
query = """
    SELECT
    ReviewID    as 'Review ID',
    CustomerID  as 'Customer ID',
    ProductID   as 'Product ID',
    CONVERT( date, ReviewDate ) as 'Review Date',
    Rating,
    REPLACE(ReviewText, '  ', ' ') as 'Review Text'
    FROM [MarketingData].[dbo].[customer_reviews]
    """   


reviews_df = fetch_data_from_sql ('AKSHY-PC', 'MarketingData', query)

reviews_df.head(10)

  df = pd.read_sql_query(query, conn)


Unnamed: 0,Review ID,Customer ID,Product ID,Review Date,Rating,Review Text
0,1,77,18,2023-12-23,3,"Average experience, nothing special."
1,2,80,19,2024-12-25,5,The quality is top-notch.
2,3,50,13,2025-01-26,4,Five stars for the quick delivery.
3,4,78,15,2025-04-21,3,"Good quality, but could be cheaper."
4,5,64,2,2023-07-16,3,"Average experience, nothing special."
5,6,81,1,2025-12-21,4,Customer support was very helpful.
6,7,16,1,2024-01-29,3,"Average experience, nothing special."
7,8,55,8,2024-08-15,5,The quality is top-notch.
8,9,3,13,2023-09-01,4,"I love this product, will buy again!"
9,10,78,6,2024-06-17,5,"Excellent product, highly recommend!"


In [9]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = ['Negative', 'Neutral', 'Positive']

def analyze_sentiment(text):
    text = str(text)
    tokens = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**tokens)
        probs = torch.softmax(outputs.logits, dim=1)[0].cpu().numpy()
    label = labels[probs.argmax()]
    confidence = probs.max()
    return label, confidence
reviews_df[['Sentiment', 'Confidence']] = reviews_df['Review Text'].apply(lambda x: pd.Series(analyze_sentiment(x)))
reviews_df.head(100)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unnamed: 0,Review ID,Customer ID,Product ID,Review Date,Rating,Review Text,Sentiment,Confidence
0,1,77,18,2023-12-23,3,"Average experience, nothing special.",Negative,0.611943
1,2,80,19,2024-12-25,5,The quality is top-notch.,Positive,0.972466
2,3,50,13,2025-01-26,4,Five stars for the quick delivery.,Positive,0.952633
3,4,78,15,2025-04-21,3,"Good quality, but could be cheaper.",Positive,0.613606
4,5,64,2,2023-07-16,3,"Average experience, nothing special.",Negative,0.611943
...,...,...,...,...,...,...,...,...
95,96,19,13,2023-09-02,3,"Good quality, but could be cheaper.",Positive,0.613606
96,97,64,6,2024-01-19,3,"The product is okay, but the instructions were...",Neutral,0.558915
97,98,96,3,2025-11-20,5,Exceeded my expectations!,Positive,0.961791
98,99,79,16,2025-01-29,2,"Average experience, nothing special.",Negative,0.611943


In [11]:
# reviews_df.head(100)
def categorize_sentiment(score, rating):
    if score == "Positive":
        if rating >= 4:
            return "Positive"
        
        elif rating == 3:
            return 'Mixed Positive'
        
        else:
            return 'Mixed negative'      
    
    if score == "Negative":
        if rating <= 2:
            return "Negative"
        
        elif rating == 3:
            return 'Mixed Negative'
        
        else:
            return 'Mixed Positive'
    
    else:
        if rating >= 4:
            return 'Mixed Positive'
        
        elif rating == 3:
            return 'Neutral'
        
        else:
            return 'Mixed Negative'
       
reviews_df['Sentiment Category'] = reviews_df.apply(lambda row: categorize_sentiment(row['Sentiment'], row['Rating']), axis=1)
reviews_df.head(100)

# reviews_df.to_csv('Sentiment_Analyzed_Reviews.csv', index=False, encoding='utf-8-sig')


Unnamed: 0,Review ID,Customer ID,Product ID,Review Date,Rating,Review Text,Sentiment,Confidence,Sentiment Category
0,1,77,18,2023-12-23,3,"Average experience, nothing special.",Negative,0.611943,Mixed Negative
1,2,80,19,2024-12-25,5,The quality is top-notch.,Positive,0.972466,Positive
2,3,50,13,2025-01-26,4,Five stars for the quick delivery.,Positive,0.952633,Positive
3,4,78,15,2025-04-21,3,"Good quality, but could be cheaper.",Positive,0.613606,Mixed Positive
4,5,64,2,2023-07-16,3,"Average experience, nothing special.",Negative,0.611943,Mixed Negative
...,...,...,...,...,...,...,...,...,...
95,96,19,13,2023-09-02,3,"Good quality, but could be cheaper.",Positive,0.613606,Mixed Positive
96,97,64,6,2024-01-19,3,"The product is okay, but the instructions were...",Neutral,0.558915,Neutral
97,98,96,3,2025-11-20,5,Exceeded my expectations!,Positive,0.961791,Positive
98,99,79,16,2025-01-29,2,"Average experience, nothing special.",Negative,0.611943,Negative
