In [1]:
#Sentiment Analysis also known as opinion, is a techniquu used in natural language processing [NLP] to
# determine the emotional undertone of a document

In [2]:
# Data Manipulation & Math
import numpy as np
import pandas as pd

# Natural Language Toolkit for text processing
import nltk    
# VADER sentiment analysis tool (positive/negative/neutral scores)
from nltk.sentiment.vader import SentimentIntensityAnalyzer   

# Regular expressions for pattern matching in text
import re 

# Simple text processing library (sentiment, noun phrases, etc.)
from textblob import TextBlob  

# Creates visual representations of text data where bigger words = more frequent
from wordcloud import WordCloud 

# Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Connects pandas with plotly for interactive plots
import cufflinks as cf 

# Displays plots directly in Jupyter notebook
%matplotlib inline  

# Enables offline plotting without internet
from plotly.offline import init_notebook_mode, iplot 

# Sets up plotly for Jupyter notebooks
init_notebook_mode(connected = True) 

# Makes cufflinks work offline
cf.go_offline()

# Creates interactive plot objects (bars, lines, etc.)
import plotly.graph_objs as go  

# Creates multiple plots in one figure
from plotly.subplots import make_subplots 

# Controls warning messages - FIXED TYPO
import warnings 

# Suppresses all warning messages
warnings.filterwarnings("ignore") 

# Shows ALL columns in pandas DataFrames (no truncation)
pd.set_option('display.max_columns', None) 

print("✅ All libraries imported successfully!")
print("🚀 Ready for sentiment analysis!")

✅ All libraries imported successfully!
🚀 Ready for sentiment analysis!


In [3]:
df = pd.read_csv("Reviews.csv")

In [4]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [6]:
df.shape

(568454, 10)

In [7]:
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [8]:

# Function to analyze missing values
def missing_values_analysis(df):
    na_columns = [col for col in df.columns if df[col].isnull().sum() > 0]
    n_miss = df[na_columns].isnull().sum().sort_values(ascending=False)
    ratio = (df[na_columns].isnull().sum() / df.shape[0] * 100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['Missing Values', 'Ratio (%)'])
    return missing_df

# Function to check dataframe info
def check_dataframe(df, head=5, tail=5):
    print("SHAPE".center(82, '-'))
    print(f"Rows: {df.shape[0]}")
    print(f"Columns: {df.shape[1]}")

    print("TYPES".center(82, '-'))
    print(df.dtypes)

    print("MISSING VALUES".center(82, '-'))
    print(missing_values_analysis(df))

    print("DUPLICATED VALUES".center(82, '-'))
    print(df.duplicated().sum())

    print("QUANTILES".center(82, '-'))
    numeric_df = df.select_dtypes(include=['number'])  # ⬅️ Only numeric columns
    print(numeric_df.quantile([0, 0.05, 0.5, 0.95, 0.99, 1]).T)

# Example usage
check_dataframe(df)


--------------------------------------SHAPE---------------------------------------
Rows: 568454
Columns: 10
--------------------------------------TYPES---------------------------------------
Id                         int64
ProductId                 object
UserId                    object
ProfileName               object
HelpfulnessNumerator       int64
HelpfulnessDenominator     int64
Score                      int64
Time                       int64
Summary                   object
Text                      object
dtype: object
----------------------------------MISSING VALUES----------------------------------
             Missing Values  Ratio (%)
Summary                  27        0.0
ProfileName              26        0.0
--------------------------------DUPLICATED VALUES---------------------------------
0
------------------------------------QUANTILES-------------------------------------
                               0.00          0.05          0.50          0.95  \
Id              

In [9]:
def check_class(dataframe):
    nunique_df = pd.DataFrame({
        'Variable': dataframe.columns,
        'Classes': [dataframe[i].nunique() for i in dataframe.columns]
    })
    
    nunique_df = nunique_df.sort_values('Classes', ascending=False)
    nunique_df = nunique_df.reset_index(drop=True)
    
    return nunique_df

# Example usage
check_class(df)


Unnamed: 0,Variable,Classes
0,Id,568454
1,Text,393579
2,Summary,295742
3,UserId,256059
4,ProfileName,218415
5,ProductId,74258
6,Time,3168
7,HelpfulnessDenominator,234
8,HelpfulnessNumerator,231
9,Score,5


In [10]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Pretty color palette 💅
constraints = ['#B34D72', '#BEBDC3', '#1F80BC', '#C962E8', '#BEBDC5']

def categorical_variable_summary(df, column_name):
    if column_name not in df.columns:
        print(f"❌ Column '{column_name}' not found in the DataFrame!")
        print(f"✅ Available columns: {list(df.columns)}")
        return

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Countplot', 'Percentage'),
        specs=[[{'type': 'xy'}, {'type': 'domain'}]]
    )

    # Bar chart
    fig.add_trace(
        go.Bar(
            y=df[column_name].value_counts().values.tolist(),
            x=[str(i) for i in df[column_name].value_counts().index],
            text=df[column_name].value_counts().values.tolist(),
            textfont=dict(size=14),
            name=column_name,
            textposition='auto',
            showlegend=False,
            marker=dict(
                color=constraints,
                line=dict(color='#0B6E6C', width=1)
            )
        ),
        row=1, col=1
    )

    # Pie chart
    fig.add_trace(
        go.Pie(
            labels=df[column_name].value_counts().keys(),
            values=df[column_name].value_counts().values,
            textfont=dict(size=18),
            textposition='auto',
            name=column_name,
            showlegend=False,
            marker=dict(colors=constraints)
        ),
        row=1, col=2
    )

    # Layout without xanchor/yanchor errors
    fig.update_layout(
        title=dict(
            text=f"✨✨✨✨ Distribution of {column_name} ✨✨✨✨",
            x=0.5,  # center the title
            xanchor='center'  # valid inside title dict
        ),
        template='plotly_white'
    )

    fig.show()


In [11]:
categorical_variable_summary(df, 'Score')  # Replace 'Score' with your column


In [12]:
# Show all column names
print(df.columns.tolist())

['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text']


In [13]:
df.Text.head()

0    I have bought several of the Vitality canned d...
1    Product arrived labeled as Jumbo Salted Peanut...
2    This is a confection that has been around a fe...
3    If you are looking for the secret ingredient i...
4    Great taffy at a great price.  There was a wid...
Name: Text, dtype: object

In [14]:
df.Summary.head()

0    Good Quality Dog Food
1        Not as Advertised
2    "Delight" says it all
3           Cough Medicine
4              Great taffy
Name: Summary, dtype: object

In [15]:
text_example = df.Text[20231]
text_example

'Product was exactly as described, and shipping was insanely reasonable.  Fit the bill exactly, and very reasonably priced.'

In [16]:
summary_example = df.Summary[20231]
summary_example

'Excellent Store!'

In [17]:
cleaned_text = re.sub("[^a-zA-Z]", " ", text_example)
print(cleaned_text)

Product was exactly as described  and shipping was insanely reasonable   Fit the bill exactly  and very reasonably priced 


In [18]:
cleaned_text = text_example.lower().split()
cleaned_text

['product',
 'was',
 'exactly',
 'as',
 'described,',
 'and',
 'shipping',
 'was',
 'insanely',
 'reasonable.',
 'fit',
 'the',
 'bill',
 'exactly,',
 'and',
 'very',
 'reasonably',
 'priced.']

In [19]:
rt = lambda x: re.sub("[^a-zA-Z]", " ", str(x))
df["Text"] = df["Text"].map(rt)
df["Text"] = df["Text"].str.lower()
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,i have bought several of the vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,product arrived labeled as jumbo salted peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",this is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,if you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,great taffy at a great price there was a wid...


In [None]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Clean the text column
clean_text = lambda x: re.sub("[^a-zA-Z]", " ", str(x))
df['Text'] = df['Text'].map(clean_text)
df['Text'] = df['Text'].str.lower()


df[['polarity', 'subjectivity']] = df['Text'].apply(
    lambda Text: pd.Series(TextBlob(Text).sentiment)
)


analyzer = SentimentIntensityAnalyzer()

# Apply sentiment classification
for index, row in df['Text'].items():
    score = analyzer.polarity_scores(row)
    neg = score['neg']
    pos = score['pos']

    if neg > pos:
        df.loc[index, 'sentiment'] = 'Negative'
    elif pos > neg:
        df.loc[index, 'sentiment'] = 'Positive'
    else:
        df.loc[index, 'sentiment'] = 'Neutral'


top_positive = df[df["sentiment"] == "Positive"].sort_values(
    "wilson_lower_bound", ascending=False
).head(10)

# Display
from IPython.display import display
display(top_positive)

constraints = ['#B34D72', '#BEBDC3', '#1F80BC', '#C962E8', '#BEBDC5']

def categorical_variable_summary(df, column_name):
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Countplot', 'Percentage'),
        specs=[[{'type': 'xy'}, {'type': 'domain'}]]
    )

    value_counts = df[column_name].value_counts()

    fig.add_trace(
        go.Bar(
            y=value_counts.values,
            x=value_counts.index.astype(str),
            text=value_counts.values,
            textposition='auto',
            marker=dict(
                color=constraints,
                line=dict(color='#0B6E6C', width=1)
            ),
            showlegend=False
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Pie(
            labels=value_counts.index,
            values=value_counts.values,
            marker=dict(colors=constraints),
            showlegend=False
        ),
        row=1, col=2
    )

    fig.update_layout(
        title_text=f"Distribution of {column_name}",
        title_x=0.5,
        template='plotly_white'
    )

    fig.show()

# Call the function
categorical_variable_summary(df, 'sentiment')


In [None]:

# Top 10 most positively rated rows using wilson_lower_bound
df[df["sentiment"] == "Positive"].sort_values("wilson_lower_bound", ascending=False).head(10)