# project name: Sentiment analysis

# Step 0. Read in Data and NLTK Basics


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import nltk
nltk.download('punkt')
nltk.download('popular')


In [None]:
# Read in data
# Load the full reviews CSV, report its size, then keep only the first 100 rows.
df = pd.read_csv('D:/Reviews.csv')
print('the shape of all dataset is: ',df.shape)
print("Dataset size:", len(df))
# Take an explicit copy of the first 100 rows so that later in-place changes
# (such as dropping columns) act on an independent frame instead of a slice of
# the full dataset, which can trigger pandas' SettingWithCopyWarning.
df = df.head(100).copy()
print('the shape of our data is: ' ,df.shape )
print("Dataset size:", len(df))

In [None]:
print('the first 5 review is: ')
print(' ')
df.head()

In [None]:
print('the last 5 review is: ')
print('')
df.tail()

In [None]:
print('the shape of data is: ',df.shape)

print("Dataset size:", len(df))


In [None]:

print('th information of data is:')
print('')
df.info()

In [None]:
df.duplicated().sum()

In [None]:
df.describe()

In [None]:
df.describe(include="O")

In [None]:
df.describe(include='all')

In [None]:
df.dtypes

# Data Cleaning

    Check Missing Values

In [None]:
df.isnull().sum()

#  Missing Values

In [None]:
df.isnull().sum()[:20]

In [None]:
# Download the NLTK stop-word corpus, then build an English stop-word list and
# an English Snowball stemmer.
nltk.download('stopwords')

from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")
# NOTE(review): stop_words and stemmer are not referenced again in the visible
# part of this notebook — confirm whether they are still needed.


# Quick EDA and visualization


In [None]:
# Bar chart of the number of reviews per star rating, ordered by rating.
star_counts = df['Score'].value_counts().sort_index()
ax = star_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(5, 2.5),
)
ax.set_xlabel('Review Stars')
plt.show()

In [None]:
df.head()

In [None]:
df.Score.value_counts()

In [None]:
top_ProductId= df.ProductId.value_counts()
top_ProductId.head()

In [None]:
# Bar chart of the first five entries of the ProductId review counts.
top5_products = top_ProductId.head()
plt.figure(figsize=[10, 5])
sns.barplot(x=top5_products.index, y=top5_products)
plt.title('Top 5 top_ProductId');

In [None]:
# Count how many reviews each UserId has and preview the first five entries.
# NOTE(review): despite its name, top_Score holds UserId counts, not scores.
top_Score= df.UserId.value_counts()
top_Score.head()

In [None]:
# Bar chart of the first five entries of the per-user review counts.
# top_Score holds UserId value counts (not review scores), so the title names
# what is actually plotted.
plt.figure(figsize=[10, 5])
sns.barplot(x=top_Score.index[:5], y=top_Score.head())
plt.title('Top 5 UserId by Review Count');

In [None]:
# Remove the helpfulness, user and timestamp columns from df in place.
unused_columns = [
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'UserId',
    'ProfileName',
    'Time',
]
df.drop(columns=unused_columns, inplace=True)

In [None]:
df.head()

#  NLTK


In [None]:
# Pick the review text stored under row label 5 as the running example.
example = df.loc[5, 'Text']
print('this example from data:')
print('')
print(example)

In [None]:
tokens=nltk.word_tokenize(example)
tokens[:10]

In [None]:
tagged = nltk.pos_tag(tokens)
tagged[:10]

In [None]:
from nltk import ngrams

# Print every pair of consecutive whitespace-separated words (bigrams) in the
# example review.
sent = example
n = 2  # n-gram size; 2 produces bigrams
bigrams = ngrams(sent.split(), n)
for gram in bigrams:
    print(gram)

In [None]:
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

# Step 1. VADER Sentiment Scoring
We will use NLTK's SentimentIntensityAnalyzer to get the neg/neu/pos scores of the text.

This uses a "bag of words" approach:

- Stop words are removed

- Each word is scored, and the scores are combined into a total

In [None]:
nltk.download('vader_lexicon')
# Download the VADER lexicon resource used for the sentiment scoring below.

from nltk.sentiment import SentimentIntensityAnalyzer
# NLTK's sentiment analysis tool based on the VADER lexicon.

from tqdm.notebook import tqdm
# Provides a progress bar when iterating over loops.

sia = SentimentIntensityAnalyzer()
# Analyzer instance used for the VADER polarity scoring in this notebook.

In [None]:
sia.polarity_scores('I am so happy!')

In [None]:
sia.polarity_scores('This is the worst thing ever.')

In [None]:
sia.polarity_scores(example)

In [None]:
# Run the VADER polarity score on the entire dataset.
# res maps each review's 'Id' to the dict returned by sia.polarity_scores for
# that review's 'Text'.
####################################

# tqdm wraps the row iterator so a progress bar is shown while scoring.
res = {
    row['Id']: sia.polarity_scores(row['Text'])
    for _, row in tqdm(df.iterrows(), total=len(df))
}
In [None]:
# Turn the per-review VADER scores into a DataFrame and attach the review data.
###############################################

# res is keyed by Id, so transposing gives one review per row and one score per
# column; the row labels are then turned into a regular 'Id' column.
vaders = pd.DataFrame(res).T.rename_axis('Id').reset_index()

# Left-join the original review columns onto the scores via the 'Id' column.
vaders = vaders.merge(df, how='left', on='Id')


In [None]:
# Now we have sentiment score and metadata
vaders.head()

# Plot VADER results


In [None]:
# Bar plot of the VADER compound score (y) against the star rating (x).
ax = sns.barplot(x='Score', y='compound', data=vaders)
ax.set_title('Compund Score by Amazon Star Review')
plt.show()

In [None]:
# Side-by-side bar plots of the VADER pos / neu / neg scores per star rating.
# One row with three subplots; the figure is 12 wide by 3 high.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))

panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for axis, (column, title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=column, ax=axis)
    axis.set_title(title)

# Adjust the spacing between the subplots before rendering.
plt.tight_layout()
plt.show()

# Step 2. RoBERTa Pretrained Model

Use a model trained on a large corpus of data.

A transformer model accounts not only for the words but also for their context in relation to other words.

In [None]:
!pip install transformers

from transformers import AutoTokenizer
#The AutoTokenizer class is used for tokenizing text,

from transformers import AutoModelForSequenceClassification
#his class provides a pre-trained model

from scipy.special import softmax
#The scipy.special module provides a wide range
#of mathematical functions that are not included in the core Python math module.
# Softmax is commonly applied to convert raw scores into probability distributions,

In [None]:
# Initialize the tokenizer and the pre-trained classification model.
#####################################################

# Model identifier: a sentiment analysis model trained on Twitter data using
# the RoBERTa architecture.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# The tokenizer converts text input into the numerical tokens the model expects.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Sequence-classification model loaded from the same identifier.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


In [None]:
# VADER results on example
print(example)
sia.polarity_scores(example)
#perform sentiment analysis on

In [None]:
# Run the RoBERTa model on the example review
#######################

# Tokenize into PyTorch tensors ('pt') and pass them to the classifier.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)

# Take the scores for the single input, detach them from the graph, convert
# them to a NumPy array and turn them into probabilities with softmax.
scores = softmax(output[0][0].detach().numpy())

# Name the first three scores, in order.
labels = ('roberta_neg', 'roberta_neu', 'roberta_pos')
scores_dict = {label: scores[idx] for idx, label in enumerate(labels)}
print(scores_dict)

In [None]:
#polarity_scores_roberta that takes an example text as input and returns a dictionary of sentiment scores.
#######################################

def polarity_scores_roberta(example):
    """Return RoBERTa sentiment probabilities for one piece of text.

    Args:
        example: Text to score; passed directly to the module-level
            ``tokenizer``.

    Returns:
        dict with the keys ``'roberta_neg'``, ``'roberta_neu'`` and
        ``'roberta_pos'`` holding the first three softmax-normalised model
        scores, in that order.
    """
    # NOTE(review): no truncation is requested from the tokenizer; presumably
    # very long texts can make the model call fail — confirm.
    model_inputs = tokenizer(example, return_tensors='pt')

    # Scores for the single input, detached and converted to a NumPy array.
    raw_scores = model(**model_inputs)[0][0].detach().numpy()
    probabilities = softmax(raw_scores)

    labels = ('roberta_neg', 'roberta_neu', 'roberta_pos')
    return {
        label: probabilities[position]
        for position, label in enumerate(labels)
    }

In [None]:
# Apply both VADER and the pre-trained RoBERTa model to each review in df.
# res maps each review's 'Id' to one dict holding both sets of scores.

res = {}

# tqdm shows a progress bar while iterating over the rows.
for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['Text']
        myid = row['Id']

        # VADER scores, with every key prefixed by 'vader_' so they stay
        # distinguishable from the RoBERTa keys after merging.
        vader_scores = {
            f"vader_{name}": score
            for name, score in sia.polarity_scores(text).items()
        }
        roberta_scores = polarity_scores_roberta(text)

        res[myid] = {**vader_scores, **roberta_scores}
    except RuntimeError:
        # Report the review that failed and carry on with the next one.
        print(f'Broke for id {myid}')


In [None]:
# Build results_df: the combined sentiment scores stored in res, joined with
# the original review data in df.

# res is keyed by Id, so transposing gives one review per row and one score per
# column; the row labels are then turned into a regular 'Id' column.
results_df = pd.DataFrame(res).T.rename_axis('Id').reset_index()

# Left-join the original review columns onto the scores via the 'Id' column.
results_df = results_df.merge(df, how='left', on='Id')



# Compare Scores between models


In [None]:
print('the columns of data is : ')
results_df.columns

# Step 3. Combine and compare


In [None]:
### Combine and compare
# Pairwise plots of the VADER and RoBERTa scores; points are coloured by the
# 'Score' column using the 'tab10' palette.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(data=results_df, vars=score_columns, hue='Score', palette='tab10')

plt.show()


# Step 4: Review Examples:
        
Positive 1-Star and Negative 5-Star Reviews

Let's look at some examples where the model scoring and the review score differ the most.

In [None]:
# Review Examples
# Among the 1-star reviews (Score == 1), show the text of the one with the
# highest RoBERTa positive score.
one_star = results_df.query('Score == 1')
one_star.sort_values('roberta_pos', ascending=False)['Text'].values[0]

In [None]:
 results_df.query('Score == 1') \
    .sort_values('vader_pos', ascending=False)['Text'].values[0]
# sorts the reviews based on the positivity score predicted by the VADER model,
#['Text'].values[0]: This line selects the 'Text' column from the sorted subset and retrieves the value of the first element.


In [None]:
# Negative-sentiment 5-star review: among the reviews with Score == 5, show
# the text of the one with the highest RoBERTa negative score.
five_star = results_df.query('Score == 5')
five_star.sort_values('roberta_neg', ascending=False)['Text'].values[0]

In [None]:
# Among the reviews with Score == 5, show the text of the one with the highest
# VADER negative score.
(
    results_df.query('Score == 5')
    .sort_values('vader_neg', ascending=False)['Text']
    .values[0]
)

In [None]:
from transformers import pipeline
# pipeline is a high-level interface for running NLP tasks with pre-trained models.

sent_pipeline = pipeline("sentiment-analysis")
# Sentiment analysis predicts the polarity of the sentiment expressed in a
# text (either positive or negative).
# NOTE(review): no model is named here, so presumably the library falls back
# to its default for this task — consider pinning one for reproducibility.

In [None]:
print('your Review is  ',sent_pipeline('I love sentiment analysis!'))

In [None]:
print('your Review is  ',sent_pipeline('Make sure to like and subscribe!'))

In [None]:
print('your Review is  ',sent_pipeline('booo'))

In [None]:
print('your Review is  ',sent_pipeline('i care about hotdog'))

In [None]:
# Arabic input.
# NOTE(review): presumably the default pipeline model targets English text, so
# the result for this input may be unreliable — confirm.
print('your Review is  ',sent_pipeline('احب الطعام الذى يحتوى على الخضراوات'))

In [None]:
# Spanish input.
# NOTE(review): presumably the default pipeline model targets English text, so
# the result for this input may be unreliable — confirm.
print('your Review is  ',sent_pipeline('no me gusta la comida'))

In [None]:
print('your Review is  ',sent_pipeline('je naime pas la nourriture'))

In [None]:
print('your Review is  ',sent_pipeline('Jaime la nourriture'))

In [None]:
# NOTE(review): the argument is the literal path string, so the path text
# itself is what gets classified — the CSV file is not opened or read here.
print('your Review is  ',sent_pipeline('/content/Reviews.csv'))

# The End