### Sentiment Analysis (Flipkart Reviews)

Installing textblob library

In [None]:
!pip install textblob 

Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from wordcloud import WordCloud
import re
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import spacy
nlp = spacy.load("en_core_web_sm")

Loading data

In [None]:
# Read the Flipkart reviews CSV from the Kaggle input directory into a DataFrame.
reviews_csv = '/kaggle/input/flipkart-reviews-sentiment-analysis/flipkart.csv'
df = pd.read_csv(reviews_csv)

Removing index

In [None]:
# Keep only the review text and rating columns; every other column is dropped.
keep_cols = ['Review', 'Rating']
df = df[keep_cols]
df

Checking for null values

In [None]:
# Count the missing values in each column.
df.isna().sum()

Data Cleaning

In [None]:
#clean the reviews with a function
def cleanReviews(text):
    """Strip mentions, hashtag symbols, URLs and punctuation from one review.

    Parameters
    ----------
    text : str
        Raw review text. Missing values (None/NaN) yield an empty string and
        any other non-string value is converted with ``str`` first.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed tokens can
        leave runs of spaces behind.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on NaN/None, which is how missing reviews appear.
        if pd.isna(text):
            return ''
        text = str(text)
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # removes @mentions
    text = re.sub(r'#', '', text)  # removes the hashtag '#' symbol
    # Raw string: '\/' and '\S' are invalid escapes in a normal string literal.
    text = re.sub(r'https?://\S+', '', text)  # removes http/https URLs
    text = re.sub(r'\n', ' ', text)  # replaces new lines with a space
    text = re.sub(r'www\S+', ' ', text)  # replaces www... addresses with a space
    text = re.sub(r'\.|/|:|-', ' ', text)  # turns '.', '/', ':' and '-' into spaces
    text = re.sub(r'[^\w\s]', '', text)  # drops any remaining punctuation/symbols
    return text
# Add the cleaned text as a new column, then preview it beside the original reviews.
cleaned_reviews = df['Review'].apply(cleanReviews)
df['cleanedReviews'] = cleaned_reviews
df.head()

New dataset from cleaned reviews

In [None]:
# Take an explicit copy: df1 would otherwise be a slice of df, and adding columns
# to it in later cells triggers pandas' SettingWithCopyWarning.
df1 = df[['cleanedReviews', 'Rating']].copy()
df1

Count of reviews based on rating

In [None]:
# Take the x positions from the counts' own index instead of a hard-coded rating
# order: value_counts() sorts by frequency, so a fixed list can mislabel the bars.
rating_counts = df['Rating'].value_counts()
plt.bar(rating_counts.index, rating_counts.values)
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

Creating sentiments based on reviews as Analysis

In [None]:
# Map a star rating to a Negative / Neutral / Positive label.
# NOTE: a later cell defines another function with this same name for polarity
# scores, so re-run this cell before re-applying it to ratings.
def getAnalysis(rating):
    """Return 'Negative' for ratings below 3, 'Neutral' for exactly 3, else 'Positive'."""
    if rating == 3:
        return 'Neutral'
    return 'Negative' if rating < 3 else 'Positive'
    
# Label every review from its star rating, then display the frame.
rating_labels = df1['Rating'].apply(getAnalysis)
df1['Analysis'] = rating_labels

df1

Creating sentiments based on subjectivity and polarity as Analysis2

We use a library called __TextBlob__ to detect the subjectivity and polarity of a review. 

It uses __Natural Language ToolKit (NLTK)__. 

- __Subjectivity__ shows the amount of personal opinion in a sentence. Its score lies between 0 and 1. If a review has high subjectivity, i.e. close to 1, it means the review contains more personal opinion than factual information. 
- The __polarity__ score lies between -1 and 1, where -1 is the most negative sentiment and 1 is the most positive.

In [None]:
# Detect sentiments
# TextBlob-based subjectivity score of a review

def getSubjectivity(text):
    """Return the subjectivity value of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
# TextBlob-based polarity score of a review
def getPolarity(text):
    """Return the polarity value of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
# Score every cleaned review and store both measures as new columns of df1.
cleaned_text = df1['cleanedReviews']
df1['Subjectivity'] = cleaned_text.apply(getSubjectivity)
df1['Polarity'] = cleaned_text.apply(getPolarity)


In [None]:
# Display df1, which by this point also carries the Subjectivity and Polarity columns.
df1

In [None]:
# Map a polarity score to a Negative / Neutral / Positive label.
# NOTE: this definition replaces the rating-based getAnalysis from an earlier cell.
def getAnalysis(score):
    """Return 'Negative' for scores below 0, 'Neutral' for exactly 0, else 'Positive'."""
    if score < 0:
        label = 'Negative'
    elif score == 0:
        label = 'Neutral'
    else:
        label = 'Positive'
    return label
    
# Label every review from its polarity score, then display the frame.
polarity_labels = df1['Polarity'].apply(getAnalysis)
df1['Analysis2'] = polarity_labels

df1

Comparing Analysis based on polarity-subjectivity and based on ratings

In [None]:
# Number of reviews per rating-based sentiment label
rating_sentiment_counts = df1['Analysis'].value_counts()
rating_sentiment_counts

In [None]:
# Number of reviews per polarity-based sentiment label
polarity_sentiment_counts = df1['Analysis2'].value_counts()
polarity_sentiment_counts

Sentiment based on polarity

In [None]:
# Plot a bar graph of the polarity-based sentiment counts

# value_counts() orders labels by frequency, so a fixed colour list can paint the
# wrong label; look each bar's colour up from its label instead.
palette = {'Positive': 'green', 'Neutral': 'grey', 'Negative': 'red'}
polarity_counts = df1['Analysis2'].value_counts()
color = [palette[label] for label in polarity_counts.index]
fig = plt.figure(figsize=(7,5))
polarity_counts.plot(kind='bar', color=color)
plt.title('Value count of review polarity')
plt.ylabel('Count')
plt.xlabel('Polarity')
plt.grid(False)
plt.show()

Sentiment based on ratings

In [None]:
# Plot a bar graph of the rating-based sentiment counts

# Use the rating-derived 'Analysis' column here; 'Analysis2' holds the polarity
# labels and would only repeat the polarity chart.
# value_counts() orders labels by frequency, so look each bar's colour up from
# its label rather than relying on a fixed colour order.
palette = {'Positive': 'green', 'Neutral': 'grey', 'Negative': 'red'}
rating_sentiment_counts = df1['Analysis'].value_counts()
color = [palette[label] for label in rating_sentiment_counts.index]
fig = plt.figure(figsize=(7,5))
rating_sentiment_counts.plot(kind='bar', color=color)
plt.title('Value count of review rating')
plt.ylabel('Count')
plt.xlabel('rating')
plt.grid(False)
plt.show()

Based on polarity

In [None]:
#pie chart to show percentage distribution of polarity
palette = {'Positive': 'green', 'Neutral': 'grey', 'Negative': 'red'}
tags = df1['Analysis2'].value_counts()
# value_counts() sorts by frequency, so pick each wedge's colour from its label
# and size `explode` to the labels actually present (a fixed 3-tuple fails if
# one label is missing).
colors = [palette[label] for label in tags.index]
explode = [0.1] * len(tags)
wp = {'linewidth': 2, 'edgecolor': 'black'}
fig = plt.figure(figsize=(7,7))
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, colors=colors,
          startangle=90, wedgeprops=wp, explode=explode, label='')
plt.title('Distribution of polarity')
plt.show()

Based on ratings

In [None]:
#pie chart to show percentage distribution of rating-based sentiment
palette = {'Positive': 'green', 'Neutral': 'grey', 'Negative': 'red'}
tags = df1['Analysis'].value_counts()
# value_counts() sorts by frequency, so pick each wedge's colour from its label
# and size `explode` to the labels actually present (a fixed 3-tuple fails if
# one label is missing).
colors = [palette[label] for label in tags.index]
explode = [0.1] * len(tags)
wp = {'linewidth': 2, 'edgecolor': 'black'}
fig = plt.figure(figsize=(7,7))
tags.plot(kind='pie', autopct='%1.1f%%', shadow=True, colors=colors,
          startangle=90, wedgeprops=wp, explode=explode, label='')
plt.title('Distribution of ratings')
plt.show()

In [None]:
#plot the polarity and subjectivity on a scatter plot
plt.figure(figsize=(9,7))
# One vectorised call draws every point at once; calling plt.scatter per row is
# far slower and indexing with df1[col][i] depends on a default integer index.
plt.scatter(df1['Polarity'], df1['Subjectivity'], color='blue')
plt.title('Sentiment Analysis')
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()

#### Create a word cloud function for both ***__positive__*** and ***__negative__*** reviews.

Based on subjectivity-polarity

In [None]:
#create a function for wordcloud
def create_wordcloud(text):
    """Render a word cloud built from an iterable of review strings.

    Parameters
    ----------
    text : iterable of str
        Reviews to combine; they are joined with single spaces.

    Returns
    -------
    None
        The figure is shown with matplotlib. When ``text`` contains no
        non-whitespace characters a notice is printed and nothing is drawn.
    """
    allWords = ' '.join(text)
    if not allWords.strip():
        # WordCloud.generate raises ValueError when there are no words to draw.
        print('No words available to build a word cloud.')
        return
    wordCloud = WordCloud(background_color='white', width=800, height=500, random_state=21, max_font_size=130).generate(allWords)
    plt.figure(figsize=(20,10))
    plt.imshow(wordCloud)
    plt.axis('off')
    plt.show()
# Word cloud of the reviews whose polarity-based label is Positive
is_positive = df1['Analysis2'] == 'Positive'
posReviews = df1.loc[is_positive, 'cleanedReviews']
create_wordcloud(posReviews)

In [None]:
# Word cloud of the reviews whose polarity-based label is Negative
is_negative = df1['Analysis2'] == 'Negative'
negReviews = df1.loc[is_negative, 'cleanedReviews']
create_wordcloud(negReviews)

#### Word clouds for both ***__positive__*** and ***__negative__*** reviews.

Based on ratings

In [None]:
# Word cloud of the reviews whose rating-based label is Positive
is_positive = df1['Analysis'] == 'Positive'
posReviews = df1.loc[is_positive, 'cleanedReviews']
create_wordcloud(posReviews)

In [None]:
# Word cloud of the reviews whose rating-based label is Negative
is_negative = df1['Analysis'] == 'Negative'
negReviews = df1.loc[is_negative, 'cleanedReviews']
create_wordcloud(negReviews)