## **I. Text cleaning**
We clean the reviews by removing unnecessary details such as punctuation and special characters. We then remove stop words, using a stop-word list customized for this analysis, so that only the important words in each review are retained. We also customize the positive and negative word lists to obtain an exhaustive list of sentiment words.

In [None]:
# Mount Google Drive at '/content/drive'; every data file below is read from / written to that mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Step 1: Importing libraries and reading the data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the merged data (phone review data + product metadata) from Drive into a DataFrame
phone_data = pd.read_csv('/content/drive/My Drive/sentiment_analysis/phone_reviews.csv')

In [None]:
# Select the raw review text column (a pandas Series, one entry per review) for cleaning

phone_review = phone_data['reviewText']

In [None]:
# Count the missing review texts before cleaning
phone_data['reviewText'].isnull().sum()

0

# Step 2: Removing punctuations/special characters

In [None]:
#Creating a list 'cleaned_review' to store all reviews with punctuations removed

import re

# Compiled once because the same patterns are applied to every review.
_SPECIAL_CHARS = re.compile(r'[^\w\d\'\s]+')         # anything that is not a word character, apostrophe or whitespace
_LONE_LETTER = re.compile(r'(?<!\S)[a-zA-Z](?!\S)')  # a single letter delimited by whitespace or the string edges
_WHITESPACE_RUN = re.compile(r'\s+')


def _clean_review_text(text):
    """Return ``text`` normalised for the stop-word and feature steps.

    Steps, in order:
      1. runs of special characters become a space (apostrophes are kept so
         contractions such as "i'm" stay in one piece);
      2. stand-alone single letters are removed anywhere in the text,
         including the first and last token;
      3. whitespace runs are collapsed to one space and the edges trimmed;
      4. the result is lower-cased.

    The earlier patterns r'\\s+[a-zA-Z]\\s+\\n' and r'\\^[a-zA-Z]\\s+' required a
    trailing newline and a literal '^' respectively, so they practically never
    matched and single letters were left in the reviews. A leading stand-alone
    'b' (the old "prefixed 'b'" step) is covered by the lone-letter rule.

    ``text`` may be any object; it is converted with ``str()`` first.
    """
    processed = _SPECIAL_CHARS.sub(' ', str(text))
    processed = _LONE_LETTER.sub(' ', processed)
    processed = _WHITESPACE_RUN.sub(' ', processed).strip()
    return processed.lower()


# Iterate over the values directly instead of looking each one up by integer label,
# so the loop does not depend on phone_review having a default 0..n-1 index.
cleaned_review = [_clean_review_text(raw_review) for raw_review in phone_review]

len(cleaned_review)

NameError: ignored

# Step 3: Removing stopwords

In [None]:
# Reading stop words from a text file (one word per line) in to a list.
# The context manager closes the file handle as soon as the list is built.
with open('/content/drive/My Drive/sentiment_analysis/stop_words_long.txt') as stop_words_file:
    stop_words = [line.rstrip('\n') for line in stop_words_file]
len(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Removing stop words from all reviews

# A set gives constant-time membership tests; scanning the stop-word list for
# every token of every review is needlessly slow.
stop_word_lookup = set(stop_words)

no_stopwords_review = []  #the list which holds the final cleaned review

for review in cleaned_review:
    word_tokens = review.split() #Tokenization on whitespace
    filtered_sentence = [w for w in word_tokens if w not in stop_word_lookup]
    #Appending the surviving words, re-joined into one space-separated string
    no_stopwords_review.append(' '.join(filtered_sentence))



NameError: ignored

In [None]:
# Adding the cleaned reviews to the data frame phone_data as a new column.
# no_stopwords_review must hold exactly one entry per row of phone_data for this assignment to succeed.

phone_data['cleaned_reviewText'] = no_stopwords_review

In [None]:
# Printing the dtype and non-null count of every column
phone_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62091 entries, 0 to 62090
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   overall             62091 non-null  float64
 1   verified            62091 non-null  bool   
 2   reviewerID          62091 non-null  object 
 3   asin                62091 non-null  object 
 4   reviewText          62091 non-null  object 
 5   asin.1              62091 non-null  object 
 6   unixReviewTime      62091 non-null  int64  
 7   category            62091 non-null  object 
 8   also_view           62091 non-null  object 
 9   brand               62025 non-null  object 
 10  price               28382 non-null  object 
 11  review_sentiment    62091 non-null  object 
 12  cleaned_reviewText  62091 non-null  object 
dtypes: bool(1), float64(1), int64(1), object(10)
memory usage: 5.7+ MB


In [None]:
# Some reviews are left as an empty string once cleaning and stop-word removal are done; drop those rows
phone_data = phone_data[phone_data['cleaned_reviewText'] != '']

In [None]:
# (rows, columns) remaining after the empty reviews were dropped
phone_data.shape

(61946, 13)

In [None]:
# Writing the dataframe phone_data into a csv file (index = False keeps the row index out of the file)
phone_data.to_csv("/content/drive/My Drive/sentiment_analysis/review_sentiment.csv",index = False)

## **II. Feature Extraction**

# Step 1: Importing libraries and reading the data


In [None]:
# Importing libraries and reading back the cleaned review data written at the end of the text-cleaning section
import pandas as pd
phone_data = pd.read_csv("/content/drive/My Drive/sentiment_analysis/review_sentiment.csv")



In [None]:
# List the available columns
phone_data.columns

Index(['overall', 'verified', 'reviewerID', 'asin', 'reviewText', 'asin.1',
       'unixReviewTime', 'category', 'also_view', 'brand', 'price',
       'review_sentiment', 'cleaned_reviewText'],
      dtype='object')

In [None]:
#Extracting only the required columns
# .copy() makes phone_data_col an independent frame, so the columns added to it in later
# cells neither raise pandas' SettingWithCopyWarning nor depend on phone_data.
phone_data_col = phone_data[['overall', 'verified', 'unixReviewTime', 'reviewerID',
       'asin', 'reviewText', 'category', 'also_view','brand', 'price', 'cleaned_reviewText','review_sentiment']].copy()

# Step 2: Converting Unix Review Time to Date-Time format

In [None]:
#Transforming unixReview time (Unix epoch seconds) to a 'YYYY-MM-DD' date string
from datetime import datetime, timedelta  # no longer used here; kept so the names stay available to other cells
# The conversion is done explicitly in UTC, so the result does not depend on the timezone of
# the machine running the notebook. The previous version converted to local time and then
# subtracted two hours, which moved timestamps that fall exactly on midnight UTC
# (e.g. 1095724800 = 2004-09-21 00:00:00 UTC) to the previous calendar day.
phone_data_col = phone_data_col.copy()  # independent frame before adding a column, as the later cells do
phone_data_col['Date&Time'] = pd.to_datetime(phone_data_col['unixReviewTime'], unit='s', utc=True).dt.strftime('%Y-%m-%d')

In [None]:
# Preview the first rows, including the new 'Date&Time' column
phone_data_col.head()

Unnamed: 0,overall,verified,unixReviewTime,reviewerID,asin,reviewText,category,also_view,brand,price,cleaned_reviewText,review_sentiment,Date&Time
0,1.0,False,1095724800,A5JLAU2ARJ0BO,B0000E3GWH,This phone is ugly and heavy and has a terribl...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,phone ugly heavy terrible user interface techi...,NEGATIVE,2004-09-20
1,4.0,False,1090627200,A1ACM1CBGORBN1,B0000E3GWH,I had the Samsung V205 and then I decided to t...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,samsung v205 decided try e715 fyi people batte...,POSITIVE,2004-07-23
2,3.0,False,1089072000,A2V48Q03FZUOSD,B0000E3GWH,This is the first Samsung phone I have had and...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,samsung phone not measure v300 am tmobile cust...,NEGATIVE,2004-07-05
3,5.0,True,1081728000,A1V3TRGWOMA8LC,B0000E3GWH,"..and I'm from Europe, where the phones and ne...","['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,i'm europe phones networks better samsung curr...,POSITIVE,2004-04-11
4,3.0,False,1077148800,A3NOBH42C7UI5M,B0000E3GWH,This phone is amazingly small and light for a ...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,phone amazingly light camera phone flashlight ...,POSITIVE,2004-02-18


# Step 3: Finding the "also viewed" phones and their brands for each of the phones

In [None]:
import ast
import sys

# In the 'also_view' column which contains the asin numbers of also viewed products, only the asins which are present in metadata
# are extracted and stored in another column

# Function that takes intersection of also viewed asins in phone data and the asins in meta data
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    Parameters
    ----------
    lst1 : iterable
        Items to filter. The result keeps their order and any duplicates.
    lst2 : container
        Items to test membership against.

    Returns
    -------
    list
        Every ``value`` from ``lst1`` for which ``value in lst2`` holds.
    """
    if isinstance(lst2, (list, tuple)):
        try:
            # Hash lookup instead of scanning lst2 once per element of lst1.
            lookup = set(lst2)
            return [value for value in lst1 if value in lookup]
        except TypeError:
            # Unhashable items on either side: use the plain scan below.
            pass
    return [value for value in lst1 if value in lst2]


#Using Brands and Asins to get the also viewed asins which are mobile phones

data_brand_asin = pd.read_csv('/content/drive/My Drive/sentiment_analysis/Brands and Asins.csv')
all_asins = data_brand_asin['asin']


# List that will store the intersected asins, one list per row of phone_data_col
all_intersected = []

# Identical 'also_view' strings give identical results, so each distinct string is
# parsed and intersected only once and the result is reused for the other rows.
intersected_by_raw_value = {}

# Looping through the 'also_view' column values directly (not by integer label, which would
# only work with a default 0..n-1 index), calling the intersection function to get the
# final list of also_viewed asins.
for raw_also_view in phone_data_col['also_view']:
    if raw_also_view not in intersected_by_raw_value:
        also_viewed_lst = ast.literal_eval(raw_also_view)
        intersected_by_raw_value[raw_also_view] = intersection(all_asins, also_viewed_lst)
    # A fresh list per row, so rows never share one mutable list object
    all_intersected.append(list(intersected_by_raw_value[raw_also_view]))
phone_data_col = phone_data_col.copy()
phone_data_col["also_viewed_phones"] = all_intersected

In [None]:
# Translate every also-viewed asin into its brand ('' when the asin has no entry in the lookup)

dictionary_of_brands_asins = data_brand_asin.set_index('asin')['brand'].to_dict()
also_viewed_brand = [
    [dictionary_of_brands_asins.get(viewed_asin, '') for viewed_asin in viewed_asins]
    for viewed_asins in phone_data_col["also_viewed_phones"]
]
phone_data_col = phone_data_col.copy()
phone_data_col['also_viewed_brand'] = also_viewed_brand

In [None]:
# Preview the first rows with the 'also_viewed_phones' and 'also_viewed_brand' columns
phone_data_col.head()

Unnamed: 0,overall,verified,unixReviewTime,reviewerID,asin,reviewText,category,also_view,brand,price,cleaned_reviewText,review_sentiment,Date&Time,also_viewed_phones,also_viewed_brand
0,1.0,False,1095724800,A5JLAU2ARJ0BO,B0000E3GWH,This phone is ugly and heavy and has a terribl...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,phone ugly heavy terrible user interface techi...,NEGATIVE,2004-09-20,[],[]
1,4.0,False,1090627200,A1ACM1CBGORBN1,B0000E3GWH,I had the Samsung V205 and then I decided to t...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,samsung v205 decided try e715 fyi people batte...,POSITIVE,2004-07-23,[],[]
2,3.0,False,1089072000,A2V48Q03FZUOSD,B0000E3GWH,This is the first Samsung phone I have had and...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,samsung phone not measure v300 am tmobile cust...,NEGATIVE,2004-07-05,[],[]
3,5.0,True,1081728000,A1V3TRGWOMA8LC,B0000E3GWH,"..and I'm from Europe, where the phones and ne...","['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,i'm europe phones networks better samsung curr...,POSITIVE,2004-04-11,[],[]
4,3.0,False,1077148800,A3NOBH42C7UI5M,B0000E3GWH,This phone is amazingly small and light for a ...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,phone amazingly light camera phone flashlight ...,POSITIVE,2004-02-18,[],[]


In [None]:
#save the phone_data_col dataframe having the also viewed brands to a csv
# NOTE(review): unlike the review_sentiment.csv export, index=False is not passed here, so the
# row index is written as an extra unnamed first column — confirm this is intended.
phone_data_col.to_csv("/content/drive/My Drive/sentiment_analysis/phone_data_final.csv")

# Step 4: Getting the number of reviews for each brand

In [None]:
#Getting review count of each brand as a plain dict {brand: number of reviews}
dict_brands=phone_data_col['brand'].value_counts().to_dict()
dict_brands

{'0': 5,
 'A Phone': 7,
 'ASUS': 17,
 'AT&T': 189,
 'AXCELLE': 14,
 'Acer': 9,
 'Aeku': 13,
 'Alcatel': 506,
 'Amar': 5,
 'AmericanPumpkins.com': 6,
 'Apple': 3639,
 'Asus': 786,
 'BLU': 9144,
 'Best Deal USA 2013': 8,
 'BlackBerry': 3643,
 'CT-Miami LLC': 134,
 'CUBOT': 32,
 'Casio': 77,
 'Caterpillar': 48,
 'Cell Phone': 6,
 'Cell.': 6,
 'China': 6,
 'Coolpad': 10,
 'CovertSafe': 19,
 'DOOGEE': 54,
 'Dell': 39,
 'Diswoe': 5,
 'Doro': 5,
 'E-Passion': 5,
 'ECOOPRO': 23,
 'F FORITO': 24,
 'FIGO': 23,
 'Firefly': 16,
 'FreedomPop': 18,
 'Fsmart': 18,
 'GB': 10,
 'Generic': 33,
 'GlocalMe': 10,
 'Google': 60,
 'GreatCall': 99,
 'HAWEEL': 12,
 'HP': 25,
 'HTC': 2425,
 'Hipipooo': 5,
 'Hot Global': 6,
 'Huawei': 1393,
 'IRULU': 17,
 'JIAKE': 35,
 'JUST5': 11,
 'Jabra': 31,
 'Jethro': 13,
 'Jitterbug': 27,
 'KATA': 5,
 'KINGWELL': 19,
 'Kata': 15,
 'Kocaso': 9,
 'Kyocera': 267,
 'LAUDE': 14,
 'LG': 6513,
 'LG Electronics': 17,
 'LGIC': 366,
 'LSoug': 10,
 'Leagoo': 6,
 'Lenovo': 252,
 'Ligh

In [None]:
#keep brands with more than REVIEW_COUNT_THRESHOLD reviews; club every other brand under 'Others'

REVIEW_COUNT_THRESHOLD = 1000

dict_brands_filtered = {}
others_total = 0  # deliberately not called `sum`, which would shadow the built-in for the rest of the notebook
for key, value in dict_brands.items():
    if value > REVIEW_COUNT_THRESHOLD:
        dict_brands_filtered[key] = value
    else:
        others_total += value
dict_brands_filtered['Others'] = others_total
dict_brands_filtered

{'Apple': 3639,
 'BLU': 9144,
 'BlackBerry': 3643,
 'HTC': 2425,
 'Huawei': 1393,
 'LG': 6513,
 'Motorola': 4794,
 'Nokia': 4377,
 'Others': 8459,
 'Samsung': 15948,
 'Sony': 1545}

In [None]:
#keep only the reviews of the top brands (more than 1000 reviews each) -- comes to 10 brands

top_10_brands = ['Samsung', 'Motorola', 'LG', 'BlackBerry', 'Nokia',
                 'Apple', 'HTC', 'BLU', 'Huawei', 'Sony']
is_top_brand = phone_data_col['brand'].isin(top_10_brands)
# reset_index() renumbers the rows from 0 and keeps the old row labels in an 'index' column
phone_data_top_10 = phone_data_col[is_top_brand].reset_index()

In [None]:
# Preview the first rows of the top-brand subset
phone_data_top_10.head()

Unnamed: 0,index,overall,verified,unixReviewTime,reviewerID,asin,reviewText,category,also_view,brand,price,cleaned_reviewText,review_sentiment,Date&Time,also_viewed_phones,also_viewed_brand
0,0,1.0,False,1095724800,A5JLAU2ARJ0BO,B0000E3GWH,This phone is ugly and heavy and has a terribl...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,phone ugly heavy terrible user interface techi...,NEGATIVE,2004-09-20,[],[]
1,1,4.0,False,1090627200,A1ACM1CBGORBN1,B0000E3GWH,I had the Samsung V205 and then I decided to t...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,samsung v205 decided try e715 fyi people batte...,POSITIVE,2004-07-23,[],[]
2,2,3.0,False,1089072000,A2V48Q03FZUOSD,B0000E3GWH,This is the first Samsung phone I have had and...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,samsung phone not measure v300 am tmobile cust...,NEGATIVE,2004-07-05,[],[]
3,3,5.0,True,1081728000,A1V3TRGWOMA8LC,B0000E3GWH,"..and I'm from Europe, where the phones and ne...","['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,i'm europe phones networks better samsung curr...,POSITIVE,2004-04-11,[],[]
4,4,3.0,False,1077148800,A3NOBH42C7UI5M,B0000E3GWH,This phone is amazingly small and light for a ...,"['Cell Phones & Accessories', 'Cell Phones', '...",[],Samsung,,phone amazingly light camera phone flashlight ...,POSITIVE,2004-02-18,[],[]


In [None]:
#for every purchased brand (top 10 only), count how often each brand occurs among its also viewed phones
#result shape: {purchased brand: {also viewed brand: count}}
brand = phone_data_top_10['brand']

also_viewed_brands = phone_data_top_10['also_viewed_brand']
dict_also_viewed_brands = {}

for purchased_brand, viewed_brands in zip(brand, also_viewed_brands):
    # Fetch this brand's counter, creating it on first sight
    viewed_counts = dict_also_viewed_brands.setdefault(purchased_brand, {})
    for viewed_brand in viewed_brands:
        viewed_counts[viewed_brand] = viewed_counts.get(viewed_brand, 0) + 1

In [None]:
# Display the nested {purchased brand: {also viewed brand: count}} dictionary
dict_also_viewed_brands

{'Apple': {'Apple': 14018,
  'LG': 28,
  'Straight Talk': 12,
  'Sudroid': 241,
  'ZTE': 12},
 'BLU': {'Alcatel': 282,
  'Alcatel One Touch': 956,
  'Apple': 61,
  'BLU': 8016,
  'BlackBerry': 270,
  'Casio': 48,
  'Caterpillar': 164,
  'GreatCall': 216,
  'HTC': 132,
  'Huawei': 10,
  'Kocaso': 38,
  'Kyocera': 151,
  'LG': 963,
  'Microsoft': 331,
  'Motorola': 978,
  'Nokia': 1022,
  'OnePlus': 18,
  'Pantech': 264,
  'RCA': 300,
  'Samsung': 1129,
  'Sony': 650,
  'Tracfone': 7,
  'ZTE': 2146,
  'ZTE USA': 93,
  nan: 48},
 'BlackBerry': {'Alcatel': 21,
  'Alcatel One Touch': 6,
  'BLU': 691,
  'BlackBerry': 45225,
  'Blackberry': 8,
  'Casio': 163,
  'Caterpillar': 63,
  'Fsmart': 96,
  'GreatCall': 108,
  'HTC': 86,
  'Kyocera': 485,
  'LG': 4551,
  'Motorola': 697,
  'Nokia': 198,
  'Palm': 6,
  'Pantech': 1158,
  'Porsche Design': 401,
  'RCA': 9,
  'RugGear': 46,
  'Samsung': 1141,
  'Sanyo': 16,
  'Sony': 47,
  'Storm': 171,
  'Unknown': 147,
  'Verizon': 15,
  'ZTE': 3428,
  

In [None]:
# Tabulate the counts: after the transpose there is one row per purchased brand and
# one column per also-viewed brand; pairs that never occur are NaN.
df_viewed_brands = pd.DataFrame(dict_also_viewed_brands).T

# df_viewed_brands.to_excel("viewed_brands.xlsx", index = True)

In [None]:
# Display the purchased-brand x also-viewed-brand count table
df_viewed_brands

Unnamed: 0,Samsung,BlackBerry,Pantech,ZTE,Motorola,LG,ZTE USA,BLU,Kyocera,GreatCall,Sanyo,Tracfone,LGIC,Tracfone Wireless,Alcatel,Tracone,"TRACFONE WIRELESS, INC.",TracFone,Caterpillar,RugGear,Fsmart,Nokia,AT&T,NaN,HTC,Alcatel One Touch,Casio,Palm,Sudroid,RCA,Sony,Virgin Mobile,Unknown,MOTCB,Boost,Asus,T-Mobile,NET10,Huawei,Straight Talk,Google,Microsoft,Boost Mobile,Samsung Group,Galaxy S5,CAT PHONES,Samsung Korea,Jitterbug,Apple,TNSO,inDigi,Samsung/Straight Talk,AMGOO Telecom,Verizon,NEC,Droid Turbo,Moto X,Net10,OnePlus,SoonerSoft Electronics,Cell.,Risio,Snapfon,Porsche Design,Storm,Blackberry,Lynxx,Sharp,Cell Phone,Kocaso,Nexus,Sony Ericsson,SKY Devices
Samsung,132136.0,1880.0,1414.0,4107.0,5932.0,14077.0,269.0,660.0,2614.0,460.0,592.0,463.0,171.0,28.0,451.0,43.0,71.0,29.0,51.0,157.0,26.0,1112.0,173.0,308.0,2061.0,468.0,368.0,15.0,24.0,339.0,902.0,9.0,39.0,474.0,203.0,877.0,85.0,9.0,163.0,8.0,21.0,299.0,20.0,1232.0,1527.0,11.0,746.0,27.0,120.0,289.0,83.0,6.0,,,,,,,,,,,,,,,,,,,,,
Motorola,4628.0,1200.0,446.0,1455.0,23248.0,4906.0,82.0,301.0,1879.0,763.0,14.0,253.0,15.0,,24.0,45.0,53.0,22.0,20.0,7.0,71.0,672.0,19.0,671.0,1546.0,130.0,429.0,,,97.0,519.0,,21.0,101.0,7.0,9.0,,,,,,,7.0,,,,,,136.0,,,,20.0,7.0,7.0,111.0,22.0,,,,,,,,,,,,,,,,
LG,9109.0,1050.0,740.0,2043.0,3017.0,34557.0,167.0,683.0,2215.0,300.0,599.0,2928.0,861.0,46.0,269.0,401.0,780.0,54.0,95.0,201.0,11.0,347.0,20.0,163.0,2109.0,60.0,318.0,80.0,,347.0,586.0,,108.0,372.0,100.0,366.0,,7.0,81.0,,223.0,92.0,,,,,,,,,,,,6.0,,,,10.0,28.0,44.0,5.0,5.0,27.0,,,,,,,,,,
BlackBerry,1141.0,45225.0,1158.0,3428.0,697.0,4551.0,109.0,691.0,485.0,108.0,16.0,,,,21.0,,,,63.0,46.0,96.0,198.0,,28.0,86.0,6.0,163.0,6.0,,9.0,47.0,,147.0,,,,,,,,,,,,,,,,,,,,,15.0,,,,,,,,,,401.0,171.0,8.0,,,,,,,
Nokia,1771.0,3791.0,653.0,1648.0,506.0,2537.0,157.0,1302.0,453.0,151.0,138.0,,13.0,55.0,44.0,,,,143.0,,90.0,23121.0,16.0,118.0,307.0,100.0,24.0,,6.0,401.0,92.0,,,46.0,,12.0,11.0,,,,,3112.0,,,,,,,46.0,,,,,,,,,,23.0,,,,,15.0,,,,,,,,,
Apple,,,,12.0,,28.0,,,,,,,,,,,,,,,,,,,,,,,241.0,,,,,,,,,,,12.0,,,,,,,,,14018.0,,,,,,,,,,,,,,,,,,,,,,,,
HTC,2858.0,709.0,227.0,751.0,1592.0,3453.0,,189.0,641.0,65.0,19.0,,,,98.0,,,,,84.0,,332.0,72.0,9.0,12874.0,52.0,,,,175.0,193.0,,,,,72.0,,,,,,98.0,,,,,,,17.0,,,,,,11.0,,,,18.0,,,,,,,,5.0,47.0,14.0,,,,
BLU,1129.0,270.0,264.0,2146.0,978.0,963.0,93.0,8016.0,151.0,216.0,,7.0,,,282.0,,,,164.0,,,1022.0,,48.0,132.0,956.0,48.0,,,300.0,650.0,,,,,,,,10.0,,,331.0,,,,,,,61.0,,,,,,,,,,18.0,,,,,,,,,,,38.0,,,
Huawei,53.0,,,77.0,906.0,586.0,,10.0,17.0,5.0,5.0,18.0,6.0,,12.0,,,,,,,20.0,,,6.0,11.0,,,,,91.0,,,,,,,,2170.0,,,,,,,,,,,,,,,,,,,,27.0,,,,,,,,,,,,26.0,,
Sony,304.0,183.0,10.0,36.0,235.0,240.0,,10.0,10.0,,,,,,,,,,,7.0,,22.0,,11.0,74.0,26.0,,,,,6486.0,,,,,,,,34.0,,,,,,,78.0,,,,5.0,,,,,,,,,,,,,,,,,,,,,,83.0,18.0


In [None]:
#to get the max viewed for each top brand apart from itself
# Loop variables carry their own names so they do not overwrite the notebook-level
# `brand` Series and `also_viewed_brand` list created in earlier cells.
max_viewed = {}
for purchased_brand, viewed_counts in dict_also_viewed_brands.items():
    top_other_brand = ''
    top_count = 0
    for viewed_brand, views in viewed_counts.items():
        # Strict '>' keeps the first brand encountered when two brands tie
        if views > top_count and purchased_brand != viewed_brand:
            top_count = views
            top_other_brand = viewed_brand
    # Stored as "<brand> <count>"; stays ' 0' when no other brand was viewed
    max_viewed[purchased_brand] = str(top_other_brand) + ' ' + str(top_count)
max_viewed_df = pd.DataFrame(max_viewed, index = ['top_viewed']).T
max_viewed_df.to_excel("/content/drive/My Drive/sentiment_analysis/Max Viewed.xlsx", index = True)

In [None]:
# Display the most-viewed other brand (and its count) for each top brand
max_viewed_df

Unnamed: 0,top_viewed
Samsung,LG 14077
Motorola,LG 4906
LG,Samsung 9109
BlackBerry,LG 4551
Nokia,BlackBerry 3791
Apple,Sudroid 241
HTC,LG 3453
BLU,ZTE 2146
Huawei,Motorola 906
Sony,Samsung 304


# Step 5: Extracting the features from positive and negative reviews

In [None]:
def computeReviewTFDict(review):
    """Compute the term frequency of every distinct word in one review.

    ``review`` is a sequence of word tokens. The result maps each distinct
    word (in order of first appearance) to the number of times it occurs
    divided by ``len(review)``. An empty review yields an empty dict.
    """
    # First pass: raw occurrence counts
    occurrences = {}
    for token in review:
        occurrences[token] = occurrences.get(token, 0) + 1
    # Second pass: normalise each count by the review length
    return {token: hits / len(review) for token, hits in occurrences.items()}

In [None]:
#Reading positive words (one word per line); the context manager closes the file once it is read
with open('/content/drive/My Drive/sentiment_analysis/pos_words.txt') as pos_words_file:
    pos_words = [line.rstrip('\n') for line in pos_words_file]
print(len(pos_words))

#Reading negative words (one word per line)
with open('/content/drive/My Drive/sentiment_analysis/neg_words.txt') as neg_words_file:
    neg_words = [line.rstrip('\n') for line in neg_words_file]
print(len(neg_words))

2006
4804


In [None]:
#for getting features removing even the positive and negative words
# One combined set gives constant-time lookups instead of scanning both word lists for every token.
sentiment_words = set(pos_words) | set(neg_words)
review_features = []

for review in phone_data_col['cleaned_reviewText']:
    # float entries (e.g. NaN for a missing review) are skipped, so review_features
    # can be shorter than phone_data_col
    if not isinstance(review, float):
        word_tokens = review.split(' ')
        filtered_sentence = [w for w in word_tokens if w not in sentiment_words]
        review_features.append(' '.join(filtered_sentence))

In [None]:
# One term-frequency dictionary per entry of review_features, in the same order
tfDict = [computeReviewTFDict(review.split()) for review in review_features]

In [None]:
def computeCountDict(tf_dicts=None):
    """Count, for every word, the number of reviews in which it appears.

    Parameters
    ----------
    tf_dicts : iterable of dict, optional
        One dictionary per review whose keys are the review's distinct words.
        When omitted, the notebook-level ``tfDict`` list is used, which keeps
        existing ``computeCountDict()`` calls working unchanged.

    Returns
    -------
    dict
        Maps each word to the number of dictionaries that contain it, in order
        of first appearance.
    """
    if tf_dicts is None:
        tf_dicts = tfDict
    countDict = {}
    # Each per-review dictionary lists a word at most once, so one increment
    # per (word, review) pair gives the review count.
    for review in tf_dicts:
        for word in review:
            countDict[word] = countDict.get(word, 0) + 1
    return countDict

# Stores the review count dictionary (word -> number of reviews containing the word)
countDict = computeCountDict()
# NOTE(review): displaying the whole dictionary produces a very long output; a small preview may be enough
countDict


{'phone': 32164,
 'heavy': 1168,
 'user': 2075,
 'interface': 942,
 'techies': 18,
 'drops': 378,
 'calls': 2727,
 'manhattan': 6,
 'moto': 947,
 'v600': 8,
 'takes': 2105,
 'pictures': 2302,
 'reception': 960,
 'mobile': 3216,
 'stay': 590,
 'samsung': 4378,
 'v205': 2,
 'decided': 1064,
 'try': 1608,
 'e715': 3,
 'fyi': 65,
 'people': 2588,
 'battery': 9030,
 'batteries': 455,
 'completely': 943,
 'charge': 2534,
 'building': 92,
 'life': 5226,
 'doing': 986,
 'times': 2132,
 'immensely': 12,
 "i've": 3905,
 'customer': 1114,
 '1': 3886,
 'service': 2821,
 '2': 5678,
 'rate': 450,
 'plans': 333,
 '3': 4574,
 'additional': 431,
 'features': 3634,
 'little': 3837,
 'gadget': 81,
 'caller': 115,
 'id': 292,
 'pics': 544,
 'able': 2463,
 'external': 487,
 'display': 2106,
 'flip': 695,
 "you'll": 880,
 "person's": 24,
 'camera': 7070,
 'flash': 1196,
 'zoom': 304,
 'gray': 81,
 'black': 996,
 'white': 716,
 'sepia': 9,
 'sketch': 9,
 'multi': 310,
 'shot': 279,
 'lots': 737,
 'frames': 4

In [None]:
# (word, review count) pairs ordered from the most to the least frequent word
word_cloud_lst = sorted(countDict.items(), key=lambda x: x[1], reverse=True)
word_cloud_lst

[('phone', 32164),
 ('screen', 9874),
 ('battery', 9030),
 ('phones', 7318),
 ('camera', 7070),
 ('price', 6968),
 ('time', 6516),
 ('2', 5678),
 ('android', 5468),
 ('bought', 5420),
 ('apps', 5369),
 ('quality', 5314),
 ('got', 5250),
 ('life', 5226),
 ('am', 5070),
 ('product', 5048),
 ('buy', 5005),
 ('5', 4914),
 ('card', 4761),
 ("i'm", 4716),
 ('using', 4662),
 ('3', 4574),
 ('4', 4482),
 ('samsung', 4378),
 ("i've", 3905),
 ('1', 3886),
 ('little', 3837),
 ('day', 3797),
 ('features', 3634),
 ('iphone', 3567),
 ('sim', 3469),
 ('lot', 3444),
 ('size', 3232),
 ('device', 3223),
 ('mobile', 3216),
 ('galaxy', 3015),
 ('days', 2953),
 ('call', 2916),
 ('update', 2901),
 ('amazon', 2890),
 ('app', 2840),
 ('service', 2821),
 ('calls', 2727),
 ('getting', 2602),
 ('people', 2588),
 ('months', 2583),
 ('memory', 2563),
 ('charge', 2534),
 ('looking', 2532),
 ('unlocked', 2509),
 ('6', 2485),
 ('able', 2463),
 ('data', 2457),
 ('ok', 2436),
 ('sd', 2423),
 ('smartphone', 2403),
 ('bit

In [None]:
#positive corpus: POSITIVE reviews of the top brands, reduced to the product-feature words below
feature_words = {'screen', 'battery', 'camera', 'price', 'quality', 'life', 'charger', 'charge', 'display', 'power', 'storage', 'button'}

positive_corpus = phone_data_top_10[phone_data_top_10.review_sentiment == 'POSITIVE']
# .copy() so the column assignment below works on an independent frame (no SettingWithCopyWarning)
positive_corpus = positive_corpus[["cleaned_reviewText", "brand"]].copy()
cleaned_review = positive_corpus["cleaned_reviewText"]

#keeping only the feature words of every review (word order and repetitions are preserved)
no_positivewords_review = []

for review in cleaned_review:
    if isinstance(review, str):
        filtered_sentence = [w for w in review.split() if w in feature_words]
        no_positivewords_review.append(' '.join(filtered_sentence))
    else:
        # Non-string entries (e.g. NaN) have no words; an empty string keeps the
        # list aligned with the rows of positive_corpus instead of crashing on .split()
        no_positivewords_review.append('')

positive_corpus["cleaned_reviewText"] = no_positivewords_review
positive_corpus.to_excel("/content/drive/My Drive/sentiment_analysis/positive_corpus.xlsx", index = False)

In [None]:
#negative corpus: NEGATIVE reviews of the top brands, reduced to the product-feature words below
feature_words = {'screen', 'battery', 'camera', 'price', 'quality', 'life', 'charger', 'charge', 'display', 'power', 'storage', 'button'}

negative_corpus = phone_data_top_10[phone_data_top_10.review_sentiment == 'NEGATIVE']
# .copy() so the column assignment below works on an independent frame (no SettingWithCopyWarning)
negative_corpus = negative_corpus[["cleaned_reviewText", "brand"]].copy()
cleaned_review = negative_corpus["cleaned_reviewText"]

#keeping only the feature words of every review (word order and repetitions are preserved)
no_negativewords_review = []

for review in cleaned_review:
    if isinstance(review, str):
        filtered_sentence = [w for w in review.split() if w in feature_words]
        no_negativewords_review.append(' '.join(filtered_sentence))
    else:
        # Skipping a non-string entry would leave the list shorter than the frame and make
        # the column assignment below fail; an empty string keeps both aligned.
        no_negativewords_review.append('')

negative_corpus["cleaned_reviewText"] = no_negativewords_review
negative_corpus.to_excel("/content/drive/My Drive/sentiment_analysis/negative_corpus.xlsx", index = False)

In [None]:
# import math
# def computeIDFDict():
#     """ Returns a dictionary whose keys are all the unique words in the
#     dataset and whose values are their corresponding idf.
#     """
#     idfDict = {}
#     for word in countDict:
#         idfDict[word] = math.log(len(cleaned_review) / countDict[word])
#     return idfDict
  
# # Stores the idf dictionary
# idfDict = computeIDFDict()