# DATA CLEANING, SENTIMENT ANALYSIS, AND RECOMMENDATION

### Importing packages for sentiment analysis

#### We pass our dataset as input to an already-trained (pre-trained) sentiment model.

In [1]:
from transformers import pipeline
# Build the sentiment classifier used later to label each review.
# The checkpoint is named explicitly so results do not change if the library's
# default model for the "sentiment-analysis" task changes.
# NOTE(review): this is believed to be the checkpoint the task default resolved to
# for the transformers release installed here; confirm before relying on it.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)


  from .autonotebook import tqdm as notebook_tqdm
2022-10-20 21:07:14.005687: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-20 21:07:14.275510: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-10-20 21:07:14.349696: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-20 21:07:14.349733: I tensorflow/stream_executor/cuda/cu

#### Installing transformers package

In [2]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.23.1-py3-none-any.whl (5.3 MB)
Collecting regex!=2019.12.17
  Using cached regex-2022.9.13-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
Collecting huggingface-hub<1.0,>=0.10.0
  Using cached huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
Collecting tqdm>=4.27
  Using cached tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
Installing collected packages: regex, tokenizers, tqdm, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 regex-2022.9.13 tokenizers-0.13.1 tqdm-4.64.1 transformers-4.23.1


### Importing other libraries: pandas, pyarrow, matplotlib and seaborn

In [10]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns

### Adding our first dataset

In [11]:
# Load the product table from a Parquet file (path is relative to the notebook's working directory).
data_df = pd.read_parquet("dataset/data_1.parquet")

In [4]:
# Preview the first rows to check which columns were loaded.
data_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,price,brand,rating,no_rating
0,0,2,"Redmi 9A Sport (Coral Green, 2GB RAM, 32GB Sto...",6999,Redmi,4.2,"246,233 ratings"
1,1,5,"Poco M3 Pro 5G (Cool Blue, 4GB RAM, 64GB Storage)",13800,Generic,3.8,209 ratings
2,2,6,"Redmi Note 9 Pro (Interstellar Black, 4GB RAM,...",17999,Redmi,4.3,"130,124 ratings"
3,3,7,"MOTOROLA G40 Fusion (Frosted Champagne, 64 GB)...",15190,Motorola,3.8,31 ratings
4,4,8,"Redmi 8A Dual (Midnight Grey, 3GB RAM, 64GB St...",9590,Redmi,4.1,"100,452 ratings"


### Finding number of brands we have.

In [5]:
# Distinct brand names, in order of first appearance, as a plain Python list.
categories = data_df["brand"].unique().tolist()


In [6]:
# Number of distinct brands. The result is bound to its own name: assigning it to
# `len` would replace the builtin with an int and make every later `len(...)` call
# in the kernel fail with "'int' object is not callable".
n_brands = len(categories)
n_brands

29

In [8]:
# Summary statistics of the numeric columns; text columns (e.g. the still-unparsed
# `no_rating`) are not part of this table.
data_df.describe()

Unnamed: 0.1,Unnamed: 0,id,rating
count,219.0,219.0,219.0
mean,556.105023,940.990868,3.920548
std,501.40071,922.133913,0.518278
min,0.0,2.0,1.0
25%,111.5,188.5,3.8
50%,457.0,541.0,4.0
75%,925.5,1407.5,4.2
max,1659.0,3268.0,5.0


In [7]:
# Reduce values such as "246,233 ratings" or "1 rating" to their digits ("246233", "1").
# One regex pass removes both the thousands separators and the trailing word, and
# `.str.strip()` drops the space that used to be left behind in front of it.
# `regex=True` is spelled out so the call does not depend on the pandas default.
data_df["no_rating"] = (
    data_df["no_rating"]
    .str.replace(r",|ratings?", "", regex=True)
    .str.strip()
)
data_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,price,brand,rating,no_rating
0,0,2,"Redmi 9A Sport (Coral Green, 2GB RAM, 32GB Sto...",6999,Redmi,4.2,246233
1,1,5,"Poco M3 Pro 5G (Cool Blue, 4GB RAM, 64GB Storage)",13800,Generic,3.8,209
2,2,6,"Redmi Note 9 Pro (Interstellar Black, 4GB RAM,...",17999,Redmi,4.3,130124
3,3,7,"MOTOROLA G40 Fusion (Frosted Champagne, 64 GB)...",15190,Motorola,3.8,31
4,4,8,"Redmi 8A Dual (Midnight Grey, 3GB RAM, 64GB St...",9590,Redmi,4.1,100452


In [8]:
# Convert the cleaned rating-count strings to a numeric dtype so they can be compared and summarised.
# With default arguments, pd.to_numeric raises if any value cannot be parsed.
data_df["no_rating"] = pd.to_numeric(data_df["no_rating"])


In [9]:
# Summary statistics again: `no_rating` is numeric now, so it shows up in the table.
data_df.describe()

Unnamed: 0.1,Unnamed: 0,id,rating,no_rating
count,219.0,219.0,219.0,219.0
mean,556.105023,940.990868,3.920548,22629.863014
std,501.40071,922.133913,0.518278,53528.784516
min,0.0,2.0,1.0,1.0
25%,111.5,188.5,3.8,149.5
50%,457.0,541.0,4.0,2368.0
75%,925.5,1407.5,4.2,21293.5
max,1659.0,3268.0,5.0,246243.0


## FILTERING DATA BASED ON NUMBER OF RATINGS

### Assumption: products with a larger number of ratings have been bought by a larger number of people

In [10]:
# Keep products whose rating count lies strictly between 2368 and 220000.
# 2368 equals the median (50%) shown by the describe() output; the reason for the
# 220000 ceiling is not stated in the notebook (TODO confirm).
new_df = data_df.loc[
    lambda frame: (frame["no_rating"] > 2368) & (frame["no_rating"] < 220000)
]

In [27]:
# Preview the first rows of the filtered product table.
new_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,price,brand,rating,no_rating
2,2,6,"Redmi Note 9 Pro (Interstellar Black, 4GB RAM,...",17999,Redmi,4.3,130124
4,4,8,"Redmi 8A Dual (Midnight Grey, 3GB RAM, 64GB St...",9590,Redmi,4.1,100452
5,5,18,"Redmi Note 10S (Frost White, 6GB RAM, 64GB Sto...",13999,Redmi,4.2,80226
6,6,22,"Redmi Note 11 Pro + 5G (Stealth Black, 8GB RAM...",23999,Redmi,3.8,19379
7,7,23,Redmi 10 Prime (Phantom Black 4GB RAM 64GB | H...,10999,Xiaomi,4.1,55753


### REMOVING EMOJIS FROM THE SCRAPED REVIEWS

In [16]:
import re

# Character class of everything treated as an emoji or emoji-related symbol.
# Compiled once here rather than on every call.
# The BMP part stops at U+2BFF (plus a few explicitly listed code points), so
# ordinary text in scripts such as CJK, Kana or Hangul is left untouched.
_EMOJI_PATTERN = re.compile(
    "["
    "\u200d"                 # zero-width joiner that glues emoji sequences together
    "\u231a\u23cf\u23e9"     # watch, eject symbol, fast-forward
    "\u24c2-\u2bff"          # circled letters, box drawing, shapes, misc symbols, dingbats, arrows
    "\u3030\u303d"           # wavy dash, part alternation mark
    "\u3297\u3299"           # circled ideographs "congratulation" and "secret"
    "\ufe00-\ufe0f"          # variation selectors (U+FE0F requests emoji presentation)
    "\U00010000-\U0010ffff"  # all supplementary planes: emoticons, pictographs, flags, ...
    "]+"
)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Parameters
    ----------
    string : object
        The text to clean. It is converted with ``str()`` first, so non-string
        values are accepted (``None`` becomes the text ``"None"``).

    Returns
    -------
    str
        The input text without any character matched by ``_EMOJI_PATTERN``.
        Letters, digits and punctuation of ordinary scripts are preserved.
    """
    return _EMOJI_PATTERN.sub("", str(string))

In [12]:
# Load the scraped customer reviews from a Parquet file (path is relative to the working directory).
review_df = pd.read_parquet("dataset/review_2.parquet")

In [13]:
# Preview the first reviews to check the available columns.
review_df.head()

Unnamed: 0,id,name,data,reviewers_rating,review
0,0,Pranjal Dev,"Reviewed in India on October 15, 2022",5.0,"Very good phone. Camera, battery,body design a..."
1,0,Aaranyak B.,"Reviewed in India on October 13, 2022",5.0,Wanted to buy the Note 10 Pro for my mom. She ...
2,0,Mohit,"Reviewed in India on October 11, 2022",5.0,"Nice looking, battery backup everything is goo..."
3,0,siddhant bhardwaj,"Reviewed in India on October 10, 2022",5.0,Chinese product promote made in india products...
4,0,Naved Miyan,"Reviewed in India on October 10, 2022",5.0,I like it


In [14]:
# Work on an independent copy of the reviews: a plain assignment would only create
# a second name for the same DataFrame, so cleaning `new_df` would also rewrite
# `review_df`.
# NOTE(review): this rebinds `new_df`, which previously held the filtered product table.
new_df = review_df.copy()

In [17]:
# Strip emoji from every review text, element by element.
new_df["review"] = new_df["review"].apply(remove_emoji)

### REMOVING THE HTML TAGS

In [18]:
# Matches opening, closing and self-closing tags, with or without attributes
# (e.g. <br>, </b>, <br />, <a href="...">). The second alternative keeps
# bare <word> / <> brackets matching as well. A tag name must start with a letter
# right after "<" or "</", so text such as "price < 5000 and > 3000" is left alone.
_TAG_PATTERN = re.compile(r"</?[A-Za-z][^<>]*>|<\w*>")


def remove_tags(string):
    """Return ``string`` with HTML-style tags removed.

    Parameters
    ----------
    string : str
        The text to clean. Non-string input raises ``TypeError`` from ``re``.

    Returns
    -------
    str
        The text with every match of ``_TAG_PATTERN`` deleted; the content
        between tags is kept.
    """
    return _TAG_PATTERN.sub("", string)

In [19]:
# Strip HTML tags from every review text, element by element.
new_df["review"] = new_df["review"].apply(remove_tags)

### GIVING THE CLEANED DATA AS TEST DATA FOR SENTIMENT ANALYSIS

In [23]:
# Classify every cleaned review and collect exactly one label per row, in row order.
sentiment = []
for review_text in new_df["review"]:
    try:
        # truncation=True clips reviews that exceed the model's maximum input
        # length instead of letting the call fail on them.
        prediction = sentiment_pipeline(review_text, truncation=True)
        sentiment.append(prediction[0]["label"])
    except Exception:
        # Best effort: keep the list aligned with the rows by recording a
        # placeholder. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit stop the loop.
        sentiment.append("null")

In [24]:
# Attach the collected labels as a new column; the list holds one entry per review row.
new_df["sentiment"] = sentiment

### DATASET AFTER SENTIMENT ANALYSIS

In [25]:
# Preview the reviews together with their predicted sentiment label.
new_df.head()

Unnamed: 0,id,name,data,reviewers_rating,review,sentiment
0,0,Pranjal Dev,"Reviewed in India on October 15, 2022",5.0,"Very good phone. Camera, battery,body design a...",POSITIVE
1,0,Aaranyak B.,"Reviewed in India on October 13, 2022",5.0,Wanted to buy the Note 10 Pro for my mom. She ...,POSITIVE
2,0,Mohit,"Reviewed in India on October 11, 2022",5.0,"Nice looking, battery backup everything is goo...",POSITIVE
3,0,siddhant bhardwaj,"Reviewed in India on October 10, 2022",5.0,Chinese product promote made in india products...,NEGATIVE
4,0,Naved Miyan,"Reviewed in India on October 10, 2022",5.0,I like it,POSITIVE


In [None]:
# Second name for the labelled reviews. This is an alias, not a copy: both names
# refer to the same DataFrame object.
new1_df = new_df

In [46]:
# Per-product share of positive and negative reviews, as percentages.
N_PRODUCTS = 99            # product ids 0..98 are scanned
REVIEWS_PER_PRODUCT = 30   # denominator of the percentages; presumably the number
                           # of reviews scraped per product (TODO confirm)

d1_list = []  # percentage of POSITIVE reviews, indexed by product id
d2_list = []  # percentage of NEGATIVE reviews, indexed by product id
for i in range(N_PRODUCTS):
    product_reviews = new1_df[new1_df["id"] == i]
    # The label column is called "sentiment" where this notebook creates it.
    # Counts are taken as boolean-mask sums, so the cell does not depend on the
    # builtin `len`, which is easy to shadow in a long-lived kernel.
    n_positive = int((product_reviews["sentiment"] == "POSITIVE").sum())
    n_negative = int((product_reviews["sentiment"] == "NEGATIVE").sum())
    print("reviews" + str(i) + "-- positive review--" + str(n_positive))
    d1_list.append((n_positive / REVIEWS_PER_PRODUCT) * 100)
    print("reviews" + str(i) + "-- negative review--" + str(n_negative))
    d2_list.append((n_negative / REVIEWS_PER_PRODUCT) * 100)


reviews0-- positive review--19
reviews0-- negative review--11
reviews1-- positive review--19
reviews1-- negative review--10
reviews2-- positive review--12
reviews2-- negative review--16
reviews3-- positive review--8
reviews3-- negative review--22
reviews4-- positive review--8
reviews4-- negative review--22
reviews5-- positive review--15
reviews5-- negative review--12
reviews6-- positive review--15
reviews6-- negative review--12
reviews7-- positive review--8
reviews7-- negative review--16
reviews8-- positive review--18
reviews8-- negative review--10
reviews9-- positive review--17
reviews9-- negative review--10
reviews10-- positive review--13
reviews10-- negative review--17
reviews11-- positive review--8
reviews11-- negative review--16
reviews12-- positive review--12
reviews12-- negative review--18
reviews13-- positive review--13
reviews13-- negative review--17
reviews14-- positive review--18
reviews14-- negative review--10
reviews15-- positive review--15
reviews15-- negative review--11


In [47]:
# Load the second product table from a Parquet file (path is relative to the working directory).
new2_df = pd.read_parquet("dataset/data_2.parquet")

In [48]:
# Attach the per-product sentiment shares. Despite the column names, the values
# are percentages (count / 30 * 100), not raw counts.
# Assigning a list requires it to have one entry per row of `new2_df`.
new2_df['No. of Positive']=d1_list
new2_df['No. of Negative']=d2_list


In [49]:
# Show the first ten products with their positive / negative percentages.
new2_df.head(10)

Unnamed: 0,id,title,price,brand,rating,no_rating,No. of Positive,No. of Negative
0,0,"Redmi Note 9 Pro (Interstellar Black, 4GB RAM,...",17999,Redmi,4.3,130124,63.333333,36.666667
1,1,"Redmi 8A Dual (Midnight Grey, 3GB RAM, 64GB St...",9590,Redmi,4.1,100452,63.333333,33.333333
2,2,"Redmi Note 10S (Frost White, 6GB RAM, 64GB Sto...",13999,Redmi,4.2,80226,40.0,53.333333
3,3,"Redmi Note 11 Pro + 5G (Stealth Black, 8GB RAM...",23999,Redmi,3.8,19379,26.666667,73.333333
4,4,Redmi 10 Prime (Phantom Black 4GB RAM 64GB | H...,10999,Xiaomi,4.1,55753,26.666667,73.333333
5,5,"Redmi Note 11 (Horizon Blue, 6GB RAM, 128GB St...",14499,Redmi,4.1,40519,50.0,40.0
6,6,"Redmi Note 11 (Space Black, 6GB RAM, 64GB Stor...",13999,Redmi,4.1,40519,50.0,40.0
7,7,"Redmi 10A (Slate Grey, 4GB RAM, 64GB Storage) ...",8599,Redmi,3.8,5803,26.666667,53.333333
8,8,"Redmi Note 11T 5G (Matte Black, 8GB RAM, 128GB...",16999,Redmi,4.1,23497,60.0,33.333333
9,9,"OnePlus Nord CE 2 5G (Gray Mirror, 8GB RAM, 12...",24999,OnePlus,4.3,50123,56.666667,33.333333


In [50]:
# Persist the per-product table (not the per-review frame) so that the next step
# can reload it: it reads "dataset/final_data.csv" and expects a header row with
# `positive` / `negative` columns. The header is therefore kept and the two
# percentage columns are renamed on the way out; the row index is not written.
(
    new2_df
    .rename(columns={"No. of Positive": "positive", "No. of Negative": "negative"})
    .to_csv("dataset/final_data.csv", index=False)
)


In [62]:
# Reload the per-product table with its `positive` / `negative` percentage columns from CSV.
new4_df = pd.read_csv("dataset/final_data.csv")

In [63]:
# Show the first ten rows of the reloaded table.
new4_df.head(10)

Unnamed: 0,id,title,price,brand,rating,no_rating,positive,negative
0,0,"Redmi Note 9 Pro (Interstellar Black, 4GB RAM,...",17999,Redmi,4.3,130124,63.333333,36.666667
1,1,"Redmi 8A Dual (Midnight Grey, 3GB RAM, 64GB St...",9590,Redmi,4.1,100452,63.333333,33.333333
2,2,"Redmi Note 10S (Frost White, 6GB RAM, 64GB Sto...",13999,Redmi,4.2,80226,40.0,53.333333
3,3,"Redmi Note 11 Pro + 5G (Stealth Black, 8GB RAM...",23999,Redmi,3.8,19379,26.666667,73.333333
4,4,Redmi 10 Prime (Phantom Black 4GB RAM 64GB | H...,10999,Xiaomi,4.1,55753,26.666667,73.333333
5,5,"Redmi Note 11 (Horizon Blue, 6GB RAM, 128GB St...",14499,Redmi,4.1,40519,50.0,40.0
6,6,"Redmi Note 11 (Space Black, 6GB RAM, 64GB Stor...",13999,Redmi,4.1,40519,50.0,40.0
7,7,"Redmi 10A (Slate Grey, 4GB RAM, 64GB Storage) ...",8599,Redmi,3.8,5803,26.666667,53.333333
8,8,"Redmi Note 11T 5G (Matte Black, 8GB RAM, 128GB...",16999,Redmi,4.1,23497,60.0,33.333333
9,9,"OnePlus Nord CE 2 5G (Gray Mirror, 8GB RAM, 12...",24999,OnePlus,4.3,50123,56.666667,33.333333


In [67]:
# Recommendation rule: keep only products whose positive share is strictly larger
# than their negative share.
new4_df = new4_df.loc[lambda frame: frame["positive"] > frame["negative"]]

In [68]:
# Show the first ten products that passed the positive > negative filter.
new4_df.head(10)

Unnamed: 0,id,title,price,brand,rating,no_rating,positive,negative
0,0,"Redmi Note 9 Pro (Interstellar Black, 4GB RAM,...",17999,Redmi,4.3,130124,63.333333,36.666667
1,1,"Redmi 8A Dual (Midnight Grey, 3GB RAM, 64GB St...",9590,Redmi,4.1,100452,63.333333,33.333333
5,5,"Redmi Note 11 (Horizon Blue, 6GB RAM, 128GB St...",14499,Redmi,4.1,40519,50.0,40.0
6,6,"Redmi Note 11 (Space Black, 6GB RAM, 64GB Stor...",13999,Redmi,4.1,40519,50.0,40.0
8,8,"Redmi Note 11T 5G (Matte Black, 8GB RAM, 128GB...",16999,Redmi,4.1,23497,60.0,33.333333
9,9,"OnePlus Nord CE 2 5G (Gray Mirror, 8GB RAM, 12...",24999,OnePlus,4.3,50123,56.666667,33.333333
14,14,"Xiaomi 11T Pro 5G Hyperphone (Celestial Magic,...",36999,Xiaomi,4.0,4603,60.0,33.333333
15,15,"OnePlus Nord CE 2 5G (Bahamas Blue, 8GB RAM, 1...",24999,OnePlus,4.3,50129,50.0,36.666667
20,20,"OPPO A74 5G (Fluid Black, 6GB RAM, 128GB Stora...",14990,Oppo,4.2,29547,60.0,33.333333
22,22,"realme narzo 50 (Speed Blue, 6GB RAM+128GB Sto...",13249,Realme,4.3,9074,60.0,33.333333


In [69]:
# Summary statistics of the numeric columns for the remaining (recommended) products.
new4_df.describe()

Unnamed: 0,id,price,rating,no_rating,positive,negative
count,37.0,37.0,37.0,37.0,37.0,37.0
mean,46.567568,17106.783784,4.124324,24156.27027,58.018018,35.675676
std,31.013922,8409.973289,0.155287,27393.67276,8.106523,6.613905
min,0.0,99.0,3.8,4213.0,46.666667,23.333333
25%,22.0,10999.0,4.0,5953.0,50.0,33.333333
50%,42.0,14499.0,4.1,11944.0,56.666667,36.666667
75%,77.0,23999.0,4.3,33910.0,60.0,40.0
max,98.0,36999.0,4.3,130124.0,76.666667,46.666667
