<a href="https://colab.research.google.com/github/AhmedGabl/Sentiment-Analysis-AmazonReviews/blob/main/amazonReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk   ### nltk / spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

In [None]:
# Load the tab-separated Amazon Alexa reviews file into a DataFrame.
data = pd.read_csv('amazon_alexa.tsv', sep='\t')
# Bare expression: rendered as the cell output.
data

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


In [None]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0,0
rating,0
date,0
variation,0
verified_reviews,1
feedback,0


In [None]:
# Drop every row that contains a missing value (mutates `data` in place).
data.dropna(inplace=True)

In [None]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

715

In [None]:
# Remove the duplicated rows (mutates `data` in place).
data.drop_duplicates(inplace=True)

In [None]:
### Classification
### Check data imbalance: number of reviews per star rating
data['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,1756
4,345
1,141
3,113
2,79


In [None]:
# Number of reviews per value of the `feedback` column.
data['feedback'].value_counts()

Unnamed: 0_level_0,count
feedback,Unnamed: 1_level_1
1,2214
0,220


In [None]:
#### Data is imbalanced
#### Target ---> feedback
#### data ---> verified_reviews

In [None]:
#### 1.Lower case
# Lower-case every review via the vectorised `.str` accessor (overwrites the column).
data['verified_reviews'] = data['verified_reviews'].str.lower()

In [None]:
#### 2.special charcter
def clean_text(text):
    """Replace every character that is not an ASCII letter with a space.

    Args:
        text: Review text as a string.

    Returns:
        The input with each digit, punctuation mark or other non-letter
        character replaced by one space (the length is unchanged).
    """
    # The upper bound must be `Z`: the range `A-z` also covers the ASCII
    # characters that sit between 'Z' and 'a' ( [ \ ] ^ _ ` ), so those
    # symbols were silently kept in the text.
    return re.sub('[^a-zA-Z]', ' ', text)
# Apply clean_text to every review (overwrites the column).
data['verified_reviews'] = data['verified_reviews'].apply(clean_text)



In [None]:
# Fetch the 'punkt_tab' NLTK resource — presumably the tokenizer data that
# word_tokenize (used in the next step) needs; confirm for the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
### 3.Tokinzation
def tokenize(text):
    """Split one review string into a list of word tokens with NLTK."""
    tokens = word_tokenize(text)
    return tokens
# Replace each review string with its token list. Not idempotent: re-running
# this cell would pass lists, not strings, to word_tokenize.
data['verified_reviews'] = data['verified_reviews'].apply(tokenize)

In [None]:
# Inspect the tokenized reviews.
data['verified_reviews']

Unnamed: 0,verified_reviews
0,"[love, my, echo]"
1,"[loved, it]"
2,"[sometimes, while, playing, a, game, you, can,..."
3,"[i, have, had, a, lot, of, fun, with, this, th..."
4,[music]
...,...
2796,"[i, do, love, these, things, i, have, them, ru..."
2797,"[only, complaint, i, have, is, that, the, soun..."
2798,[good]
2799,"[nice, little, unit, no, issues]"


In [None]:
### 4.stopwords
nltk.download('stopwords')
# Keep the stop words in a set: remove_stopwords tests membership once per
# token, which is O(1) on a set but a linear scan on a list.
### Exclude 'not' and 'no' from the stop words: negations carry sentiment
stop_words = set(stopwords.words('english')) - {'not', 'no'}

def remove_stopwords(text):
    """Return the tokens of `text` that are not in the global `stop_words`."""
    kept = []
    for token in text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept
# Filter the stop words out of every token list (overwrites the column).
data['verified_reviews'] = data['verified_reviews'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
### 5.Lemmatization (POS-aware: the lemmatizer needs to know whether each word is a noun, verb, adjective or adverb)

import nltk

# Download the necessary resources
# (tagger models — presumably used by nltk.pos_tag — plus the WordNet data)
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng') # Download the missing resource

# Shared lemmatizer instance used by lemmatize_text.
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize tokens using each token's part-of-speech tag.

    Args:
        text: A list of word tokens. A raw string is also accepted and is
            tokenized first; iterating a string directly would treat every
            single character as a token.

    Returns:
        A list holding one lemma per token, in the original order.
    """
    if isinstance(text, str):
        text = word_tokenize(text)
    if not text:
        return []
    # Tag the whole sequence in a single call: the tagger uses neighbouring
    # tokens as context, so tagging one-word lists loses that information
    # and pays the tagger overhead once per word.
    tagged_tokens = nltk.pos_tag(list(text))
    lemmatized_words = []
    for word, treebank_tag in tagged_tokens:
        pos = get_wordnet_pos(treebank_tag)
        if pos:
            lemmatized_words.append(lemmatizer.lemmatize(word, pos=pos))
        else:
            # Tag has no WordNet counterpart: use the lemmatizer's default POS.
            lemmatized_words.append(lemmatizer.lemmatize(word))
    return lemmatized_words

def get_wordnet_pos(treebank_tag):
    """Map a Treebank-style POS tag to the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' or 'R' map to WordNet's ADJ, VERB, NOUN
    and ADV constants respectively; any other tag yields None.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Looked up only on a match, so unmatched tags never touch WordNet.
            return getattr(nltk.corpus.wordnet, constant_name)
    return None


# Lemmatize every token list (overwrites the column).
data['verified_reviews'] = data['verified_reviews'].apply(lemmatize_text)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [None]:
### 6.Stemming (Porter stemmer, applied on top of the lemmatized tokens)
from nltk.stem import PorterStemmer

# Shared Porter stemmer instance used by stimizer.
stemmer = PorterStemmer()
def stimizer(text):
    """Return the Porter stem of every token in `text`, preserving order."""
    stems = list(map(stemmer.stem, text))
    return stems
# Stem every token list (overwrites the column).
data['verified_reviews'] = data['verified_reviews'].apply(stimizer)

In [None]:
# Inspect the processed token lists.
data['verified_reviews']

Unnamed: 0,verified_reviews
0,"[love, echo]"
1,[love]
2,"[sometimes, play, game, answer, question, corr..."
3,"[lot, fun, thing, yr, old, learns, dinosaur, c..."
4,[music]
...,...
2796,"[love, thing, run, entire, home, tv, light, th..."
2797,"[complaint, sound, quality, great, mostly, use..."
2798,[good]
2799,"[nice, little, unit, no, issue]"


In [None]:
### Join each token list back into one space-separated string for the vectorizers
data['verified_reviews'] = data['verified_reviews'].apply(lambda x : ' '.join(x))

In [None]:
# Inspect the re-joined review strings.
data['verified_reviews']

Unnamed: 0,verified_reviews
0,love echo
1,love
2,sometimes play game answer question correctly ...
3,lot fun thing yr old learns dinosaur control l...
4,music
...,...
2796,love thing run entire home tv light thermostat...
2797,complaint sound quality great mostly use comma...
2798,good
2799,nice little unit no issue


In [None]:
# Bag-of-words (raw term count) features.
# NOTE(review): bow_x is not referenced by any later cell visible here.
bow = CountVectorizer()
bow_x = bow.fit_transform(data['verified_reviews'])

In [None]:
tf = TfidfVectorizer()
tf_x = tf.fit_transform(data['verified_reviews'])

In [None]:
### fix data imbalance
from imblearn.over_sampling import SMOTE
sm = SMOTE()
x_sm, y_sm = sm.fit_resample(tf_x, data['feedback'])

In [None]:
y_sm.value_counts()


Unnamed: 0_level_0,count
feedback,Unnamed: 1_level_1
1,2214
0,2214


In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_sm, y_sm)

In [None]:
# Decision tree baseline. random_state is fixed because the tree permutes
# features at each split, so unseeded fits can differ between runs.
dt = DecisionTreeClassifier(min_samples_leaf=5, min_samples_split=5, random_state=42)
dt.fit(x_train, y_train)

In [None]:
# Decision tree accuracy: training set first, then test set.
print(accuracy_score(y_train, dt.predict(x_train)))
print(accuracy_score(y_test, dt.predict(x_test)))

0.9373682625715146
0.8825654923215899


In [None]:
# Logistic regression with default hyper-parameters.
lr = LogisticRegression()
lr.fit(x_train, y_train)

In [None]:
# Logistic regression accuracy: training set first, then test set.
print(accuracy_score(y_train, lr.predict(x_train)))
print(accuracy_score(y_test, lr.predict(x_test)))

0.9467028003613369
0.9150858175248419


In [None]:
from sklearn.svm import SVC
# Support vector classifier with default hyper-parameters.
svc = SVC()
svc.fit(x_train, y_train)

In [None]:
# SVC accuracy: training set first, then test set.
print(accuracy_score(y_train, svc.predict(x_train)))
print(accuracy_score(y_test, svc.predict(x_test)))

0.989159891598916
0.993676603432701


In [None]:
#### Grid Search
from sklearn.model_selection import GridSearchCV
# Search grid: 4 kernels x 4 values of C.
param = {'kernel':['linear', 'rbf', 'poly', 'sigmoid'], 'C':[1, 10, 100, 1000]}
# Cross-validated search over the grid with GridSearchCV's default settings.
# NOTE(review): x_train holds SMOTE-resampled rows, so validation folds can
# contain synthetic neighbours of training rows — consider resampling inside
# an imblearn Pipeline so it happens per fold; confirm before trusting best_score_.
grid = GridSearchCV(svc, param)
grid.fit(x_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid.best_params_

{'C': 10, 'kernel': 'rbf'}

In [None]:
# Cross-validated score of the selected combination.
grid.best_score_

0.9786207989854153

In [None]:
# Keep a handle on the estimator selected by the grid search.
e  = grid.best_estimator_

In [None]:
# Predictions of the tuned estimator on the test features.
e.predict(x_test)

array([0, 0, 0, ..., 1, 1, 0])

In [None]:
### SVM , dt , lr
def text_preprocessing(text):
  """Turn one raw review string into a TF-IDF row for the trained models.

  Runs the same steps, in the same order, that were applied to the training
  column: lower-case -> clean_text -> tokenize -> remove_stopwords ->
  lemmatize_text -> stimizer -> join -> ``tf.transform``.

  Args:
    text: Raw review text.

  Returns:
    The one-row feature matrix produced by the fitted ``tf`` vectorizer.
  """
  # Reuse the training helpers instead of re-typing each step so inference
  # cannot drift from training. Both lemmatization and stemming are needed:
  # the training column went through lemmatize_text and then stimizer.
  tokens = tokenize(clean_text(text.lower()))
  tokens = remove_stopwords(tokens)
  tokens = stimizer(lemmatize_text(tokens))
  return tf.transform([' '.join(tokens)])

# Try the logistic regression model on a hand-written sentence.
text = text_preprocessing("I donot like it ")
lr.predict(text)

In [None]:
### SVM , dt , lr
# ... (your existing imports and functions) ...

def text_preprocessing(text):
  """Turn one raw review string into a TF-IDF row for the trained models.

  Note: this cell redefines ``text_preprocessing`` from the previous cell;
  this definition is the one in effect afterwards.

  Runs the same steps, in the same order, that were applied to the training
  column: lower-case -> clean_text -> tokenize -> remove_stopwords ->
  lemmatize_text -> stimizer -> join -> ``tf.transform``.

  Args:
    text: Raw review text.

  Returns:
    The one-row feature matrix produced by the fitted ``tf`` vectorizer.
  """
  # Reuse the training helpers so inference matches training exactly. The
  # training column was lemmatized AND then stemmed, so the vectorizer's
  # vocabulary consists of stems; lemmatizing alone would miss it.
  cleaned = clean_text(text.lower())
  tokens = remove_stopwords(tokenize(cleaned))
  tokens = lemmatize_text(tokens)
  tokens = stimizer(tokens)
  return tf.transform([' '.join(tokens)])

# Try the logistic regression model on a hand-written sentence.
text = text_preprocessing("I dont like it ")
lr.predict(text)


array([0])

In [None]:
# lemmatize_text works on a list of tokens, so tokenize the sentence first;
# handing it a raw string makes it iterate over single characters.
lemmatize_text(word_tokenize("I donot like it "))

['I',
 ' ',
 'd',
 'o',
 'n',
 'o',
 't',
 ' ',
 'l',
 'i',
 'k',
 'e',
 ' ',
 'i',
 't',
 ' ']

In [None]:
import pickle

# Persist the fitted vectorizer and models. Each file is opened in a `with`
# block so it is flushed and closed even if pickling raises; the bare
# open(...) calls left the handles for the garbage collector to close.
artifacts = {'lr.pkl': lr, 'tf.pkl': tf, 'dt.pkl': dt, 'svc.pkl': svc}
for filename, fitted_object in artifacts.items():
    with open(filename, 'wb') as fh:
        pickle.dump(fitted_object, fh)
