In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import  CountVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv(r'C:\Users\amr22\OneDrive\Documents\Data-Set\Amazon-Alexa\amazon_alexa.tsv', sep='\t')
data

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


In [3]:
data['feedback'].value_counts()

feedback
1    2893
0     257
Name: count, dtype: int64

In [4]:
data['rating'].value_counts()

rating
5    2286
4     455
1     161
3     152
2      96
Name: count, dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3149 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [6]:
data.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    1
feedback            0
dtype: int64

In [7]:
data.duplicated().sum()

715

In [8]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [9]:
data.isnull().sum()

rating              0
date                0
variation           0
verified_reviews    0
feedback            0
dtype: int64

In [10]:
data.duplicated().sum()

0

In [11]:
data.shape

(2434, 5)

In [12]:
s = 'Amr Khaled 01156239572@#$$$$$$$$$$#####'
re.sub('[^a-zA-Z]', ' ', s)

'Amr Khaled                             '

In [13]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amr22\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amr22\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
data['verified_reviews'] = data['verified_reviews'].str.lower()
data['verified_reviews'] = data['verified_reviews'].apply(lambda x : re.sub('[^a-zA-Z]', ' ', x))
tokens = data['verified_reviews'].apply(lambda x : word_tokenize(x))
stop_words = set(stopwords.words('english'))
filtered_tokens = tokens.apply(lambda x : [word for word in x if word not in stop_words])
stemmer = PorterStemmer()
stemmed_tokens = filtered_tokens.apply(lambda x : [stemmer.stem(word) for word in x])

In [15]:
stemmed_tokens

0                                            [love, echo]
1                                                  [love]
2       [sometim, play, game, answer, question, correc...
3       [lot, fun, thing, yr, old, learn, dinosaur, co...
4                                                 [music]
                              ...                        
2796    [love, thing, run, entir, home, tv, light, the...
2797    [complaint, sound, qualiti, great, mostli, use...
2798                                               [good]
2799                            [nice, littl, unit, issu]
2800    [echo, dot, easi, set, use, help, provid, musi...
Name: verified_reviews, Length: 2434, dtype: object

In [16]:
data['verified_reviews']

0                                           love my echo 
1                                               loved it 
2       sometimes while playing a game  you can answer...
3       i have had a lot of fun with this thing  my   ...
4                                                   music
                              ...                        
2796    i do love these things  i have them running my...
2797    only complaint i have is that the sound qualit...
2798                                                 good
2799                          nice little unit  no issues
2800    the echo dot was easy to set up and use  it he...
Name: verified_reviews, Length: 2434, dtype: object

In [17]:
x = stemmed_tokens.apply(lambda x: ' '.join(x))
y = data['feedback']

In [18]:
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(x)
x.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)

In [20]:
y_resampled.value_counts()

feedback
1    2214
0    2214
Name: count, dtype: int64

In [21]:
x_train_smote, x_test_smote, y_train_smote, y_test_smote = train_test_split(x_resampled, y_resampled, stratify=y_resampled, test_size=0.2, random_state=42)

model_2 = RandomForestClassifier()
model_2.fit(x_train_smote, y_train_smote)

In [22]:
accuracy_score(y_train_smote, model_2.predict(x_train_smote)), accuracy_score(y_test_smote, model_2.predict(x_test_smote))

(0.98729531338227, 0.927765237020316)

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [24]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)

In [25]:
y_pred_train = model.predict(x_train)
accuracy_score(y_train, y_pred_train)

0.9938366718027735

In [26]:
y_pred_test = model.predict(x_test)
accuracy_score(y_test, y_pred_test)

0.893223819301848

In [27]:
model_1 = RandomForestClassifier()
model_1.fit(x_train, y_train)

In [28]:
y_pred_train = model_1.predict(x_train)
accuracy_score(y_train, y_pred_train)

0.9938366718027735

In [29]:
y_pred_test = model_1.predict(x_test)
accuracy_score(y_test, y_pred_test)

0.9096509240246407

In [30]:
model.get_depth()

165

In [31]:
def preprocessing_step(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
    stemmed_tokens = ' '.join(stemmed_tokens)
    return stemmed_tokens 

In [32]:
review = ['This is amazing product????',
          'this is bad product ever i hate it ??##$$']

for text in review:
    word = preprocessing_step(text)
    # print(word)
    vector_data = vectorizer.transform([word])
    prediction = model_2.predict(vector_data)
    print(prediction)

[1]
[0]


In [33]:
# pickle.dump(model_2, open('model.pkl', 'wb'))
# pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))