In [155]:
import pandas as pd 
import nltk 
from nltk.corpus import stopwords
import re 
from nltk.tokenize import word_tokenize
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


In [156]:
# Load the Alexa reviews file; it is tab-separated, hence sep='\t'.
# NOTE(review): this is an absolute path under one user's home directory, so the
# notebook cannot run on another machine as-is — consider a configurable DATA_DIR.
data = pd.read_csv(r'C:\Users\besha\OneDrive\Desktop\ODC\sentiment_NLP\dataset\amazon_alexa.tsv' , sep='\t')

In [157]:
data

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


In [158]:
# Count missing values per column BEFORE dropping them, and keep the counts as the
# cell's last expression: a bare expression in the middle of a cell is evaluated
# but never displayed, so the original check produced no visible output.
null_counts = data.isnull().sum()

# Drop incomplete rows by reassignment rather than inplace mutation.
data = data.dropna()

null_counts

In [159]:
data['feedback'].value_counts()

feedback
1    2893
0     256
Name: count, dtype: int64

In [160]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\besha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\besha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [161]:
# NOTE(review): x and y are bound here, before any text preprocessing, and are
# rebound to the SMOTE outputs (x_smote / y_smote) further down before they are
# ever read — these two assignments look dead; confirm and remove.
x = data['verified_reviews']
y = data['feedback']

In [162]:
## 1. Lowercase every review
# Done first so that later steps see a single casing; presumably this matters most for
# the stopword filter, which compares tokens verbatim against the stopword list — confirm.
data['verified_reviews'] = data['verified_reviews'].str.lower()

In [163]:
## 2.special character

def clean_text(text):
    """Return `text` with every character that is not an ASCII letter replaced by a space.

    Digits, punctuation, whitespace and non-ASCII characters each become exactly
    one space, so the length of the string is preserved.
    """
    return re.sub('[^a-zA-Z]', ' ', text)
data['verified_reviews'] = data['verified_reviews'].apply(clean_text)

In [164]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\besha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [165]:
## 3.tokenize 
def tokenize(text):
    """Split one review string into a list of word tokens using NLTK's word_tokenize."""
    tokens = word_tokenize(text)
    return tokens
data['verified_reviews'] = data['verified_reviews'].apply(tokenize)

In [166]:
## 4.remove stopwords
def remove_stopwords(text, stop=None):
    """Return the tokens of `text` that are not stopwords, preserving order.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.
    stop : collection of str, optional
        Words to drop. When omitted, NLTK's English stopword list is used.

    Returns
    -------
    list of str
        The tokens that are not in `stop`.
    """
    if stop is None:
        # Load the corpus once and keep it as a frozenset on the function object.
        # The original re-read the list for every row passed through .apply() and
        # then scanned it linearly for every token.
        if not hasattr(remove_stopwords, '_english'):
            remove_stopwords._english = frozenset(stopwords.words('english'))
        stop = remove_stopwords._english
    return [word for word in text if word not in stop]
data['verified_reviews'] = data['verified_reviews'].apply(remove_stopwords) 

In [167]:
### 5. stemming (Porter stemmer — note this is stemming, not lemmatization)
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
def stimizer(text):
    """Stem every token in `text` with the shared `stemmer` and return them as a list."""
    return list(map(stemmer.stem, text))
data['verified_reviews'] = data['verified_reviews'].apply(stimizer)

In [168]:
data['verified_reviews']

0                                            [love, echo]
1                                                  [love]
2       [sometim, play, game, answer, question, correc...
3       [lot, fun, thing, yr, old, learn, dinosaur, co...
4                                                 [music]
                              ...                        
3145                       [perfect, kid, adult, everyon]
3146    [listen, music, search, locat, check, time, lo...
3147    [love, thing, run, entir, home, tv, light, the...
3148    [complaint, sound, qualiti, great, mostli, use...
3149                                               [good]
Name: verified_reviews, Length: 3149, dtype: object

In [169]:
# Collapse each token list back into one space-separated string, the input
# format the TF-IDF vectorizer is given below.
data['verified_reviews'] = data['verified_reviews'].apply(' '.join)

In [170]:
data['verified_reviews']

0                                               love echo
1                                                    love
2       sometim play game answer question correctli al...
3       lot fun thing yr old learn dinosaur control li...
4                                                   music
                              ...                        
3145                            perfect kid adult everyon
3146    listen music search locat check time look weat...
3147    love thing run entir home tv light thermostat ...
3148    complaint sound qualiti great mostli use comma...
3149                                                 good
Name: verified_reviews, Length: 3149, dtype: object

In [171]:
tfidf = TfidfVectorizer()
tf_x = tfidf.fit_transform(data['verified_reviews'])
# TF-DIF

In [172]:
smote = SMOTE()
x_smote , y_smote = smote.fit_resample(tf_x , data['feedback'])

In [173]:
y_smote.value_counts()

feedback
1    2893
0    2893
Name: count, dtype: int64

In [174]:
y = y_smote 
x = x_smote  

In [175]:
x_train , x_test , y_train , y_test =train_test_split(x , y)

In [176]:
lr = LogisticRegression()
lr.fit(x_train , y_train)

In [177]:
pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)
print(accuracy_score(y_train , pred_train))
print(accuracy_score(y_test , pred_test))

0.9472228624106938
0.9198341395991707


In [178]:
# Fix the seed: the tree's split search involves randomness, so without
# random_state the fitted tree (and the accuracies printed below) can change
# from run to run.
desicion_tree = DecisionTreeClassifier(random_state = 42)
desicion_tree.fit(x_train , y_train)

In [179]:
pred_train = desicion_tree.predict(x_train)
pred_test = desicion_tree.predict(x_test)
print(accuracy_score(y_train , pred_train))
print(accuracy_score(y_test , pred_test))

0.9898594146116617
0.9343469246717346


In [180]:
svc = SVC()
svc.fit(x_train , y_train)

In [181]:
pred_train = svc.predict(x_train)
pred_test = svc.predict(x_test)
print(accuracy_score(y_train , pred_train))
print(accuracy_score(y_test , pred_test))

0.9889375432127219
0.9806496199032481


In [182]:
knn = KNeighborsClassifier()
knn.fit(x_train , y_train)

In [183]:
pred_train = knn.predict(x_train)
pred_test = knn.predict(x_test)
print(accuracy_score(y_train , pred_train))
print(accuracy_score(y_test , pred_test))

0.6619036644388108
0.6344160331720802


In [185]:
#### Grid Search
# Only search parameters that can change the fitted model:
#   * decision_function_shape is ignored for a binary target (feedback is 0/1), so
#     sweeping it only doubled the number of fits;
#   * gamma is not used by the linear kernel, so it is searched only for the
#     kernels that read it.
# This reduces the grid from 64 to 28 candidate settings.
param = [
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['poly', 'rbf', 'sigmoid'], 'C': [1, 10, 100, 1000], 'gamma': ['scale', 'auto']},
]
grid = GridSearchCV(svc, param)
grid.fit(x_train, y_train)

In [186]:
grid.best_estimator_

In [187]:
grid.best_score_

np.float64(0.9852497355678513)

In [188]:
model = grid.best_estimator_
model.predict(x_test)

array([1, 0, 1, ..., 1, 0, 1])

In [190]:
def text_preprocessing(text):
    """Convert one raw review string into a TF-IDF row the fitted models can score.

    Runs the same steps, in the same order, as the training cells: lowercase,
    strip non-letters, tokenize, remove stopwords, stem, re-join, then transform
    with the already-fitted `tfidf` vectorizer.

    Parameters
    ----------
    text : str
        A single raw review.

    Returns
    -------
    The result of `tfidf.transform` on the single cleaned review (one row).
    """
    # Lowercase first, as the training data was. Without it a capitalised
    # stopword (e.g. "Has") is not filtered out, gets stemmed, and can then
    # match an unrelated vocabulary term.
    text = clean_text(text.lower())
    # Reuse the training helpers instead of re-implementing each step, so the
    # inference path cannot drift from the training path.
    tokens = stimizer(remove_stopwords(tokenize(text)))
    return tfidf.transform([' '.join(tokens)])

text = text_preprocessing("I do like it ")
lr.predict(text)

array([1])

In [191]:
import pickle

# Persist the fitted vectorizer and classifier. Context managers guarantee the
# files are flushed and closed; the original passed open(...) inline and never
# closed the handles.
# NOTE(review): `svc` is the default-parameter SVC fitted earlier, not the tuned
# grid.best_estimator_ held in `model` — confirm which one is meant to be shipped.
with open('tf.pkl', 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)
with open('svc.pkl', 'wb') as svc_file:
    pickle.dump(svc, svc_file)