In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('../artifacts/sentiment_analysis.csv')
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


# Text preprocessing

In [6]:
import re
import string
data["tweet"] = data["tweet"].apply(lambda x:" ".join(x.lower() for x in x.split()))

In [7]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test https://goo.gl/h1...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...


In [8]:
data["tweet"] = data["tweet"].apply(lambda x:" ".join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))

In [9]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #pregnancy test #android #apps #...
1,2,0,finally a transparant silicon case ^^ thanks t...
2,3,0,we love this! would you go? #talk #makememorie...
3,4,0,i'm wired i know i'm george i was made that wa...
4,5,1,what amazing service! apple won't even talk to...


# Remove punctuation

In [11]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation,'')
    return text

data["tweet"] = data["tweet"].apply(remove_punctuations)

In [14]:
data.head()



Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beaut...
1,2,0,finally a transparant silicon case thanks to ...
2,3,0,we love this would you go talk makememories un...
3,4,0,im wired i know im george i was made that way ...
4,5,1,what amazing service apple wont even talk to m...


In [13]:
data["tweet"] = data["tweet"].str.replace('\d','',regex=True)

# Remove stopwords


In [16]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 252.7 kB/s eta 0:00:00
Collecting click
  Using cached click-8.1.3-py3-none-any.whl (96 kB)
Collecting joblib
  Downloading joblib-1.3.0-py3-none-any.whl (301 kB)
     ------------------------------------ 301.9/301.9 kB 192.4 kB/s eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2023.6.3-cp311-cp311-win_amd64.whl (268 kB)
     ------------------------------------ 268.0/268.0 kB 211.5 kB/s eta 0:00:00
Installing collected packages: regex, joblib, click, nltk
Successfully installed click-8.1.3 joblib-1.3.0 nltk-3.8.1 regex-2023.6.3



[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
import nltk

In [18]:
nltk.download('stopwords',download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
with open('../static/model/corpora/stopwords/english','r') as file:
    sw = file.read().splitlines()

In [21]:
data["tweet"] = data["tweet"].apply(lambda x:" ".join(x for x in x.split() if x not in sw))

In [22]:
data["tweet"].head()

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    love would go talk makememories unplug relax i...
3    im wired know im george made way iphone cute d...
4    amazing service apple wont even talk question ...
Name: tweet, dtype: object

# Stemming

In [23]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [24]:
data["tweet"] = data["tweet"].apply(lambda x:" ".join(ps.stem(x) for x in x.split()))

In [25]:
data["tweet"].head()

0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax iph...
3    im wire know im georg made way iphon cute dave...
4    amaz servic appl wont even talk question unles...
Name: tweet, dtype: object

# Building vacabulary

In [26]:
from collections import Counter

In [30]:
vocab = Counter()

In [33]:
for sentence in data['tweet']:
    vocab.update(sentence.split())

In [39]:
len(vocab)
data.shape

(7920, 3)

In [35]:
tokens = [key for key in vocab if vocab[key]>10]

In [36]:
len(tokens)

1145

In [37]:
data.shape

(7920, 3)

In [40]:
def save_vocabulary(lines,filename):
    data = '\n'.join(lines)
    file = open(filename,'w',encoding="utf-8")
    file.write(data)
    file.close()
    
save_vocabulary(tokens,'../static/model/vocabulary.txt')

# Divide dataset

In [110]:
X = data['tweet']
y = data['label']

In [111]:
!pip install scikit-learn




[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [112]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [113]:
X_train.shape

(6336,)

In [114]:
X_test.shape


(1584,)

In [115]:
y_test

7671    0
1089    0
1960    0
907     0
5202    0
       ..
2354    0
4197    0
5431    0
5128    0
7709    0
Name: label, Length: 1584, dtype: int64

# Vectorization

In [116]:
def vectorizer(ds,vocabulary):
    vectorized_1st = []
    
    for sentence in ds:
        sentence_1st = np.zeros(len(vocabulary))
        
        for i in range(len(vocabulary)):
            if vocabulary[i] in sentence.split():
                sentence_1st[i] = 1
        
        vectorized_1st.append(sentence_1st)
    vectorized_1st_new = np.asarray(vectorized_1st,dtype = np.float32)
    
    return vectorized_1st_new

In [117]:
vectorized_x_train = vectorizer(X_train,tokens)

In [118]:
vectorized_x_test = vectorizer(X_test,tokens)

In [119]:
vectorized_x_train


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [120]:
vectorized_x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [121]:
y_train

2520    0
611     0
3366    0
2532    1
6757    0
       ..
6691    1
6562    0
2243    0
4528    0
2605    0
Name: label, Length: 6336, dtype: int64

In [122]:
y_test


7671    0
1089    0
1960    0
907     0
5202    0
       ..
2354    0
4197    0
5431    0
5128    0
7709    0
Name: label, Length: 1584, dtype: int64

In [123]:
y_train.value_counts()

label
0    4718
1    1618
Name: count, dtype: int64

# handle imbalanced dataset


In [124]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
     ------------------------------------ 226.0/226.0 kB 431.5 kB/s eta 0:00:00
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.10.1



[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [125]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
vectorized_x_train_smote,y_train_smote = smote.fit_resample(vectorized_x_train,y_train)
print(vectorized_x_train_smote.shape,y_train_smote.shape)

(9436, 1145) (9436,)


In [126]:
y_train_smote.value_counts()

label
0    4718
1    4718
Name: count, dtype: int64

# Model Traning and evalution 

In [128]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [129]:
from sklearn.metrics import accuracy_score, f1_score, precision_score,recall_score

In [130]:
def trainig_scores(y_act,y_pred):
    acc = round(accuracy_score(y_act,y_pred),3)
    pr = round(precision_score(y_act,y_pred),3)
    rec = round(recall_score(y_act,y_pred),3)
    f1 = round(f1_score(y_act,y_pred),3)
    print(f'Training scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-score = {f1}')
    
    
def validation_scores(y_act,y_pred):
    acc = round(accuracy_score(y_act,y_pred),3)
    pr = round(precision_score(y_act,y_pred),3)
    rec = round(recall_score(y_act,y_pred),3)
    f1 = round(f1_score(y_act,y_pred),3)
    print(f'Testing scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-score = {f1}')
    

# Logistic Regression

In [136]:
lr = LogisticRegression()
lr.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred = lr.predict(vectorized_x_train_smote)
y_test_pred = lr.predict(vectorized_x_test)
trainig_scores(y_train_smote,y_train_pred)
validation_scores(y_test,y_test_pred)

Training scores:
	Accuracy = 0.942
	Precision = 0.919
	Recall = 0.969
	F1-score = 0.943
Testing scores:
	Accuracy = 0.865
	Precision = 0.702
	Recall = 0.826
	F1-score = 0.759


# Naive Bayes

In [137]:
mnb = MultinomialNB()
mnb.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred = mnb.predict(vectorized_x_train_smote)
y_test_pred = mnb.predict(vectorized_x_test)
trainig_scores(y_train_smote,y_train_pred)
validation_scores(y_test,y_test_pred)

Training scores:
	Accuracy = 0.906
	Precision = 0.87
	Recall = 0.954
	F1-score = 0.91
Testing scores:
	Accuracy = 0.866
	Precision = 0.68
	Recall = 0.904
	F1-score = 0.776


# Decision tree

In [138]:
dt = DecisionTreeClassifier()
dt.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred = dt.predict(vectorized_x_train_smote)
y_test_pred = dt.predict(vectorized_x_test)
trainig_scores(y_train_smote,y_train_pred)
validation_scores(y_test,y_test_pred)

Training scores:
	Accuracy = 1.0
	Precision = 1.0
	Recall = 0.999
	F1-score = 1.0
Testing scores:
	Accuracy = 0.836
	Precision = 0.7
	Recall = 0.64
	F1-score = 0.668


# Random Forest

In [141]:
rf = RandomForestClassifier()
rf.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred = rf.predict(vectorized_x_train_smote)
y_test_pred = rf.predict(vectorized_x_test)
trainig_scores(y_train_smote,y_train_pred)
validation_scores(y_test,y_test_pred)

Training scores:
	Accuracy = 1.0
	Precision = 1.0
	Recall = 0.999
	F1-score = 1.0
Testing scores:
	Accuracy = 0.863
	Precision = 0.755
	Recall = 0.694
	F1-score = 0.723


# Support vector machine

In [142]:
svc = SVC()
svc.fit(vectorized_x_train_smote,y_train_smote)
y_train_pred = svc.predict(vectorized_x_train_smote)
y_test_pred = svc.predict(vectorized_x_test)
trainig_scores(y_train_smote,y_train_pred)
validation_scores(y_test,y_test_pred)

Training scores:
	Accuracy = 0.978
	Precision = 0.962
	Recall = 0.997
	F1-score = 0.979
Testing scores:
	Accuracy = 0.877
	Precision = 0.74
	Recall = 0.804
	F1-score = 0.771
