In [30]:
import numpy as np
import pandas as pd



In [31]:
# Load the labelled tweets from CSV and preview the first rows to check the columns.
data = pd.read_csv('../dataset/sentiment_analysis.csv')
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [32]:
# Count missing values per column before any text cleaning.
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

## Text Preprocessing

In [33]:
import re
import string

Convert uppercase to lowercase

In [34]:
# Lowercase every whitespace-separated word; re-joining on single spaces also
# collapses any repeated whitespace inside a tweet.
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(word.lower() for word in tweet.split())
)
data["tweet"].head(5)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

Remove Links


In [35]:
# Blank out every word that starts with an http:// or https:// URL. The word is
# replaced by an empty string rather than dropped, so a removed link leaves a
# double space behind in the joined text.
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(
        re.sub(r'^https?:\/\/.*[\r\n]*', '', word, flags=re.MULTILINE)
        for word in tweet.split()
    )
)
data["tweet"].head(5)

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

Remove Punctuation


In [36]:
def remove_punctuations(text):
    """Return ``text`` with every ``string.punctuation`` character replaced by a space."""
    # One translation table maps each punctuation mark to a single space.
    blank_out = str.maketrans({mark: ' ' for mark in string.punctuation})
    return text.translate(blank_out)

# Apply the punctuation stripper to every tweet and preview the result.
data["tweet"] = data["tweet"].apply(remove_punctuations)
data["tweet"].head(5)

0     fingerprint  pregnancy test   android  apps  ...
1    finally a transparant silicon case    thanks t...
2    we love this  would you go   talk  makememorie...
3    i m wired i know i m george i was made that wa...
4    what amazing service  apple won t even talk to...
Name: tweet, dtype: object

Remove Numbers


In [37]:
# Delete every run of digits from the tweets (regex=True so \d+ is treated as a pattern).
data["tweet"] = data['tweet'].str.replace(r'\d+', '', regex=True)
data.head()


Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps ...
1,2,0,finally a transparant silicon case thanks t...
2,3,0,we love this would you go talk makememorie...
3,4,0,i m wired i know i m george i was made that wa...
4,5,1,what amazing service apple won t even talk to...


Remove Stopwords

In [38]:
import nltk

In [39]:
# Download into the lowercase '../static/model' directory, the same one the
# stopword file is read from and the vocabulary is written to. The previous
# '../Static/model' spelling is a different directory on case-sensitive
# filesystems, which makes the later read fail on a fresh checkout.
nltk.download('stopwords', download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../Static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
# Read the downloaded English stopword file into a list, one entry per line.
with open('../static/model/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()

In [41]:
# Remove stopwords from every tweet. Membership is tested against a set built
# once from `sw`, so each lookup is constant-time instead of a scan of the list;
# the resulting text is unchanged.
stopword_set = set(sw)
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stopword_set)
)

In [42]:
from nltk.stem import PorterStemmer

# Replace each remaining word with its Porter stem.
ps = PorterStemmer()
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(ps.stem(word) for word in tweet.split())
)

### Building Vocabulary

In [43]:
from collections import Counter
# Token -> occurrence count; starts empty.
vocab = Counter()

In [44]:
# Add the whitespace-separated tokens of every cleaned tweet to the counter.
for sentences in data["tweet"]:
    vocab.update(sentences.split())

In [45]:
# Keep only the tokens that occur more than 10 times across all tweets.
tokens = [token for token, count in vocab.items() if count > 10]
len(tokens)

1169

In [46]:

def save_vocabulary(lines, filename):
    """Write the vocabulary to ``filename`` as UTF-8 text, one token per line.

    Args:
        lines: Iterable of token strings.
        filename: Destination path; an existing file is overwritten.

    Tokens are joined with newlines and no trailing newline is written.
    """
    # The context manager closes the handle even if the write raises.
    with open(filename, 'w', encoding="utf-8") as file:
        file.write('\n'.join(lines))

# Persist the filtered vocabulary to disk.
save_vocabulary(tokens, '../static/model/vocabulary.txt')

### Divide dataset into train and test sets

In [47]:
# Features are the cleaned tweet text; the target is the label column.
X = data["tweet"]
y = data["label"]

In [48]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Vectorization


In [49]:
def vectorizer(ds, vocabulary):
    """Encode sentences as binary bag-of-words rows over ``vocabulary``.

    Args:
        ds: Iterable of strings; each one is tokenised with ``str.split()``.
        vocabulary: Sequence of tokens; column ``i`` corresponds to
            ``vocabulary[i]``.

    Returns:
        A ``float32`` array of shape ``(len(ds), len(vocabulary))`` where entry
        ``[r, c]`` is 1 if ``vocabulary[c]`` occurs as a whole token in
        sentence ``r`` and 0 otherwise. An empty ``ds`` yields shape
        ``(0, len(vocabulary))`` so the result is always two-dimensional.
    """
    sentences = list(ds)
    matrix = np.zeros((len(sentences), len(vocabulary)), dtype=np.float32)

    for row, sentence in enumerate(sentences):
        # Tokenise once per sentence; a set gives constant-time membership tests
        # instead of re-splitting and scanning the sentence for every token.
        words = set(sentence.split())
        for col, token in enumerate(vocabulary):
            if token in words:
                matrix[row, col] = 1

    return matrix

In [50]:
# Encode the training tweets as 0/1 rows over the `tokens` vocabulary.
vectorized_x_train = vectorizer(X_train, tokens)
vectorized_x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [51]:
# Encode the test tweets with the same vocabulary used for the training split.
vectorized_x_test = vectorizer(X_test, tokens)

### Handle imbalanced dataset

In [52]:
from imblearn.over_sampling import SMOTE

# Resample the training split only; the test split is left untouched. A fixed
# random_state makes the synthetic samples reproducible between runs.
smote = SMOTE(random_state=42)
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(9422, 1169) (9422,)


## Model Training and Evaluation

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [54]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def _print_scores(title, y_act, y_pred):
    """Print accuracy, precision, recall and F1 (rounded to 3 places) under ``title``."""
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'{title}:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')


def training_scores(y_act, y_pred):
    """Print the metric report for training predictions.

    Args:
        y_act: True labels.
        y_pred: Predicted labels, aligned with ``y_act``.
    """
    _print_scores('Training Scores', y_act, y_pred)


def validation_scores(y_act, y_pred):
    """Print the metric report for held-out predictions.

    Args:
        y_act: True labels.
        y_pred: Predicted labels, aligned with ``y_act``.
    """
    # The printed heading stays 'Testing Scores', as in the recorded outputs.
    _print_scores('Testing Scores', y_act, y_pred)

### Logistic Regression

In [59]:
# Fit logistic regression on the SMOTE-resampled training matrix.
lr = LogisticRegression()
lr.fit(vectorized_x_train_smote, y_train_smote)

# Score on the resampled training data and on the untouched test split.
y_train_pred = lr.predict(vectorized_x_train_smote)
y_test_pred = lr.predict(vectorized_x_test)
training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.94
	Precision = 0.915
	Recall = 0.97
	F1-Score = 0.942
Testing Scores:
	Accuracy = 0.878
	Precision = 0.716
	Recall = 0.855
	F1-Score = 0.78
