In [24]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import pickle
from bs4 import BeautifulSoup
%matplotlib inline

In [3]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS as sw

In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_core_interactivity = "all"

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
data = pd.read_csv("https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv")
data.head()
# 1 -pos, 0- neg

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0


In [7]:
#Data Prep
data["sentiment"].value_counts()

1    15000
0    15000
Name: sentiment, dtype: int64

In [8]:
data.isna().sum()

twitts       0
sentiment    0
dtype: int64

In [10]:
data[data.duplicated()]  #Same wishes can appear multiple times

Unnamed: 0,twitts,sentiment
4413,good morning!,1
7654,"libertines reunion i need a haircut, its gone...",1
8027,isPlayer Has Died! Sorry,0
8766,"cant afford to see Angels and Demons, so i wa...",0
11413,good morning!,1
13322,My tummy hurts,0
14252,headache,0
14960,Headache,0
14996,headache,0
15000,isPlayer Has Died! Sorry,0


In [11]:
#Model Training

def SVM(data):
    X = data["twitts"]
    y = data["sentiment"]

    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(X)

    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =0,test_size=0.2,stratify = y)

    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)

    clf = LinearSVC()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(y_pred.shape)
    
    print(classification_report(y_test,y_pred))
    
    return tfidf,clf

In [12]:
%%time
tfidf, clf = SVM(data)

(24000, 40854)
(6000, 40854)
(24000,)
(6000,)
(6000,)
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

Wall time: 3.93 s


In [13]:
x = ["very very pleasant bro"]
clf.predict(tfidf.transform(x))

array([1], dtype=int64)

### Data Cleaning and Retraining

In [14]:
data.head()


Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0


In [15]:
#lowercase conversion
data["twitts"] = data["twitts"].apply(lambda x: str(x).lower())


In [16]:
#Contarction to expansion

contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and ",
"won't": "would not",
'dis': 'this',
'bak': 'back',
'brng': 'bring'}

def cont_to_exp(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        return x
    else:
        return x


data["twitts"] = data["twitts"].apply(lambda x: cont_to_exp(x))

In [17]:
data.head()

Unnamed: 0,twitts,sentiment
0,@robbiebronniman sounds like a great night.,1
1,damn the person who stolde my wallet !!!!! ma...,1
2,greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars pretty pretty pretty please, pak...",0


In [18]:
SVM(data)

(24000, 40846)
(6000, 40846)
(24000,)
(6000,)
(6000,)
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.75      0.76      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



(TfidfVectorizer(), LinearSVC())

In [25]:
#remove emails
data["twitts"] = data["twitts"].apply(lambda x: re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)',"",x))

#remove urls
data["twitts"] = data["twitts"].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , x))

#remove rt
data["twitts"] = data["twitts"].apply(lambda x: re.sub('\brt\b','', x).strip())

#remove html tags
data["twitts"] = data["twitts"].apply(lambda x: BeautifulSoup(x,"lxml").get_text().strip())

#remove spcl chars
data["twitts"] = data["twitts"].apply(lambda x: re.sub('[^\w ]+',"", x).strip())

#remove extra spaces
data["twitts"] = data["twitts"].apply(lambda x: " ".join(x.split()))

In [28]:
#remove accented chars

import unicodedata
def remove_accented_chars(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return x

data["twitts"] = data["twitts"].apply(lambda x: remove_accented_chars(x))

In [30]:
#remove stopwords
data["twitts"] = data["twitts"].apply(lambda x: ' '.join([t for t in x.split() if t not in sw]))

In [31]:
tfidf,clf = SVM(data)

(24000, 42567)
(6000, 42567)
(24000,)
(6000,)
(6000,)
              precision    recall  f1-score   support

           0       0.72      0.72      0.72      3000
           1       0.72      0.73      0.72      3000

    accuracy                           0.72      6000
   macro avg       0.72      0.72      0.72      6000
weighted avg       0.72      0.72      0.72      6000



In [33]:
#Lemma
nlp = spacy.load('en_core_web_sm')
def make_to_base(x):
    x = str(x)
    x_list = []
    doc = nlp(x)
    
    for token in doc:
        lemma = token.lemma_
        if lemma == '-PRON-' or lemma == 'be':
            lemma = token.text

        x_list.append(lemma)
    return ' '.join(x_list)

data["twitts"] = data["twitts"].apply(lambda x: make_to_base(x))

KeyboardInterrupt: 

In [34]:
tfidf,clf = SVM(data)

(24000, 42567)
(6000, 42567)
(24000,)
(6000,)
(6000,)
              precision    recall  f1-score   support

           0       0.72      0.72      0.72      3000
           1       0.72      0.73      0.72      3000

    accuracy                           0.72      6000
   macro avg       0.72      0.72      0.72      6000
weighted avg       0.72      0.72      0.72      6000



### Fine Tuning the SVM Model

In [43]:
#Model Training

def SVM_tuned(data):
    X = data["twitts"]
    y = data["sentiment"]

    tfidf = TfidfVectorizer(norm="l1",ngram_range=(1,2),analyzer = "word",max_features = 7000)
#     (1,5),analyzer = "char"
    X = tfidf.fit_transform(X)

    X_train,X_test,y_train,y_test = train_test_split(X,y,random_state =0,test_size=0.2,stratify = y)

    print(X_train.shape)
    print(X_test.shape)
    print(y_train.shape)
    print(y_test.shape)

    clf = LinearSVC()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(y_pred.shape)
    
    print(classification_report(y_test,y_pred))
    
    return tfidf,clf

In [44]:
SVM_tuned(data)

(24000, 7000)
(6000, 7000)
(24000,)
(6000,)
(6000,)
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      3000
           1       0.72      0.73      0.72      3000

    accuracy                           0.72      6000
   macro avg       0.72      0.72      0.72      6000
weighted avg       0.72      0.72      0.72      6000



(TfidfVectorizer(max_features=7000, ngram_range=(1, 2), norm='l1'),
 LinearSVC())

### Save and Load Model

In [45]:
import pickle
pickle.dump(clf,open("model.pkl","wb"))
pickle.dump(tfidf,open("tfidf.pkl","wb"))

In [47]:
model = pickle.load(open("model.pkl","rb"))
tfidf = pickle.load(open("tfidf.pkl","rb"))

In [48]:
tfidf.vocabulary_

{'robbiebronniman': 31367,
 'sounds': 34552,
 'like': 21673,
 'great': 15282,
 'night': 26240,
 'damn': 9438,
 'person': 28348,
 'stolde': 35297,
 'wallet': 40290,
 'karma': 19951,
 'come': 8173,
 'bite': 4854,
 'ass': 3258,
 'greetings': 15333,
 'piano': 28565,
 'bench': 4494,
 'photo': 28523,
 'drewryanscott': 11128,
 'love': 22349,
 'haha': 15673,
 'forget': 13718,
 'hugyou': 17111,
 'kissno': 20527,
 'lie': 21605,
 'awesome': 3562,
 'kissthestars': 20528,
 'pretty': 29398,
 'pakidownload': 27766,
 'ito': 18353,
 'reupload': 31074,
 'someother': 34368,
 'site': 33730,
 'mediafire': 23681,
 'hindi': 16589,
 'mgwork': 24028,
 'ang': 2577,
 'mu': 25299,
 'skin': 33791,
 'upset': 39513,
 'lilyroseallen': 21734,
 'big': 4709,
 'pool': 29067,
 'paddling': 27714,
 'able': 1267,
 'manage': 22985,
 'paddle': 27713,
 'surroundings': 35951,
 'luxury': 22612,
 'arianna_skye': 3021,
 'hee': 16236,
 'tweet': 38832,
 'youre': 42283,
 '10001': 37,
 'lol': 22125,
 'happy': 15901,
 'tickets': 37619,
