In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [1]:

# Download the NLTK 'stopwords' package so stopwords.words('english') can be used below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Disha
[nltk_data]     Suryawanshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:

# Show the English stop-word list; the stemming step later drops these words.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
# Load the tweets CSV without a header row (header=None); columns get names in a later cell.
# NOTE(review): dataset.shape reports 1,048,576 rows although the file name says 1,600,000.
# 1,048,576 is Excel's maximum row count, so this copy may have been truncated by a
# spreadsheet round-trip (which would also explain the 800000 / 248576 class split) - confirm.
dataset = pd.read_csv("training.1600000.processed.noemoticoncsv.csv", encoding = 'latin', header=None)

In [5]:
# Preview the first rows; columns still carry the default integer labels 0-5.
dataset.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# Give the six positional columns descriptive labels.
col_names = [
    'target',
    'id',
    'date',
    'flag',
    'user',
    'text',
]
dataset.columns = col_names

In [8]:
# Preview again to confirm the new column names were applied.
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
# (rows, columns) of the loaded frame.
dataset.shape

(1048576, 6)

In [9]:
# Count missing values per column.
dataset.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [10]:
# Distribution of tweets per raw sentiment label (0 and 4 at this point).
dataset['target'].value_counts()

target
0    800000
4    248576
Name: count, dtype: int64

In [11]:
# Relabel sentiment: 0 stays 0 (negative), 4 becomes 1 (positive).
# replace() only rewrites the 4s, so re-running this cell is a no-op. The previous
# map({0:0 , 4:1}) turned the already-converted 1s into NaN on a second run.
dataset['target'] = dataset['target'].replace({4: 1})

In [12]:
# Re-check the label distribution after relabelling.
dataset['target'].value_counts()

target
0    800000
1    248576
Name: count, dtype: int64

In [13]:
# Stemming

stremmer = PorterStemmer()

# Build the stop-word set once. stopwords.words('english') hands back a list, so
# calling it for every token (as before) meant a fresh list plus a linear scan per
# word; a frozenset gives constant-time membership tests across the whole corpus.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def stemming(content):
    """Normalise one piece of text into a space-joined string of stemmed tokens.

    Steps: replace every character outside a-z/A-Z with a space, lower-case,
    split on whitespace, drop English stop words, and Porter-stem what remains.

    Parameters
    ----------
    content : str
        Raw text (e.g. a tweet).

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; empty string if nothing is left.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        stremmer.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS
    )

In [14]:
# Clean and stem every tweet. This overwrites the raw 'text' column in place, so the
# original tweets are only recoverable by re-running the load cell.
dataset['text'] = dataset['text'].apply(stemming)

In [15]:
# Preview the frame with the stemmed text.
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass behav mad see


In [16]:
# Model input is the tweet text column; the label is the sentiment target column.
x, y = dataset['text'], dataset['target']

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [18]:
# Convert textual data to numerical TF-IDF features.
# The vectorizer is fitted on the training split only and then applied to the test split.
# NOTE: x_train / x_test are rebound from text to matrices here, so this cell cannot be
# re-run on its own - re-run the split cell first.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [19]:
# Print the training matrix; the output lists (row, column) positions with their weights.
print(x_train)

  (0, 291304)	0.17055998611630738
  (0, 289302)	0.17816844426143663
  (0, 76086)	0.21382091962433425
  (0, 112604)	0.25561610965533327
  (0, 274522)	0.38130310100762155
  (0, 310698)	0.21680337071817443
  (0, 131032)	0.3331036865429313
  (0, 170973)	0.22040622473443117
  (0, 260364)	0.29670667537013146
  (0, 293962)	0.20602385450665414
  (0, 234038)	0.45145133581842173
  (0, 128226)	0.3766757235221846
  (1, 317745)	0.42535785427885203
  (1, 157058)	0.22621289781890544
  (1, 85990)	0.3282439547797006
  (1, 107650)	0.4193685318904471
  (1, 10368)	0.6959054255244007
  (2, 8913)	0.25771109649346824
  (2, 279759)	0.3944812315505615
  (2, 190385)	0.1875718760908149
  (2, 10708)	0.21560150307066792
  (2, 16861)	0.417701223254919
  (2, 18836)	0.6263527172899958
  (2, 155707)	0.3598753860806687
  (3, 47918)	0.8331272413428771
  :	:
  (838856, 107173)	0.23745127839181498
  (838856, 203758)	0.15767922730814166
  (838856, 22324)	0.1683979953360947
  (838856, 255411)	0.2287902259310824
  (838856, 2

In [20]:
# Training the model: logistic regression with default settings on the TF-IDF features.
model = LogisticRegression()
model.fit(x_train , y_train)

In [21]:
# Testing the model: accuracy on the held-out test split.
# NOTE(review): the labels are imbalanced (800000 vs 248576 per value_counts), so accuracy
# alone may flatter the model - consider also reporting per-class precision/recall.
y_pred = model.predict(x_test)
print(accuracy_score(y_test , y_pred))

0.8324114516775067


In [22]:
# Function to predict the sentiment
def predict_sentiment(text):
    """Classify a single piece of text as "Negative" or "Positive".

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        "Negative" when the model predicts label 0, otherwise "Positive".
    """
    # Reuse the training-time preprocessing instead of a second copy of it, so the
    # two code paths cannot drift apart.
    features = vectorizer.transform([stemming(text)])
    # predict() returns one label per input row; take the single label explicitly
    # rather than comparing the whole array to 0.
    label = model.predict(features)[0]
    return "Negative" if label == 0 else "Positive"

In [25]:
# Smoke-test the prediction helper on one clearly negative and one clearly positive phrase.
print(predict_sentiment("I hate you"))
print(predict_sentiment("I love you"))

Negative
Positive


In [26]:
# Save the model
import pickle
# Use a context manager so the file is flushed and closed even if dump() raises;
# the bare open() passed to dump() previously left the handle unclosed.
with open('model.pkl' , 'wb') as model_file:
    pickle.dump(model , model_file)

In [27]:
# Save the fitted vectorizer next to the model; both are needed to score new text.
# The context manager closes the file deterministically (the bare open() leaked it).
with open('vectorizer.pkl' , 'wb') as vectorizer_file:
    pickle.dump(vectorizer , vectorizer_file)