## Importing libraries

In [2]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Fetch the NLTK 'stopwords' corpus; the call reports "already up-to-date"
# and does nothing more when the corpus is present locally.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\office\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Display NLTK's English stop-word list (the words that `stemming` filters out)
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Data processing

In [6]:
# Load the raw tweets. `names=` supplies the six column labels explicitly, so
# every line of the CSV is read as data, and the file is decoded as ISO-8859-1.
twitter_data = pd.read_csv('dataset.csv',encoding='ISO-8859-1', names=['target', 'id', 'date', 'flag', 'user', 'text'] )
# Preview 10 random rows (no random_state, so the preview differs between runs)
twitter_data.sample(10)

Unnamed: 0,target,id,date,flag,user,text
957030,4,1825369471,Sun May 17 05:26:44 PDT 2009,NO_QUERY,AstroMeg,Quarry was fun. Got some gear stuck and had to...
211752,0,1974567876,Sat May 30 13:23:56 PDT 2009,NO_QUERY,veence,Showered and getting ready for work. fail.
356652,0,2044358377,Fri Jun 05 09:14:30 PDT 2009,NO_QUERY,chrisssti,"@lukesterluke , I can't! My toe is f'd up, can..."
299405,0,1997999919,Mon Jun 01 17:49:27 PDT 2009,NO_QUERY,GaByAFS,"i want to enter in my old my space , but i fo..."
65053,0,1690792816,Sun May 03 16:23:27 PDT 2009,NO_QUERY,SirJHenry,@Kikisings that's how you feel?!?!!!??? Wow......
678448,0,2248977225,Fri Jun 19 21:30:26 PDT 2009,NO_QUERY,lenasaur,@KiraKiraStudio im a vegan so i probably would...
1036413,4,1956340775,Thu May 28 21:40:19 PDT 2009,NO_QUERY,davidfromant,iPhone Doom on the way http://tinyurl.com/qljx...
1040050,4,1956897989,Thu May 28 22:58:38 PDT 2009,NO_QUERY,Jazzy_J21,"Someone's typing, i don't even have to @ reply..."
520457,0,2192116986,Tue Jun 16 06:32:09 PDT 2009,NO_QUERY,KMDeSilvio,@nannypalooza I had that this weeked it was t...
1339023,4,2018421085,Wed Jun 03 09:56:30 PDT 2009,NO_QUERY,JotaCapo,Lunching with the hottest redhead on TV.


In [41]:
Specific_Rows = twitter_data.sample(n=500000, random_state=42) # Select 500,000 random rows of the dataset (seeded, so the subset is reproducible)
Specific_Rows

Unnamed: 0,target,id,date,flag,user,text
541200,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!
750,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo..."
766711,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...
285055,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...
705995,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem
...,...,...,...,...,...,...
1085389,1,1969163759,Fri May 29 23:12:12 PDT 2009,NO_QUERY,Izaaza,is going out with Katuty. some pampering to do~
753135,0,2286778000,Mon Jun 22 16:54:48 PDT 2009,NO_QUERY,sega_123,jeanette is sick. and taking a nap. but her b...
466554,0,2175587755,Mon Jun 15 01:24:51 PDT 2009,NO_QUERY,din_heima,wanna make egg custard with coconut milk but t...
1471328,1,2065195932,Sun Jun 07 08:05:55 PDT 2009,NO_QUERY,AliciaSanera,@employerbrander Shh... I read the paper on Su...


In [43]:
# Dimensions of the full dataset as (rows, columns)
twitter_data.shape

(1600000, 6)

In [45]:
# Count missing values in each column of the full dataset
twitter_data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [47]:
# Class balance of the sampled subset: number of rows per target label
Specific_Rows['target'].value_counts()

target
1    250625
0    249375
Name: count, dtype: int64

## Convert target label 4 to 1

In [35]:
# Relabel positive tweets from 4 to 1 so the target is binary
# (0 = negative, 1 = positive). The result is assigned back rather than using
# inplace=True, so the cell states explicitly which object it produces.
Specific_Rows = Specific_Rows.replace({'target': {4: 1}})

In [37]:
# Re-check the label counts after the 4 -> 1 relabelling
Specific_Rows['target'].value_counts()

target
1    50057
0    49943
Name: count, dtype: int64

* '0' : Negative tweet
* '1' : Positive tweet

## Stemming
Stemming is the process of reducing a word to its root form.

example: actor, actress, acting -> act

In [16]:
# Single Porter stemmer instance; `stemming` calls its .stem() on every word
port_stem = PorterStemmer()

In [17]:
# English stop words, cached as a frozenset on the first call to `stemming`.
# Re-reading the NLTK word list for every single word (as a list, with a
# linear membership scan) dominates the runtime when this is applied to
# hundreds of thousands of tweets.
_STOP_WORDS = None


def stemming(content):
    """Clean one piece of text and reduce its words to their Porter stems.

    Steps, in order:
      1. every character that is not an ASCII letter (a-z, A-Z) becomes a space;
      2. the text is lower-cased and split on whitespace;
      3. words found in NLTK's English stop-word list are dropped;
      4. each remaining word is stemmed with the shared `port_stem` instance.

    Parameters
    ----------
    content : str
        Raw text, e.g. the body of a tweet.

    Returns
    -------
    str
        The surviving stems joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Load the list once; set membership is O(1) per word.
        _STOP_WORDS = frozenset(stopwords.words('english'))

    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    # The stop-word test uses the lower-cased word *before* stemming.
    stems = [port_stem.stem(word) for word in words if word not in _STOP_WORDS]
    return ' '.join(stems)

In [49]:
# Add a `stemmed_content` column: `stemming` applied to the `text` of every row
Specific_Rows['stemmed_content'] = Specific_Rows['text'].apply(stemming)

In [53]:
# Spot-check 10 random rows to compare `text` with `stemmed_content`
Specific_Rows.sample(10)

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
597849,0,2219371120,Thu Jun 18 00:23:07 PDT 2009,NO_QUERY,clarissaharrisa,Feels like friday... But it isn't!,feel like friday
1585229,1,2190655392,Tue Jun 16 03:21:13 PDT 2009,NO_QUERY,alyi,@mabel1922 have a good day,mabel good day
447156,0,2068623654,Sun Jun 07 14:24:20 PDT 2009,NO_QUERY,kateweb,@misswiz http://twitpic.com/6up0q - Sorry Sara...,misswiz http twitpic com q sorri sarah gave re...
873893,1,1679805312,Sat May 02 10:13:13 PDT 2009,NO_QUERY,mandirao,it's derby day!! time to party down.,derbi day time parti
659859,0,2242329067,Fri Jun 19 12:03:21 PDT 2009,NO_QUERY,mahealanij,@jdubb4113.. they seperated us today..,jdubb seper us today
1251380,1,1996497498,Mon Jun 01 15:11:03 PDT 2009,NO_QUERY,kaidano,@adeator Ð¼Ð°ÑÐ³Ð°ÑÐ¸Ð½ - Ð½Ð°ÑÐµ Ð²Ñ?Ñ,adeat
447414,0,2068696640,Sun Jun 07 14:32:07 PDT 2009,NO_QUERY,KellyStutts,Photo: thedailywhat: Upcycle??? Blerg http://...,photo thedailywhat upcycl blerg http tumblr co...
1318095,1,2014338287,Wed Jun 03 01:11:58 PDT 2009,NO_QUERY,advera,It's 4 AM and I'm still awake. Late night watc...,still awak late night watch quot babi mama quo...
1116709,1,1973295398,Sat May 30 10:52:21 PDT 2009,NO_QUERY,katewinney,lovely shooping day got a nice new darling dre...,love shoop day got nice new darl dress new sum...
71541,0,1693990187,Mon May 04 00:35:42 PDT 2009,NO_QUERY,BasiaZoe,@iamjonathancook I can't fall asleep! Ahhh. It...,iamjonathancook fall asleep ahhh suck


## Separating data and labels

In [56]:
# Features: the stemmed tweet text, extracted as a NumPy array
X = Specific_Rows['stemmed_content'].values

In [58]:
# Labels: the sentiment target column, extracted as a NumPy array
Y = Specific_Rows['target'].values

In [62]:
# Peek at the label array
Y

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

## Splitting training and testing data

In [65]:
# Hold out 20% of the rows for testing. stratify=Y keeps the class proportions
# the same in both splits, and random_state=2 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [67]:
# Sizes of the full, training and test feature arrays
X.shape, X_train.shape, X_test.shape

((500000,), (400000,), (100000,))

## Feature Engineering

In [70]:
# Convert the textual data to numerical TF-IDF features
Vectorizer = TfidfVectorizer()

# Fit on the training text only, then reuse the fitted vectorizer on the test
# text so both matrices share the same vocabulary.
# NOTE: X_train / X_test are rebound from raw strings to the transformed
# matrices, so re-running this cell requires re-running the split cell first.
X_train = Vectorizer.fit_transform(X_train)
X_test = Vectorizer.transform(X_test)

In [76]:
# Show the vectorized training data; it prints as "(row, column)  weight" entries
print(X_train)

  (0, 75621)	0.8728863501974954
  (0, 171788)	0.3481980399654188
  (0, 159304)	0.34180044558650874
  (1, 205764)	0.9227704122338368
  (1, 48918)	0.385350186591099
  (2, 188106)	0.39052251077263345
  (2, 81891)	0.25012294170421795
  (2, 57035)	0.29193121990631304
  (2, 57575)	0.3277910998937665
  (2, 208535)	0.5846137017084795
  (2, 146385)	0.34242745487206755
  (2, 206261)	0.3650069837853254
  (3, 126984)	0.5156350659480482
  (3, 135047)	0.5736715555280697
  (3, 176845)	0.4542388449714272
  (3, 791)	0.35815362617287255
  (3, 169322)	0.26535726280055477
  (4, 36995)	0.2647957461952136
  (4, 71043)	0.6128602414366693
  (4, 60493)	0.4669761897770744
  (4, 87362)	0.2720184302465531
  (4, 100677)	0.2932958670013655
  (4, 175556)	0.2107675810177029
  (4, 181394)	0.21759952949768654
  (4, 57977)	0.18806870567074827
  :	:
  (399997, 137359)	0.1897627663868365
  (399997, 36087)	0.1920578377380143
  (399997, 27590)	0.24127801306666147
  (399997, 1732)	0.29857125133452234
  (399997, 58261)	0.2968

In [78]:
# Show the vectorized test data; it prints as "(row, column)  weight" entries
print(X_test)

  (0, 69255)	0.2329891748216294
  (0, 69734)	0.2572730944043701
  (0, 77577)	0.2830100548120052
  (0, 103983)	0.3585912293028293
  (0, 116932)	0.34995925389320603
  (0, 130457)	0.27721348653352856
  (0, 147204)	0.3503578101333757
  (0, 174201)	0.31792110210405994
  (0, 187747)	0.24920467248418815
  (0, 203639)	0.23673207352024733
  (0, 203894)	0.3599388507048128
  (1, 20033)	0.38859098038853324
  (1, 48918)	0.26564279450084816
  (1, 59683)	0.21491141701542965
  (1, 69255)	0.18721839021432282
  (1, 78220)	0.29039169926474406
  (1, 118909)	0.4297821402363101
  (1, 124108)	0.3406796516266077
  (1, 185403)	0.2151576655437813
  (1, 199583)	0.4791783701225989
  (1, 203639)	0.19022599548021984
  (2, 6429)	0.3764774668472164
  (2, 32130)	0.5362502829991824
  (2, 91460)	0.6096119157471427
  (2, 106927)	0.19001136519699197
  :	:
  (99997, 36933)	0.34014918588517584
  (99997, 57035)	0.15732397857258973
  (99997, 61247)	0.543969924463299
  (99997, 62497)	0.2469680928266285
  (99997, 64521)	0.15209

## Training the ML Model

In [81]:
# Logistic-regression classifier, allowed up to 1000 solver iterations
model = LogisticRegression(max_iter=1000)

In [83]:
# Train the classifier on the TF-IDF training features and their labels
model.fit(X_train, Y_train)

## Model Evaluation

In [86]:
# Accuracy on the training data: predict the rows the model was fitted on
# and compare against their true labels
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [88]:
# Report the fraction of training rows classified correctly
print("accuracy on training data:", training_data_accuracy)

accuracy on training data: 0.81025


In [90]:
# Accuracy on the held-out test data: predict the unseen rows and compare
# against their true labels
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [94]:
# Report the fraction of held-out test rows classified correctly
print("accuracy on test data:", testing_data_accuracy)

accuracy on test data: 0.77145


## Saving the trained model

In [97]:
import pickle

In [99]:
# Persist the trained classifier to disk. The `with` block guarantees the file
# handle is flushed and closed even if pickling raises.
# NOTE(review): only the classifier is saved here; scoring new raw text later
# would also need the fitted `Vectorizer`, which this cell does not persist.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

## Using the saved model for future predictions

In [104]:
# Load the saved model back from disk; the `with` block closes the file as
# soon as it has been read.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files that
# you created yourself or otherwise trust.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [108]:
# Take one already-vectorized test row (index 200) and show its true label
X_new = X_test[200]
print(Y_test[200])

1


In [110]:
# Classify the selected row with the reloaded model and report its sentiment
# (label 0 is negative; anything else is reported as positive)
prediction = loaded_model.predict(X_new)

print("Negative Tweet" if prediction[0] == 0 else "Positive Tweet")

Positive Tweet


In [112]:
# Repeat the check for test row 20: print its true label first, then the
# sentiment predicted by the reloaded model
X_new = X_test[20]
print(Y_test[20])

prediction = loaded_model.predict(X_new)

print("Negative Tweet" if prediction[0] == 0 else "Positive Tweet")

0
Negative Tweet
