In [2]:
pip install kaggle

Collecting kaggle
  Using cached kaggle-1.6.14-py3-none-any.whl
Installing collected packages: kaggle
Successfully installed kaggle-1.6.14
Note: you may need to restart the kernel to use updated packages.


In [3]:
! kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /Users/Eshita/Desktop/Stuff/Github.nosync/Machine-Learning/ML Projects/Twitter Sentiment Analysis
100%|██████████████████████████████████████| 80.9M/80.9M [00:13<00:00, 6.71MB/s]
100%|██████████████████████████████████████| 80.9M/80.9M [00:13<00:00, 6.10MB/s]


In [4]:
!unzip sentiment140.zip

Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [5]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Eshita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
columns = ['target', 'id', 'date', 'flag', 'user', 'text']
data = pd.read_csv('training.1600000.processed.noemoticon.csv', names = columns, encoding = 'ISO-8859-1')

In [8]:
data.shape

(1600000, 6)

In [9]:
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
data.tail()

Unnamed: 0,target,id,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


#### Data Pre - Processing

In [11]:
data.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [12]:
data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [13]:
# converting target value of 4 to 1
data.replace({'target': {4:1}}, inplace = True)

In [14]:
data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

##### Stemming

In [15]:
port_stem = PorterStemmer()

In [16]:
def stemming(content):
    stemmed_content = re.sub('^a-zA-Z', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english') ]
    stemmed_content = ' '.join(stemmed_content)

    return stemmed_content

In [17]:
data['stemmed_content'] = data['text'].apply(stemming) 

In [18]:
data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","@switchfoot http://twitpic.com/2y1zl - awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset can't updat facebook text it... might cr...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,@kenichan dive mani time ball. manag save 50% ...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","@nationwideclass no, behav all. i'm mad. here?..."


In [19]:
X = data['stemmed_content'].values
Y = data['target'].values

In [20]:
print(X)

["@switchfoot http://twitpic.com/2y1zl - awww, that' bummer. shoulda got david carr third day it. ;d"
 "upset can't updat facebook text it... might cri result school today also. blah!"
 '@kenichan dive mani time ball. manag save 50% rest go bound' ...
 'readi mojo makeover? ask detail'
 'happi 38th birthday boo alll time!!! tupac amaru shakur'
 'happi #charitytuesday @thenspcc @sparkschar @speakinguph4h']


In [21]:
print(Y)

[0 0 0 ... 1 1 1]


##### Tfidf-Vectorizer

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify = Y, random_state = 1)

In [23]:
print(X_train)

["rise shine lol. i'm min later plan rush i'll door time"
 '2nd interview today. look promising.' '@emilyalbracht feel pain!' ...
 '@bookwitt welcom chang mind though let know'
 '@howcoza bet bring backup!'
 'window linux box, instal bsd appl ipod. yeah.']


In [24]:
# converting textual data to numeric data
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [25]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [26]:
print(X_train[5])

  (0, 492272)	0.4260417552558489
  (0, 292078)	0.23334836938918666
  (0, 535426)	0.39342130901245326
  (0, 489415)	0.2954537576326285
  (0, 82581)	0.32240803358749187
  (0, 311535)	0.2773638880015485
  (0, 457691)	0.21529257653965606
  (0, 365644)	0.21587790850491576
  (0, 517407)	0.49812837764474077


#### Sentiment Analysis Using Logistic Regression

In [27]:
clf = LogisticRegression()

In [28]:
clf.fit(X_train, Y_train)

In [29]:
y_train_pred = clf.predict(X_train)
print('Accuracy score on training data:', accuracy_score(Y_train, y_train_pred))

Accuracy score on training data: 0.80768125


In [30]:
y_test_pred = clf.predict(X_test)
print('Accuracy score on testing data:', accuracy_score(Y_test, y_test_pred))

Accuracy score on testing data: 0.78173125


#### Saving the trained model

In [31]:
import pickle

filename = 'trained_model.sav'
pickle.dump(clf, open(filename, 'wb'))

#### Using the pre-trained model

In [32]:
pre_trained = pickle.load(open('trained_model.sav', 'rb'))

In [33]:
x_test = X_test[5]
y_test = Y_test[5]

In [34]:
print(y_test)

0


In [35]:
pred = pre_trained.predict(x_test)
if(pred[0] == 0):
    print("Negative Tweet")
else:
    print("Positive Tweet")

Negative Tweet


#### Sentiment Analysis Using Logistic Regression Using Naive Bayes

In [36]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, Y_train)

In [37]:
y_pred = nb.predict(X_test)

In [38]:
print('Accuracy score on testing data:', accuracy_score(Y_test, y_pred))

Accuracy score on testing data: 0.758965625
