## Files and Libraries

In [1]:
! pip install kaggle



In [2]:
!mkdir -p ~/ .kaggle
!cp kaggle.json ~/ .kaggle
!chmod 600 ~/ .kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
cp: -r not specified; omitting directory '/root/'
chmod: cannot access '.kaggle/kaggle.json': No such file or directory


In [3]:
# API to fetch the data set from kaggle

!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 94% 76.0M/80.9M [00:00<00:00, 188MB/s]
100% 80.9M/80.9M [00:00<00:00, 160MB/s]


In [4]:
# extracting the zip file

from zipfile import ZipFile
file_name = "/content/sentiment140.zip"

with ZipFile(file_name, 'r') as zip:
  zip.extractall()
  print('done')

done


In [5]:
# importing libraries

import pandas as pd   # data manipulation and analysis
import numpy as np    # for numerical operation on arrays
import re  # for removing unwanted characters, symbols, or patterns
from nltk.corpus import stopwords  # identifying stopwords
from nltk.stem import PorterStemmer   # for stemming, which reduces words to their base or root form (e.g., "running" → "run")
from sklearn.feature_extraction.text import TfidfVectorizer   # for converting text data into numerical vectors
from sklearn.model_selection import train_test_split    # for model training by dividing the data into training and testing
from sklearn.linear_model import LogisticRegression    # for classification tasks
from sklearn.metrics import accuracy_score    # for testing the accuracy of the model

In [6]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# printing the stopwords in english
# stopwords are those words which do not contribute to the sentiment analysis or in other words 'redundant words'
# we will remove these stopwords from our twitter(X) data

print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Data Processing

In [8]:
# loading the dataset
columns = ['target','id','date','flag','user','text']
df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv' , names = columns , encoding = 'ISO-8859-1')

In [9]:
df.shape

(1600000, 6)

In [10]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [11]:
# missing values

df.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


very good dataset :)

In [12]:
# distribution of target column

df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


:))

In [13]:
# coverting target label 4 ---> 1

df['target'] = df['target'].replace({4:1})

In [14]:
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


0 ---> negative ; 1 ---> positive

In [15]:
# stemming: reducing to root word (eg: walking, walked, was walking --> walk)

stemmer = PorterStemmer()

In [16]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)  # removing numbers , symbols , whitespaces etc
  stemmed_content = stemmed_content.lower()   # coverting to lowercase
  stemmed_content = stemmed_content.split()   # storing each word in a list
  stemmed_content = [stemmer.stem(word) for word in stemmed_content if not word in stopwords.words('english')]   # stemming
  stemmed_content = ' '.join(stemmed_content)   # joining each word with a space

  return stemmed_content

In [17]:
df['stemmed_content'] = df['text'].apply(stemming)  # 37mins :)

In [19]:
df['stemmed_content']

Unnamed: 0,stemmed_content
0,switchfoot http twitpic com zl awww bummer sho...
1,upset updat facebook text might cri result sch...
2,kenichan dive mani time ball manag save rest g...
3,whole bodi feel itchi like fire
4,nationwideclass behav mad see
...,...
1599995,woke school best feel ever
1599996,thewdb com cool hear old walt interview http b...
1599997,readi mojo makeov ask detail
1599998,happi th birthday boo alll time tupac amaru sh...


In [20]:
df['target']

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
1599995,1
1599996,1
1599997,1
1599998,1


In [21]:
# we only need target and stemmed_content others are redundant

X = df['stemmed_content'].values
Y = df['target'].values

## MODEL

In [26]:
# train_test_split

X_train , X_test , Y_train , Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=42)

In [27]:
print(X_train)

['paisleypaisley lol get idea far advanc even june yet need third knitter summer group'
 'worst headach ever'
 'ewaniesciuszko sad wont see miss alreadi yeah perfect come back th' ...
 'got home meet talk endlessli one coolest guy ever met smile'
 'bought chocol bar quot win free bar quot label win either'
 'misecia said hope dm email sunday']


In [28]:
print(X_test)

['stm denali ye black red fav color realli want color def look awesom jare'
 'qu buy open hous weekend pm best valu one bedroom lic long island citi bd http tinyurl com pt nqd'
 'ginoandfran fran greet air okay hahahaha thank' ...
 'la brat follow also hope atleast get also wish get well soon'
 'feel like decent swell sinc last fall hope wave myrtl beach week either least golf'
 'relaxin busi day']


In [29]:
# coverting textdata to numerical using vector

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [30]:
print(X_train)

  (0, 307108)	0.46206048815324474
  (0, 239679)	0.15130037108228483
  (0, 146067)	0.12929728405657018
  (0, 175252)	0.224070805470346
  (0, 128605)	0.22108856600702773
  (0, 4832)	0.317074267861159
  (0, 124524)	0.18318401951949756
  (0, 205794)	0.24140229063801746
  (0, 454381)	0.20169626473577715
  (0, 286478)	0.16123218610004272
  (0, 406297)	0.2978221095272138
  (0, 220296)	0.43015677907624866
  (0, 388138)	0.20555120011808467
  (0, 154767)	0.26976607043258233
  (1, 445870)	0.6361096685891185
  (1, 161801)	0.5778049407933611
  (1, 124611)	0.5113765148324884
  (2, 125319)	0.6383069130836649
  (2, 349409)	0.22232944888223494
  (2, 444761)	0.30331529032956345
  (2, 358186)	0.19837942712286838
  (2, 267649)	0.19309660201644555
  (2, 12436)	0.2529872032123258
  (2, 453420)	0.2347069337186747
  (2, 312657)	0.3154702974657607
  :	:
  (1279997, 124611)	0.2537781914144255
  (1279997, 301683)	0.1908678391932645
  (1279997, 168646)	0.20490659397970187
  (1279997, 156392)	0.23570311036994007
 

In [31]:
print(X_test)

  (0, 28874)	0.1778395103911245
  (0, 43712)	0.23562815302828183
  (0, 78636)	0.5158100011206617
  (0, 96399)	0.255967788489452
  (0, 97585)	0.4019235611854435
  (0, 129417)	0.25650960779862714
  (0, 189057)	0.31324918577405797
  (0, 240451)	0.15341308097014625
  (0, 334643)	0.14719329779308424
  (0, 335577)	0.22602158147814247
  (0, 384697)	0.3281164007446601
  (0, 435956)	0.14183025329879742
  (0, 453357)	0.1781708363247895
  (1, 35118)	0.3128685670821343
  (1, 36669)	0.2729958846742327
  (1, 39445)	0.16867093960211466
  (1, 57115)	0.19043301054662504
  (1, 74274)	0.21148120876702692
  (1, 78790)	0.13386322067407883
  (1, 170374)	0.17525273735418329
  (1, 171245)	0.12468774856570086
  (1, 183279)	0.24586158827112847
  (1, 233854)	0.3852709938491561
  (1, 240223)	0.1674195650536303
  (1, 301683)	0.13212235134015302
  :	:
  (319997, 135536)	0.21809964977532384
  (319997, 146067)	0.31485892246365776
  (319997, 169469)	0.1945044482645627
  (319997, 225096)	0.2933179603743362
  (319997, 3

In [32]:
model = LogisticRegression(max_iter=1000)

In [33]:
model.fit(X_train , Y_train)

In [39]:
# evaluating the accuracy on training data:

X_train_prediction = model.predict(X_train)
train_accuracy = accuracy_score(X_train_prediction , Y_train)

In [40]:
print("Accuracy on trainig data : ",train_accuracy)

Accuracy on trainig data :  0.7999984375


In [41]:
# evaluating the accuracy on test data:

X_test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(X_test_prediction , Y_test)

In [42]:
print("Accuracy on trainig data : ",train_accuracy)

Accuracy on trainig data :  0.7999984375


crazy good model
accuracy: 79.99%

## Saving the model

In [43]:
import pickle

In [44]:
filename = 'sentiment_model.sav'
pickle.dump(model , open(filename,'wb'))

## Use the saved model

In [45]:
saved_model = pickle.load(open('/content/sentiment_model.sav' , 'rb'))

In [48]:
# correct prediction

X_new = X_test[100]
print(Y_test[100])

prediction = saved_model.predict(X_new)
print(prediction)

if prediction[0] == 0:
  print("Negative")
else:
  print("Positive")

1
[1]
Positive


In [49]:
#incorrect prediction

X_new = X_test[0]
print(Y_test[0])

prediction = saved_model.predict(X_new)
print(prediction)

if prediction[0] == 0:
  print("Negative")
else:
  print("Positive")

0
[1]
Positive
