In [192]:
# extracting the compressed dataset
from zipfile import ZipFile
dataset = './sentiment140.zip'

# Unpack every member of the archive into the current working directory.
# The handle is called `archive` so the built-in zip() is not shadowed for
# the rest of the kernel session.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


Importing the Dependencies

In [193]:
import numpy as np
import pandas as pd
import re
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Processing

In [194]:
# loading the data from csv file to pandas dataframe
# NOTE: no column names are passed here, so pandas uses the first line of the
# file as the header (see the head() output below, where a tweet appears as the
# column labels). The file is read again with explicit names further down.
twitter_data = pd.read_csv('./training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [195]:
# checking the number of rows and columns
twitter_data.shape

(1599999, 6)

In [196]:
# printing the first 5 rows of the dataframe
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [197]:
# The CSV has no header row, so supply the column names explicitly and
# read the dataset again.
column_names = ['target', 'id', 'data', 'flag', 'user', 'text']
twitter_data = pd.read_csv(
    './training.1600000.processed.noemoticon.csv',
    names=column_names,
    encoding='ISO-8859-1',
)

In [198]:
# checking the number of rows and columns
twitter_data.shape

(1600000, 6)

Convert the target "4" to "1"

In [199]:
# Relabel the positive class from 4 to 1 so the targets are 0 / 1.
twitter_data['target'] = twitter_data['target'].replace(to_replace=4, value=1)

In [200]:
# checking the distribution of target column
twitter_data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

0 --> Negative Tweet

1 --> Positive Tweet

**Stemming**

Stemming is the process of reducing a word to its root word

example: actor, actress, acting = act

In [201]:
port_stem = PorterStemmer()

In [202]:
# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTER = re.compile('[^a-zA-Z]')

# Cache for the English stop words; filled on first use (see below).
_STOP_WORDS = None


def _english_stop_words():
  """Return the NLTK English stop words as a set, loading them only once.

  Loading is deferred to the first call because the stopwords corpus is
  downloaded in a later cell than the one defining this function.
  """
  global _STOP_WORDS
  if _STOP_WORDS is None:
    _STOP_WORDS = frozenset(stopwords.words('english'))
  return _STOP_WORDS


def stemming(content):
  """Clean a tweet and reduce its words to Porter stems.

  Steps: replace every non-letter character with a space, lower-case the
  text, split on whitespace, drop English stop words, stem each remaining
  word with `port_stem`, and join the stems with single spaces.

  Args:
    content: the raw tweet text (a string).

  Returns:
    The cleaned, stemmed text as a single space-separated string.
  """
  # Look the stop words up in a cached set instead of calling
  # stopwords.words('english') for every single token.
  stop_words = _english_stop_words()
  words = _NON_LETTER.sub(' ', content).lower().split()
  return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

Shuffle the dataset (no rows are dropped)

In [203]:
# optional shuffle: frac=1 samples every row, the fixed seed makes the order
# reproducible, and the index is renumbered from 0
twitter_data = (
    twitter_data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [204]:

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /home/jiren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [None]:
twitter_data.head()

Unnamed: 0,target,id,data,flag,user,text,stemmed_content
0,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!,chrishasboob ahhh hope ok
1,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo...",misstoriblack cool tweet app razr
2,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...,tiannachao know famili drama lame hey next tim...
3,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...,school email open geographi stuff revis stupid...
4,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem,upper airway problem


In [None]:
print(twitter_data['stemmed_content'])

0                                  chrishasboob ahhh hope ok
1                          misstoriblack cool tweet app razr
2          tiannachao know famili drama lame hey next tim...
3          school email open geographi stuff revis stupid...
4                                       upper airway problem
                                 ...                        
1599995                   song middl chang want born arghhhh
1599996                              officialnjona good luck
1599997                        proudgamertweet rather averag
1599998    pickin misstinayao waitin sadittysash hurri od...
1599999                home studi math wooot im go fail shit
Name: stemmed_content, Length: 1600000, dtype: object


In [None]:
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    0
1599996    1
1599997    0
1599998    0
1599999    0
Name: target, Length: 1600000, dtype: int64


In [None]:
# separating the data and label
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [None]:
print(X)

['chrishasboob ahhh hope ok' 'misstoriblack cool tweet app razr'
 'tiannachao know famili drama lame hey next time u hang kim n u guy like sleepov whatev ill call u'
 ... 'proudgamertweet rather averag'
 'pickin misstinayao waitin sadittysash hurri odeee miss dem tabl talk nite lol bout fat'
 'home studi math wooot im go fail shit']


In [None]:
print(Y)

[0 0 0 ... 0 0 0]


Splitting the data into training data and test data

In [None]:
# Hold out 20% of the samples for testing; the split is stratified on the
# labels and seeded so it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [None]:
print(X_test)

['still feel wide awak hour sleep best get work'
 'dear santa pleas pretti beach new board christma thankyou sign best behav person ever'
 'pack ugh dont wanna get morn' ...
 'catep lol know wine lol could interest'
 'dragynfir wish feel better hang thing improv'
 'ultragrrrl rule got http bit ly nvj sent ton extra like horror cd etc check mix cd']


In [None]:
# converting the textual data to numerical data

# Fit the TF-IDF vectorizer on the training split only
# (fit() returns the fitted vectorizer itself).
vectorizer = TfidfVectorizer().fit(X_train)

# Save the vectorizer to 'vectorizer.pkl'
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Replace both text splits with their TF-IDF matrices.
X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [None]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9454833 stored elements and shape (1280000, 460969)>
  Coords	Values
  (0, 93611)	0.19712670050214007
  (0, 149395)	0.18447183478516588
  (0, 241563)	0.2110624397412386
  (0, 278353)	0.25164976600669997
  (0, 298109)	0.35166904553199646
  (0, 304569)	0.3394405182935559
  (0, 317619)	0.286116662289161
  (0, 398223)	0.5525673624700459
  (0, 410848)	0.21915516636920856
  (0, 442499)	0.3809499224592277
  (1, 39402)	0.31334315133391644
  (1, 99994)	0.8051932339421155
  (1, 295381)	0.503467899201099
  (2, 93611)	0.13636640902264432
  (2, 107108)	0.2866934526031088
  (2, 150374)	0.1416088249745466
  (2, 161348)	0.38074949920571793
  (2, 243387)	0.22603114807663133
  (2, 248660)	0.16855806711857987
  (2, 279385)	0.31539044541973377
  (2, 365642)	0.28790372922513674
  (2, 376214)	0.18615943842718097
  (2, 406612)	0.189414744917268
  (2, 416570)	0.5226849566984246
  (2, 419932)	0.3285303653209985
  :	:
  (1279997, 216052)	0.2772284660

In [None]:
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2286864 stored elements and shape (320000, 460969)>
  Coords	Values
  (0, 28719)	0.41346846624242845
  (0, 39402)	0.3372797545292648
  (0, 129790)	0.2694803233409314
  (0, 145560)	0.22402381445647274
  (0, 169874)	0.3234348698066208
  (0, 370943)	0.2982134461887321
  (0, 383974)	0.2785622341788851
  (0, 441380)	0.5143275079321552
  (0, 445147)	0.2383459718040832
  (1, 35308)	0.24255928403231725
  (1, 37217)	0.35932268326187655
  (1, 39402)	0.20514228463166576
  (1, 46460)	0.2912400462251187
  (1, 72514)	0.33633719047170857
  (1, 94997)	0.25909048431053644
  (1, 124173)	0.21349011254283942
  (1, 287743)	0.1712869670823202
  (1, 312548)	0.2406195451927508
  (1, 317864)	0.20594571459549646
  (1, 322467)	0.21466997142566333
  (1, 351722)	0.3424841363946732
  (1, 367011)	0.2558532778923624
  (1, 400079)	0.31753639246933385
  (2, 106301)	0.4064993724992703
  (2, 145560)	0.27420339255295056
  :	:
  (319997, 220169)	0.23575953444561

Training the Machine Learning Model

Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the TF-IDF training matrix;
# max_iter=100 caps the number of solver iterations.
model = LogisticRegression(max_iter=100)
# Kept as the last expression so the notebook displays the fitted estimator.
model.fit(X_train, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


Model Evaluation

Accuracy Score

In [None]:
# Score the model on the same data it was fitted on.
X_train_prediction = model.predict(X_train)
# accuracy_score expects (y_true, y_pred): ground-truth labels go first.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
print('Accuracy on the training data: ', training_data_accuracy)

Accuracy on the training data:  0.789184375


In [None]:
# Score the model on the held-out test split.
X_test_prediction = model.predict(X_test)
# Stored under its own name so the training accuracy computed earlier is not
# overwritten; accuracy_score expects (y_true, y_pred).
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on the test data: ', test_data_accuracy)

Accuracy on the test data:  0.77674375


Saving the trained model

In [None]:
filename = 'trained_model.sav'

In [None]:
import pickle

Writing the trained model to disk with pickle

In [None]:
# Serialize the trained model; the context manager closes the file handle
# even if pickling raises.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Using the saved model for future predictions

In [None]:
# Loading the saved model
# NOTE: unpickling can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Sanity check: classify one held-out tweet and compare with its true label.
X_new = X_test[3]
print(Y_test[3])

prediction = loaded_model.predict(X_new)
print(prediction)

# The target is binary after the 4 -> 1 remap: 0 = negative, 1 = positive.
# There is no neutral class, so label 1 must be reported as positive.
if prediction[0] == 0:
  print('Negative Tweet')
else:
  print('Positive Tweet')

0
[0]
Negative Tweet


In [None]:
# 🔁 Run this cell after your model and vectorizer are trained/loaded

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Initialize stemmer
port_stem = PorterStemmer()

# Function to predict sentiment
def predict_sentiment(input_text):
    """Print the predicted sentiment of a raw piece of text.

    The text is cleaned the same way as the training data (non-letters
    replaced by spaces, lower-cased, English stop words removed, Porter
    stemming), vectorized with the notebook-level `vectorizer`, and
    classified by the notebook-level `loaded_model`.

    Args:
        input_text: the raw text to classify (a string).

    Returns:
        None. The input and the predicted label are printed.
    """
    # Build the stop-word set once per call instead of calling
    # stopwords.words('english') again for every token.
    stop_words = set(stopwords.words('english'))

    # Preprocess
    words = re.sub('[^a-zA-Z]', ' ', input_text).lower().split()
    final_text = ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

    # Vectorize and predict
    vectorized_input = vectorizer.transform([final_text])
    prediction = loaded_model.predict(vectorized_input)

    # Result: label 1 is positive, anything else is reported as negative
    sentiment = "✅ Positive" if prediction[0] == 1 else "❌ Negative"
    print(f"Input: {input_text}")
    print(f"Predicted Sentiment: {sentiment}")

# 🧪 Example usage
predict_sentiment("I liked this product")
predict_sentiment("This is the worst experience ever")



Input: I liked this product
Predicted Sentiment: ✅ Positive
Input: This is the worst experience ever
Predicted Sentiment: ❌ Negative
