In [174]:
import numpy as np
import pandas as pd
import re
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [175]:
from datasets import load_dataset

# Load the sentiment split from TweetEval
dataset = load_dataset("tweet_eval", "sentiment")

# Convert to pandas DataFrame
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()
val_df = dataset['validation'].to_pandas()

# Combine all splits if you want more data
twitter_data = pd.concat([train_df, test_df, val_df], ignore_index=True)

# Map labels to text for clarity (optional)
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
twitter_data['label_text'] = twitter_data['label'].map(label_map)

# Rename columns for compatibility with your code
twitter_data = twitter_data.rename(columns={"text": "text", "label": "target"})


In [176]:
# # extracting the compressed dataset
# from zipfile import ZipFile
# dataset = './sentiment140.zip'

# with ZipFile(dataset,'r') as zip:
#   zip.extractall()
#   print('The dataset is extracted')

Importing the Dependencies

Data Processing

In [177]:
# loading the data from csv file to pandas dataframe
# twitter_data = pd.read_csv('./training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [178]:
# checking the number of rows and columns
twitter_data.shape

(59899, 3)

In [179]:
# printing the first 5 rows of the dataframe
twitter_data.head()

Unnamed: 0,text,target,label_text
0,"""QT @user In the original draft of the 7th boo...",2,Positive
1,"""Ben Smith / Smith (concussion) remains out of...",1,Neutral
2,Sorry bout the stream last night I crashed out...,1,Neutral
3,Chase Headley's RBI double in the 8th inning o...,1,Neutral
4,@user Alciato: Bee will invest 150 million in ...,2,Positive


In [180]:
# naming the columns and reading the datasets again
# column_names = ['target', 'id', 'data', 'flag', 'user', 'text']
# twitter_data = pd.read_csv('./training.1600000.processed.noemoticon.csv', names=column_names, encoding='ISO-8859-1')

In [181]:
# checking the number of rows and columns
twitter_data.shape

(59899, 3)

Convert the target "4" to "1"

In [182]:
# twitter_data['target'] = twitter_data['target'].replace(4,1)

In [183]:
# checking the distribution of target column
twitter_data['target'].value_counts()

target
1    27479
2    21043
0    11377
Name: count, dtype: int64

0 --> Negative Tweet

1 --> Positive Tweet

**Stemming**

Stemming is a process of reducing a word to it's Root word

example: actor, actress, acting = act

In [184]:
port_stem = PorterStemmer()

In [185]:
def stemming(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

Downsample

In [186]:
twitter_data = twitter_data.sample(frac=1, random_state=42).reset_index(drop=True)  # optional shuffle


In [187]:

nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /home/jiren/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [188]:
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming)

In [189]:
twitter_data.head()

Unnamed: 0,text,target,label_text,stemmed_content
0,@user @user BJP may be liking Rahul Gandhi now...,2,Positive,user user bjp may like rahul gandhi rahul spea...
1,The guy who let me borrow his Ohio State bag F...,2,Positive,guy let borrow ohio state bag friday night sai...
2,@user @user will NEVER but the sun Jo\u002c ne...,0,Negative,user user never sun jo u c never even guarante...
3,Friday post-game IHOP is having their 2nd annu...,1,Neutral,friday post game ihop nd annual pancak bonanza...
4,@user For when your working day eventually end...,2,Positive,user work day eventu end david bowi speed life...


In [190]:
print(twitter_data['stemmed_content'])

0        user user bjp may like rahul gandhi rahul spea...
1        guy let borrow ohio state bag friday night sai...
2        user user never sun jo u c never even guarante...
3        friday post game ihop nd annual pancak bonanza...
4        user work day eventu end david bowi speed life...
                               ...                        
59894    user user user alreadi fund black live matter ...
59895    talkin god sunday allah monday wonder meet bud...
59896    st amend ignor new feder right marriag equal m...
59897    usopen venu william middl name eboni starr las...
59898                                         stupid bitch
Name: stemmed_content, Length: 59899, dtype: object


In [191]:
print(twitter_data['target'])

0        2
1        2
2        0
3        1
4        2
        ..
59894    0
59895    1
59896    0
59897    1
59898    0
Name: target, Length: 59899, dtype: int64


In [192]:
# separating the data and label
X = twitter_data['stemmed_content'].values
Y = twitter_data['target'].values

In [193]:
print(X)

['user user bjp may like rahul gandhi rahul speak bjp gain'
 'guy let borrow ohio state bag friday night said keep saweeeeet good day liz'
 'user user never sun jo u c never even guarante date dave grohl even' ...
 'st amend ignor new feder right marriag equal made scotu marriag'
 'usopen venu william middl name eboni starr last major wimbledon serena th time major tie w henin amp goolagong'
 'stupid bitch']


In [194]:
print(Y)

[2 2 0 ... 0 1 0]


Splitting the data into training data and test data

In [195]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [196]:
print(X.shape, X_train.shape, X_test.shape)

(59899,) (47919,) (11980,)


In [197]:
print(X_test)

['final th attempt anderson win second set serv th ace match lead set love murray'
 'user user askchrishemsworth know chri evan may enjoy diet captain america stronger thor'
 'hey user wait sharknado tomorrow night name charact woo hoo user garymartinhay'
 ...
 'melania trump copi michel obama speech word word plagiar research paper get kick univers'
 'rt user curti painter total go get lucki tonight probabl anoth girl look like'
 'semi us nation cup red star keep suspens th c mon star pic']


In [198]:
# converting the textual data to numerical data

vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)

# Save the vectorizer to 'vectorizer.pkl'
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [199]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 507635 stored elements and shape (47919, 34813)>
  Coords	Values
  (0, 10909)	0.5202757800916352
  (0, 19816)	0.404976815419377
  (0, 22724)	0.3742597854815933
  (0, 30148)	0.43993535163915437
  (0, 33150)	0.4813453967498036
  (1, 4540)	0.38782462123566913
  (1, 8666)	0.3038138012428354
  (1, 10040)	0.42050897008017596
  (1, 12035)	0.28181109403337506
  (1, 12935)	0.2934092844289215
  (1, 18088)	0.3786620336864111
  (1, 19888)	0.2507712151028447
  (1, 21008)	0.2719587280546141
  (1, 24872)	0.22420437741325533
  (1, 29877)	0.2906108253566681
  (2, 4568)	0.2911090890901289
  (2, 6956)	0.3116404350198698
  (2, 8689)	0.3380036406575038
  (2, 17266)	0.26367012991483235
  (2, 18669)	0.2627292212812458
  (2, 19101)	0.257158089319805
  (2, 20659)	0.26463682004956773
  (2, 22494)	0.27226560003323924
  (2, 23362)	0.29795929824288747
  (2, 25400)	0.3278536467391181
  :	:
  (47916, 10799)	0.2781710892917679
  (47916, 11827)	0.3392512625

In [200]:
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 121774 stored elements and shape (11980, 34813)>
  Coords	Values
  (0, 181)	0.3284348081838193
  (0, 1064)	0.2909365064505308
  (0, 1768)	0.3063967632899473
  (0, 10454)	0.21945013624281928
  (0, 17195)	0.2361569387910884
  (0, 17993)	0.18353549157633883
  (0, 18821)	0.24820462406673183
  (0, 20337)	0.2485014369749943
  (0, 26794)	0.2643640171789289
  (0, 26970)	0.29008978114413025
  (0, 26978)	0.43412605859742154
  (0, 30205)	0.26638848538530496
  (0, 33701)	0.19790562533576447
  (1, 946)	0.3004440800940181
  (1, 4644)	0.36296503342633
  (1, 5465)	0.276732659577214
  (1, 7895)	0.39467799957902727
  (1, 9334)	0.2847156731586759
  (1, 9638)	0.3128002166952255
  (1, 16642)	0.21112050576383776
  (1, 18896)	0.14901114534740886
  (1, 29077)	0.4121153472180534
  (1, 30546)	0.2979067243258206
  (1, 32272)	0.20338976303688655
  (2, 5191)	0.3751740872235123
  :	:
  (11978, 1180)	0.2723141515847857
  (11978, 6973)	0.3042687193225553
 

Training the Machine Learning Model

Logistic Regression

In [201]:
model = LogisticRegression(max_iter=1000, class_weight='balanced', solver='saga')
model.fit(X_train, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'saga'
,max_iter,1000


Model Evaluation

Accuracy Score

In [202]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [203]:
print('Accuracy on the training data: ', training_data_accuracy)

Accuracy on the training data:  0.7731171351655919


In [204]:
X_test_prediction = model.predict(X_test)
training_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [205]:
print('Accuracy on the test data: ', training_data_accuracy)

Accuracy on the test data:  0.6417362270450752


Saving the trained model

In [206]:
filename = 'trained_model.sav'

In [207]:
import pickle

Using the saved model for future predictions

In [208]:
pickle.dump(model, open(filename, 'wb'))

Using the saved model for future predictions

In [209]:
# Loading the saved model
loaded_model = pickle.load(open(filename, 'rb'))

In [210]:
X_new = X_test[3]
print(Y_test[3])

prediction = loaded_model.predict(X_new)
print(prediction)

if (prediction[0]==2):
  print('Positive')
elif (prediction[0]==1):
  print('Neutral')
else:
  print('Negative')

1
[2]
Positive


In [222]:
# 🔁 Run this cell after your model and vectorizer are trained/loaded

import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Initialize stemmer
port_stem = PorterStemmer()

stop_words = set(stopwords.words('english')) - {'not', 'no', "don't", "isn't", "wasn't", "aren't", "weren't", "won't", "wouldn't", "shouldn't", "couldn't", "doesn't", "didn't", "can't"}
# Function to predict sentiment
def predict_sentiment(input_text):
    # Preprocess
    stemmed_content = re.sub('[^a-zA-Z]', ' ', input_text)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stop_words]
    final_text = ' '.join(stemmed_content)

    # Vectorize and predict
    vectorized_input = vectorizer.transform([final_text])
    prediction = loaded_model.predict(vectorized_input)

    # Result
    if prediction[0] == 2:
        sentiment = "✅ Positive"
    elif prediction[0] == 1:
        sentiment = "😐 Neutral"
    else:
        sentiment = "❌ Negative"
    print(f"Input: {input_text}")
    print(f"Predicted Sentiment: {sentiment}")

# 🧪 Example usage
predict_sentiment("he lives in a newyork city")
predict_sentiment("i play football")
predict_sentiment("i hate this movie")
predict_sentiment("I am married")



Input: he lives in a newyork city
Predicted Sentiment: 😐 Neutral
Input: i play football
Predicted Sentiment: 😐 Neutral
Input: i hate this movie
Predicted Sentiment: ❌ Negative
Input: I am married
Predicted Sentiment: ❌ Negative


In [212]:
twitter_data['target'].value_counts()


target
1    27479
2    21043
0    11377
Name: count, dtype: int64