In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression



In [2]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [4]:
dataset = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1')

In [5]:
dataset.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [6]:
col_names = ['target', 'id', 'date', 'flag', 'user', 'text']
dataset.columns = col_names
dataset.head()


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [7]:
dataset.shape


(1599999, 6)

In [8]:
dataset.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [9]:
dataset['target'].value_counts()

target
4    800000
0    799999
Name: count, dtype: int64

In [10]:
dataset['target'] = dataset['target'].map({4:1,0:0})


In [11]:
dataset['target'].value_counts()

target
1    800000
0    799999
Name: count, dtype: int64

In [12]:
# Stemming

stremmer = PorterStemmer()
stop_words = set(stopwords.words('english'))  # ✅ Load stopwords once

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [stremmer.stem(word) for word in stemmed_content if word not in stop_words]  # ✅ use preloaded stop_words
    return ' '.join(stemmed_content)


In [13]:
dataset['text'] = dataset['text'].apply(stemming)

In [14]:
dataset.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset updat facebook text might cri result sch...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan dive mani time ball manag save rest g...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass behav mad see
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,kwesidei whole crew


In [15]:
x = dataset['text']
y = dataset['target']

In [16]:
# splitting the dataset
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 , random_state = 0)

In [17]:
# convert textual data to numerical data
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [18]:
print(x_train)

  (0, 145591)	0.48328892862950174
  (0, 384310)	0.38648598535906226
  (0, 160355)	0.18966194768681632
  (0, 422796)	0.4213995220282958
  (0, 246262)	0.516206150117446
  (0, 393595)	0.18633353695642413
  (0, 149660)	0.12602103676347354
  (0, 150562)	0.187752051036393
  (0, 443991)	0.22625223143666687
  (1, 172128)	0.6067414559564506
  (1, 418051)	0.7948992424350689
  (2, 406965)	0.6931768888241752
  (2, 275790)	0.3769717187165907
  (2, 290673)	0.24841016587340456
  (2, 150650)	0.20986098127991223
  (2, 42279)	0.5211994648067829
  (3, 175231)	0.30748407834013664
  (3, 89478)	0.5137960384023271
  (3, 135304)	0.18399221471225605
  (3, 292469)	0.3352332134067401
  (3, 399931)	0.21912347276618377
  (3, 317428)	0.5137960384023271
  (3, 175234)	0.4280552121498152
  (4, 408579)	0.14704998873675024
  (4, 300289)	0.2058593651486058
  :	:
  (1279995, 101591)	0.8081360486674279
  (1279995, 248952)	0.5889958631808858
  (1279996, 277402)	0.6930282733228941
  (1279996, 133848)	0.34541074396262944
  (1

In [19]:
# Training the model
model = LogisticRegression()
model.fit(x_train , y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# Testing the model
y_pred = model.predict(x_test)
print(accuracy_score(y_test , y_pred))

0.775875


In [33]:
# Function to predict the sentiment
def predict_sentiment(text):
    text = re.sub('[^a-zA-Z]', ' ', text)  # removing non-alphabet characters
    text = text.lower()
    text = text.split()
    
    # ✅ START: optimized stopwords usage with preloaded list
    text = [stremmer.stem(word) for word in text if word not in stop_words]
    # ✅ END: optimized stopwords usage
    
    text = ' '.join(text)
    text = [text]
    text = vectorizer.transform(text)   
    sentiment = model.predict(text)
    if sentiment == 0:
        return "Negative"
    else:
        return "Positive"


In [34]:
# Testing the model
print(predict_sentiment("I hate you"))
print(predict_sentiment("I love you"))

Negative
Positive


In [35]:
# Save the model
import pickle
pickle.dump(model , open('model.pkl' , 'wb'))

In [36]:
pickle.dump(vectorizer , open('vectorizer.pkl' , 'wb'))