In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [69]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [70]:
stop_words = stopwords.words('english')

In [71]:
# List of common negative words
negative_words_set = {
    "no", "not", "nor", "never", "neither", "nobody", "none", "nowhere",
    "don't", "doesn't", "didn't", "can't", "couldn't", "won't", "wouldn't",
    "shouldn't", "isn't", "aren't", "wasn't", "weren't", "hasn't", "haven't",
    "hadn't", "mustn't", "mightn't", "needn't", "shan't"
}

# Extract non-negative stop words
non_negative_stopwords = [word for word in stop_words if word not in negative_words_set]

print(non_negative_stopwords)
print("Total non-negative stop words:", len(non_negative_stopwords))


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', 'd', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', 'has', 'hasn', 'have', 'haven', 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', 'more', 'most', 'mustn', 'my', 'myself', 'needn', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', 'she', "she'd", "she'll", "she's", 'should', 'shouldn', "should've", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them', 'th

In [72]:
data = pd.read_csv('/content/drive/MyDrive/Gen AI GL/Gen AI Week 1/Sentiment Analysis Project /data/training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1')

In [73]:
df = data.copy()
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [74]:
df.shape

(1599999, 6)

In [75]:
new_cols = ['target','ids','date','flag','user','text']

# Step 1: Convert existing column names into a DataFrame row
first_row = pd.DataFrame([df.columns.tolist()], columns=df.columns)

# Step 2: Add this row at the top of the existing data
df = pd.concat([first_row, df], ignore_index=True)

# Step 3: Assign your new list of column names
df.columns = new_cols

print(df.head())


  target         ids                          date      flag             user  \
0      0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1      0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2      0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3      0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4      0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   

                                                                                                                  text  
0  @switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  
1      is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!  
2                            @Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds  
3                            

In [76]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [78]:
df.shape

(1600000, 6)

In [79]:
df.isnull().sum()

Unnamed: 0,0
target,0
ids,0
date,0
flag,0
user,0
text,0


In [80]:
df.duplicated().sum()

np.int64(0)

In [81]:
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
4,800000
0,799999
0,1


In [82]:
df['target'].unique()

array(['0', 0, 4], dtype=object)

In [83]:
df['target'] = df['target'].map({0:0,'0':0,4:1})
df['target'].unique()

array([0, 1])

In [84]:
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


In [104]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
pattern1 = re.compile('[^a-zA-Z ]+')
pattern2 = re.compile('\bhttp\w+\b')

def stemming(content):
    content = pattern1.sub('', content)  # faster precompiled regex
    content = pattern2.sub(' ',content)
    tokens = content.lower().split()
    tokens = [stemmer.stem(word) for word in tokens if word not in non_negative_stopwords]
    return ' '.join(tokens)


  pattern2 = re.compile('\bhttp\w+\b')


In [105]:
df['text'] = df['text'].map(stemming)


In [87]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,switchfoot http twitpic com zl awww bummer shoulda got david carr third day
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset updat facebook text might cri result school today also blah
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,kenichan dive mani time ball manag save rest go bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,nationwideclass no not behav mad see


In [88]:
x = df['text']
y = df['target']

In [89]:
# splitting the dataset
xtrain , xtest , ytrain , ytest = train_test_split(x , y , test_size = 0.2 , random_state = 0)

# convert textual data to numerical data
vectorizer = TfidfVectorizer()
xtrain = vectorizer.fit_transform(xtrain)
xtest = vectorizer.transform(xtest)

In [90]:
print(xtrain)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9596349 stored elements and shape (1280000, 461608)>
  Coords	Values
  (0, 382788)	0.5218568045404242
  (0, 297311)	0.6232356327780383
  (0, 408011)	0.3371370780764098
  (0, 234856)	0.18185821646616165
  (0, 426791)	0.2679643799852714
  (0, 315999)	0.2756501589509843
  (0, 438903)	0.2114735969568524
  (1, 171471)	0.19809776536935508
  (1, 420894)	0.25656200686096986
  (1, 78784)	0.21269612474241953
  (1, 445811)	0.44918210875315057
  (1, 40016)	0.25216535956536873
  (1, 81860)	0.6901664688997386
  (1, 266187)	0.3286516093502419
  (2, 146267)	0.14182057590573882
  (2, 445823)	0.15086623694460702
  (2, 454731)	0.22122021129402739
  (2, 406452)	0.17049803826103221
  (2, 280634)	0.23782486392804178
  (2, 392427)	0.3146223146303898
  (2, 143199)	0.34048926092844567
  (2, 125793)	0.22415259519057731
  (2, 318193)	0.24073956626652676
  (2, 127779)	0.3654548377329762
  (2, 234605)	0.2813845455072357
  :	:
  (1279997, 376923)	0.17628

In [91]:
model = LogisticRegression()

model.fit(xtrain,ytrain)

ypred_test = model.predict(xtest)

print(accuracy_score(ytest, ypred_test))

0.78513125


In [92]:
# Function to predict the sentiment
def predict_sentiment(text):
    text = re.sub('[^a-zA-Z]',' ',text) # removing not a-z and A-Z
    text = text.lower()
    text = text.split()
    text = [stemmer.stem(word) for word in text if not word in non_negative_stopwords]
    text = ' '.join(text)
    text = [text]
    text = vectorizer.transform(text)
    sentiment = model.predict(text)
    if sentiment == 0:
        return "Negative"
    else:
        return "Positive"

In [118]:
import pandas as pd

# Example: assume you have y_test and y_pred as lists/arrays/Series

# Convert to pandas Series (if not already)
y_test = pd.Series(ytest)
y_pred = pd.Series(ypred_test)

# Proportion of actual & predicted classes
df_prop = pd.DataFrame({
    "actual_proportion": round(y_test.value_counts(normalize=True),3),
    "predicted_proportion": round(y_pred.value_counts(normalize=True),3)
})

df_prop


Unnamed: 0,actual_proportion,predicted_proportion
1,0.501,0.519
0,0.499,0.481


In [93]:
print(predict_sentiment('The sun is shining brightly today.'))

Positive


In [95]:
print(predict_sentiment('Just get out brother. I dont need you here'))

Negative


In [106]:
print(predict_sentiment('Iâ€™m really disappointed with the experience.'))

Negative


In [129]:
# Save the model
import pickle
pickle.dump(model , open('/content/drive/MyDrive/Gen AI GL/Gen AI Week 1/Sentiment Analysis Project /notebooks/model.pkl' , 'wb'))

In [130]:
pickle.dump(vectorizer , open('/content/drive/MyDrive/Gen AI GL/Gen AI Week 1/Sentiment Analysis Project /notebooks/vectorizer.pkl' , 'wb'))