In [2]:
import numpy as np
import re
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
import pickle


In [3]:
df = pd.read_csv('tweets.csv', encoding='cp1252')
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [4]:
column_names = ["target", "id","date", "flag", "user", "text"]
df.columns = column_names
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [5]:
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\S+', '', text)    # Remove mentions (@username)
    text = re.sub(r'#\S+', '', text)    # Remove hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetical characters
    text = text.lower()  # Convert text to lowercase
    return text

df['text'] = df['text'].apply(preprocess_text)
df['text'].head()

0    is upset that he cant update his facebook by t...
1     i dived many times for the ball managed to sa...
2      my whole body feels itchy and like its on fire 
3     no its not behaving at all im mad why am i he...
4                                  not the whole crew 
Name: text, dtype: object

In [6]:
df.shape

(1599999, 6)

In [7]:
# finding the missing values
df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

In [8]:
df['target'].value_counts()

target
4    800000
0    799999
Name: count, dtype: int64

convert 4 TO 1

In [9]:
df.replace({'target': {4: 1}}, inplace=True)
df['target'].value_counts()

target
1    800000
0    799999
Name: count, dtype: int64

0----> NEGATIVE
1----> POSITIVE

In [10]:
# STEMMING
!pip install nltk
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Stemming is a process in natural language processing (NLP) where words are reduced to their base or root form. This is done by removing suffixes (or sometimes prefixes) from words. The idea is to group different forms of the same word under a common root form. For example:

"running" → "run"
"better" → "better" (no change, since "better" is already a base form)
"cats" → "cat"

In [11]:
def stemming(content):
    stemmer = nltk.SnowballStemmer("english")
    stemmed_content = content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [stemmer.stem(word) for word in stemmed_content]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content


In [12]:
df['stemmed_content'] = df['text'].apply(stemming)
df.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he cant update his facebook by t...,is upset that he cant updat his facebook by te...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,i dived many times for the ball managed to sa...,i dive mani time for the ball manag to save th...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole bodi feel itchi and like it on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,no its not behaving at all im mad why am i he...,no it not behav at all im mad whi am i here be...
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,not the whole crew,not the whole crew


In [13]:
print(df['stemmed_content'])

0          is upset that he cant updat his facebook by te...
1          i dive mani time for the ball manag to save th...
2               my whole bodi feel itchi and like it on fire
3          no it not behav at all im mad whi am i here be...
4                                         not the whole crew
                                 ...                        
1599994    just woke up have no school is the best feel ever
1599995       thewdbcom veri cool to hear old walt interview
1599996    are you readi for your mojo makeov ask me for ...
1599997    happi th birthday to my boo of alll time tupac...
1599998                                                happi
Name: stemmed_content, Length: 1599999, dtype: object


In [14]:
print(df['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599994    1
1599995    1
1599996    1
1599997    1
1599998    1
Name: target, Length: 1599999, dtype: int64


In [15]:
# Sperating the labels
X = df['stemmed_content'].values
y = df['target'].values

In [16]:
# Splittinf the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

In [17]:
print(X.shape, X_train.shape, X_test.shape)

(1599999,) (1279999,) (320000,)


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorized = TfidfVectorizer(max_features=50000, ngram_range=(1,2), stop_words='english')
X_train = vectorized.fit_transform(X_train)
X_test = vectorized.transform(X_test)

In [19]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9826979 stored elements and shape (1279999, 50000)>
  Coords	Values
  (0, 46390)	0.22652108589817208
  (0, 35820)	0.2971851532101235
  (0, 21006)	0.44159895630719215
  (0, 10268)	0.31208347566980676
  (0, 24207)	0.3498348512216682
  (0, 47808)	0.37393359241016555
  (0, 46613)	0.5529490877966808
  (1, 18145)	0.39541556965139923
  (1, 1481)	0.2512308044230782
  (1, 10950)	0.4285914076782298
  (1, 15687)	0.500453429748749
  (1, 32651)	0.31190176986857265
  (1, 7810)	0.32797506509514407
  (1, 19271)	0.3762114069668644
  (2, 33370)	0.43741366750533095
  (2, 22554)	0.1958574087107581
  (2, 28382)	0.5060622334294136
  (2, 48378)	0.17636333727525846
  (2, 42928)	0.21650347354353053
  (2, 11356)	0.24752004526089436
  (2, 10855)	0.2969414087467539
  (2, 8233)	0.1678267988916983
  (2, 48698)	0.33420004912993095
  (2, 11374)	0.38332298879489474
  (3, 8233)	0.2898499540527057
  :	:
  (1279997, 28567)	0.38787188853361343
  (1279997, 10041

In [20]:
# training the machine learning model
# using various learinging teachniques
model = MultinomialNB()
model.fit(X_train, y_train)


In [21]:
# evaluation
y_pred = model.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print(acc_score)

0.767565625


In [22]:
# !pip install xgboost
# import xgboost as xgb
# model = xgb.XGBClassifier(max_depth=6, n_estimators=200, learning_rate=0.1)
# model.fit(X_train, y_train)

In [23]:
# y_pred = model.predict(X_test)
# acc_score = accuracy_score(y_test, y_pred)
# print(acc_score)


In [24]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print(acc_score)

0.7832375


In [25]:
import pickle
filename ='sentiment_model.sav'
pickle.dump(model,open(filename, 'wb'))

In [27]:
with open('vectorizer.sav', 'wb') as vectorizer_file:
    pickle.dump(vectorized, vectorizer_file)