Imports

In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import zipfile

Load the dataset

In [30]:
# Unpack the Sentiment140 archive into a local folder, then read the training CSV.
# The CSV is read with header=None, so the six column names are supplied explicitly.
archive_path = 'sentiment140.zip'
extract_dir = 'sentiment140'
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(extract_dir)

df = pd.read_csv(
    'sentiment140/training.1600000.processed.noemoticon.csv',
    encoding='ISO-8859-1',
    header=None,
    names=column_names,
)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [31]:
# Count how many rows carry each raw label value in the 'target' column.
raw_target_counts = df['target'].value_counts()
raw_target_counts

target
0    800000
4    800000
Name: count, dtype: int64

In [32]:
# Keep only the tweet text and its label. The explicit .copy() makes the
# two-column frame independent of the full frame, so the column assignment
# below does not trigger pandas' SettingWithCopyWarning.
df = df[['text', 'target']].copy()

# Relabel the positive class from 4 to 1 so the labels are binary (0 / 1).
df['target'] = df['target'].replace(4, 1)

# Show the class counts after relabelling.
df['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

Take the first 5000 records of each class for simplicity

In [33]:
# Number of rows kept per class; head() takes them in their existing order
# (the first rows of each class), not a random sample.
samples_per_class = 5000

is_negative = df['target'].eq(0)
is_positive = df['target'].eq(1)

negative_samples = df.loc[is_negative].head(samples_per_class)
positive_samples = df.loc[is_positive].head(samples_per_class)

# Stack negatives first, then positives, into the working frame.
df = pd.concat([negative_samples, positive_samples])

# Show the class counts of the reduced frame.
df['target'].value_counts()

target
0    5000
1    5000
Name: count, dtype: int64

Split the data into training and test sets

In [34]:
# Features are the raw tweet text; labels are the 0/1 sentiment targets.
X, y = df['text'], df['target']

# Hold out 20% of the rows for evaluation; random_state fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

TF-IDF vectorization for text feature extraction

In [35]:
# TF-IDF features limited to 1000 terms (max_features), with English stop words removed.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')

# Fit the vocabulary on the training text only, then reuse it for the test text.
train_sparse = vectorizer.fit_transform(X_train)
test_sparse = vectorizer.transform(X_test)

# Convert both matrices to dense arrays for the classifier cells that follow.
X_train_tfidf = train_sparse.toarray()
X_test_tfidf = test_sparse.toarray()

Train the model

In [36]:
# Fit a Gaussian Naive Bayes classifier on the dense TF-IDF training matrix.
gnb = GaussianNB()
gnb.fit(X=X_train_tfidf, y=y_train)

Test the model

In [37]:
# Predict labels for the held-out rows and report the fraction predicted correctly.
y_pred = gnb.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.6660


Test the model on an external example

In [38]:
# Classify one hand-written review using the already-fitted vectorizer and model.
review = ["Loved This Movie !!!!"]
review_tfidf = vectorizer.transform(review).toarray()
review_pred = gnb.predict(review_tfidf)

# Label 1 is reported as positive; any other label is reported as negative.
print("Sentiment: Positive" if review_pred[0] == 1 else "Sentiment: Negative")


Sentiment: Positive
