### Prepare Dataset

In [1]:
import re
import json
import ast
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score

In [2]:
# The raw file holds one dictionary-like (JSON) record per line, with a blank
# separator line after each record. Read every line here, blanks included;
# the records themselves are parsed in the following cell.
# The encoding is pinned so decoding does not depend on the platform default
# (e.g. cp1252 on Windows), which can garble or reject non-ASCII tweet text.
# NOTE(review): assumes the dump is UTF-8 (the JSON default) - confirm.
with open('sentiment.txt', encoding='utf-8') as f:
    lines = f.readlines()

In [3]:
# Build two parallel lists: the tweet text of every record, and the list of
# emoticons (":)" / ":(") found in that text, which later become the labels.
texts = []
labels = []
emoticon_re = re.compile(r':\)|:\(')  # literal :) or :(
for line in lines:
    record = line.strip()
    if not record:
        # Blank separator line between two records. Skipping blanks explicitly
        # (instead of taking every second line) keeps the parse correct even
        # if a separator is missing or doubled somewhere in the file.
        continue
    # strip() already removed the newline, so a final line without a trailing
    # "\n" is not truncated the way slicing off the last character would.
    text = json.loads(record)['text']
    texts.append(text)
    labels.append(emoticon_re.findall(text))

In [4]:
# combine texts and labels into a dataframe (one row per tweet)
df = pd.DataFrame({'text': texts, 'label': labels})
# a tweet may contain several emoticons, identical or different:
# collapse them to the distinct ones (sorted so the result is deterministic)
df['label'] = df['label'].apply(lambda found: sorted(set(found)))
# keep only tweets with exactly one distinct sentiment; since the extraction
# regex only yields ':)' or ':(', a length-1 list is [':)'] or [':(']
# (this avoids calling isin() on list-valued, i.e. unhashable, cells)
has_single_sentiment = df['label'].apply(len) == 1
data = df[has_single_sentiment].copy()
# unwrap the one-element list into its string
data['label'] = data['label'].apply(lambda found: found[0])
# encode the labels: ':)' -> 1, ':(' -> 0. An explicit map + cast yields an
# integer column without relying on replace()'s implicit object downcasting.
data['label'] = data['label'].map({':)': 1, ':(': 0}).astype('int64')

data.head()

Unnamed: 0,text,label
0,we haven’t talked much but u are very kind :),1
1,@SolanaPawnStars cant even connect to site :(,0
2,@hottestsingles excited but cold comfort in th...,0
5,// Back soon :),1
6,RT @wilburtwtt: crimeboystwt follow list!! \n\...,1


### NLP: Text Cleaning and TF-IDF Vectorization

In [5]:
# Pull the model inputs out of the dataframe as NumPy arrays.
# Note: this rebinds `labels`, which until now held the per-tweet emoticon lists.
features, labels = (np.array(data[column]) for column in ('text', 'label'))

In [9]:
# Deletion table covering every character in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)

# Drop punctuation from each tweet. string.punctuation includes ':', '(' and
# ')', so the emoticons the labels were derived from are removed here as well.
processed_features = [tweet.translate(punctuation_table) for tweet in features]

# TF-IDF vectorizer: lowercases, folds accents to ASCII and filters the NLTK
# English stop words.
# NOTE(review): apostrophes were stripped above, so contractions now appear as
# e.g. "dont" and no longer match stop-word entries spelled with an apostrophe
# - confirm whether that is intended.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(
    use_idf=True,
    lowercase=True,
    strip_accents='ascii',
    stop_words=english_stop_words,
)
# learn the vocabulary / IDF weights and build the document-term matrix
X = vectorizer.fit_transform(processed_features)

### Model and Prediction

In [10]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): X comes from a vectorizer fitted on all rows before this split,
# so its IDF statistics have already seen the held-out documents.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, random_state=88, test_size=0.25
)

# Bernoulli naive Bayes with default settings
clf = naive_bayes.BernoulliNB()
# train on the training portion; the fitted estimator is the cell's displayed value
clf.fit(X_train, y_train)

BernoulliNB()

In [11]:
# predict the sentiment of the held-out tweets
y_pred = clf.predict(X_test)
# rows = true class, columns = predicted class (0 is ':(', 1 is ':)')
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 7271,  1916],
       [ 1053, 10773]], dtype=int64)

In [12]:
# fraction of held-out tweets whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8587065150145148

In [13]:
# recall for the positive class (label 1, i.e. ':)')
recall_score(y_true=y_test, y_pred=y_pred)

0.910958904109589

In [14]:
# precision for the positive class (label 1, i.e. ':)')
precision_score(y_true=y_test, y_pred=y_pred)

0.8490030735282528