In [1]:
import pandas as pd
df = pd.read_pickle("preprocessed_labeled.pkl")

In [3]:
# Make positive, fact and neutral to be 0
df['sentiment'] = df['sentiment'].replace(1,0)
df['sentiment'] = df['sentiment'].replace(2,0)
df['sentiment'] = df['sentiment'].replace(0,1) # 1 is POSITIVE
df['sentiment'] = df['sentiment'].replace(-1,0) # 0 is NEGATIVE

In [5]:
# Include all negatives
negatives = df[df.index.isin(range(40000,50000))]
negatives = negatives[negatives['sentiment']==0]
df_1 = df[0:40000]
df_2 = df[40000:]
frames = [df_1,negatives,df_2]
df = pd.concat(frames)
df = df.reset_index()

In [7]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

compound, neg, pos = [], [], []
for text in df['clean']:
    compound.append(sia.polarity_scores(text)['compound'])
    neg.append(sia.polarity_scores(text)['neg'])
    pos.append(sia.polarity_scores(text)['pos'])
df['compound'] = compound
df['neg'] = neg
df['pos'] = pos

In [8]:
# Extract polarity and subjectivity
from textblob import TextBlob

polarity, subjectivity = [], []
for text in df['clean']:
    polarity.append(TextBlob(text).sentiment.polarity)
    subjectivity.append(TextBlob(text).sentiment.subjectivity)
df['polarity'] = polarity
df['subjectivity'] = subjectivity

In [9]:
# Normalize data between 0 and 1
from sklearn import preprocessing as pre
import numpy as np

for i in ("compound","polarity","subjectivity"):
    x_array = np.array(list(df[i]))
    x_array = x_array.reshape(-1, 1)
    norm = pre.MinMaxScaler().fit_transform(x_array)
    newList = []
    for j in range(0,len(norm)): newList.append(list(norm[j])[0])
    df[i] = newList

In [36]:
cluster = []
for i in df['compound']:
    if i > 0: cluster.append(1)
    else: cluster.append(-1)
df['cluster'] = cluster
print(len(df[(df['sentiment']==-1)&(df['cluster']==-1)])) # 2435 out of 3990
df = df.drop(['cluster'], axis=1)

In [10]:
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import nltk

In [21]:
# Solve imbalanced data with SLOVE
from imblearn.over_sampling import SMOTE
X = df[['neg','pos','compound','polarity','subjectivity']]
y = df['sentiment']
print(len(X), len(y))
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
print(len(X), len(y))

44391 44391
79906 79906


In [26]:
# Use if using balancing
# Create format for nltk classfiries
features = []
for index, row in X.iterrows():
    obj = {}
    obj['neg'] = row['neg']
    obj['pos'] = row['neg']
    obj['compound'] = row['compound']
    obj['polarity'] = row['polarity']
    obj['subjectivity'] = row['subjectivity']
    features.append((obj,row['sentiment']))

In [11]:
# Skip if using balancing
# Create format for nltk classfiries
features = []
for index, row in df.iterrows():
    obj = {}
    obj['neg'] = row['neg']
    obj['pos'] = row['neg']
    obj['compound'] = row['compound']
    obj['polarity'] = row['polarity']
    obj['subjectivity'] = row['subjectivity']
    features.append((obj,row['sentiment']))

In [27]:
# https://realpython.com/python-nltk-sentiment-analysis/?fbclid=IwAR1_AjswnRENbP3sukZVnThNdsKGhh1yOxPm4vzP2lTKelHNWhNGHoUPL10
from random import shuffle
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}
shuffle(features)
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier = classifier.train(features[:40500])
    accuracy = nltk.classify.accuracy(classifier, features[40500:])
    print(F"{accuracy:.2%} - {name}")

54.19% - BernoulliNB
53.42% - ComplementNB
53.48% - MultinomialNB
71.80% - KNeighborsClassifier
76.20% - DecisionTreeClassifier
77.03% - RandomForestClassifier
53.81% - LogisticRegression
59.46% - MLPClassifier
61.76% - AdaBoostClassifier


In [28]:
predictions = classifier.classify_many([i for i, label in features[40500:]])

In [30]:
# Train the vest classifier again
rfc = nltk.classify.SklearnClassifier(RandomForestClassifier())
rfc = rfc.train(features[:40500])
accuracy = nltk.classify.accuracy(rfc, features[40500:])
print(F"{accuracy:.2%} - {name}")

# Check if predictis 0s
predictions = classifier.classify_many([i for i, label in features[40500:]])
positives,negatives = [],[]
for j in predictions:
    if j == 0: negatives.append(j)
    else: positives.append(j)
        
print(len(positives))
print(len(negatives))

76.94% - AdaBoostClassifier
21881
17525


In [45]:
correct = 0
for i,j in zip(X[40500:]['sentiment'], predictions):
    if i == 1 and i == j: correct = correct + 1

In [46]:
correct # correct negatives 15984 ouf of 35954, correct positives 1911 out of 3452

1911