In [1]:
import pandas as pd
df = pd.read_pickle("preprocessed_labeled.pkl")

In [2]:
# Make positive, fact and neutral to be 0
df['sentiment'] = df['sentiment'].replace(1,0)
df['sentiment'] = df['sentiment'].replace(2,0)
df['sentiment'] = df['sentiment'].replace(0,1) # 1 is POSITIVE
df['sentiment'] = df['sentiment'].replace(-1,0) # 0 is NEGATIVE

In [3]:
# Include all negatives
negatives = df[df.index.isin(range(40000,50000))]
negatives = negatives[negatives['sentiment']==0]
df_1 = df[0:40000]
df_2 = df[40000:]
frames = [df_1,negatives,df_2]
df = pd.concat(frames)
df = df.reset_index()

In [4]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

compound, neg, pos = [], [], []
for text in df['clean']:
    compound.append(sia.polarity_scores(text)['compound'])
    neg.append(sia.polarity_scores(text)['neg'])
    pos.append(sia.polarity_scores(text)['pos'])
df['compound'] = compound
df['neg'] = neg
df['pos'] = pos

In [5]:
# Extract polarity and subjectivity
from textblob import TextBlob

polarity, subjectivity = [], []
for text in df['clean']:
    polarity.append(TextBlob(text).sentiment.polarity)
    subjectivity.append(TextBlob(text).sentiment.subjectivity)
df['polarity'] = polarity
df['subjectivity'] = subjectivity

In [6]:
# Normalize data between 0 and 1
from sklearn import preprocessing as pre
import numpy as np

for i in ("compound","polarity","subjectivity"):
    x_array = np.array(list(df[i]))
    x_array = x_array.reshape(-1, 1)
    norm = pre.MinMaxScaler().fit_transform(x_array)
    newList = []
    for j in range(0,len(norm)): newList.append(list(norm[j])[0])
    df[i] = newList

In [7]:
cluster = []
for i in df['compound']:
    if i > 0: cluster.append(1)
    else: cluster.append(-1)
df['cluster'] = cluster
print(len(df[(df['sentiment']==-1)&(df['cluster']==-1)])) # 2435 out of 3990
df = df.drop(['cluster'], axis=1)

0


In [8]:
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import nltk

In [9]:
# Solve imbalanced data with SLOVE
from imblearn.over_sampling import SMOTE
X = df[['neg','pos','compound','polarity','subjectivity']]
y = df['sentiment']
print(len(X), len(y))
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
print(len(X), len(y))

44391 44391
79906 79906


In [10]:
# Use if using balancing
# Create format for nltk classfiries
features = []
for tupled, row2 in zip(X.iterrows(), y):
    obj = {}
    obj['neg'] = tupled[1]['neg']
    obj['pos'] = tupled[1]['neg']
    obj['compound'] = tupled[1]['compound']
    obj['polarity'] = tupled[1]['polarity']
    obj['subjectivity'] = tupled[1]['subjectivity']
    features.append((obj,row2))

In [11]:
# Skip if using balancing
# Create format for nltk classfiries
features = []
for index, row in df.iterrows():
    obj = {}
    obj['neg'] = row['neg']
    obj['pos'] = row['neg']
    obj['compound'] = row['compound']
    obj['polarity'] = row['polarity']
    obj['subjectivity'] = row['subjectivity']
    features.append((obj,row['sentiment']))

In [11]:
import numpy as np
from random import shuffle
from sklearn.model_selection import train_test_split

shuffle(features)
threshold = round(len(features)/100*20)
test = features[:threshold]
train = features[threshold:]

In [12]:
# https://realpython.com/python-nltk-sentiment-analysis/?fbclid=IwAR1_AjswnRENbP3sukZVnThNdsKGhh1yOxPm4vzP2lTKelHNWhNGHoUPL10
from sklearn import metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score

classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier = classifier.train(train)
    accuracy = nltk.classify.accuracy(classifier, test)
    print(F"{accuracy:.2%} - {name}")
    
    y_pred = classifier.classify_many([i for i, label in test])
    y_test = [label for i, label in test]      
    print(metrics.classification_report(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred))
    print(roc_auc_score(y_test, y_pred))

54.58% - BernoulliNB
              precision    recall  f1-score   support

           0       0.55      0.50      0.52      7971
           1       0.54      0.59      0.57      8010

    accuracy                           0.55     15981
   macro avg       0.55      0.55      0.54     15981
weighted avg       0.55      0.55      0.54     15981

0.09187341804824685
0.5457166122324513
53.59% - ComplementNB
              precision    recall  f1-score   support

           0       0.54      0.46      0.50      7971
           1       0.53      0.61      0.57      8010

    accuracy                           0.54     15981
   macro avg       0.54      0.54      0.53     15981
weighted avg       0.54      0.54      0.53     15981

0.07224354807646823
0.5357008105694003
53.63% - MultinomialNB
              precision    recall  f1-score   support

           0       0.54      0.46      0.50      7971
           1       0.53      0.61      0.57      8010

    accuracy                          

In [28]:
predictions = classifier.classify_many([i for i, label in features[40500:]])

In [13]:
# Train the best classifier again
classifier = nltk.classify.SklearnClassifier(RandomForestClassifier())
classifier = classifier.train(train)
accuracy = nltk.classify.accuracy(classifier, test)
print(F"{accuracy:.2%} - {name}")

# rfc = nltk.classify.SklearnClassifier(RandomForestClassifier())
# rfc = rfc.train(features[:40500])
# accuracy = nltk.classify.accuracy(rfc, features[40500:])
# print(F"{accuracy:.2%} - {name}")

# # Check if predictis 0s
# predictions = classifier.classify_many([i for i, label in features[40500:]])
# positives,negatives = [],[]
# for j in predictions:
#     if j == 0: negatives.append(j)
#     else: positives.append(j)
        
# print(len(positives))
# print(len(negatives))

78.78% - AdaBoostClassifier


### Classify unlabeled

In [14]:
# Classify unlabaled data
unlabaled = pd.read_pickle("preprocessed.pkl")

# Predict
def decode_sentiment(score):
    if score > 0.5: return 1
    elif score <= 0.5: return 0

def get_features(text):
    features = []
    # Add features for text
    features.append(sia.polarity_scores(text)['compound'])
    features.append(sia.polarity_scores(text)['neg'])
    features.append(sia.polarity_scores(text)['pos'])
    features.append(TextBlob(text).sentiment.polarity)
    features.append(TextBlob(text).sentiment.subjectivity)
    return features

def predict(features):
    scores = model.predict(np.array(features))
    return scores

negatives = []
features = []
for index, row in unlabaled.iterrows():
    vector = get_features(row["clean"])
    features.append(vector)

In [15]:
# Create the pandas DataFrame
features = pd.DataFrame(features)
 
# specifying column names
features.columns = ['compound', 'neg', 'pos', 'polarity', 'subjectivity']

In [18]:
# Normalize data between 0 and 1
from sklearn import preprocessing as pre
import numpy as np

for i in ("compound","polarity","subjectivity"):
    x_array = np.array(list(features[i]))
    x_array = x_array.reshape(-1, 1)
    norm = pre.MinMaxScaler().fit_transform(x_array)
    newList = []
    for j in range(0,len(norm)): newList.append(list(norm[j])[0])
    features[i] = newList

In [20]:
features_format = []
for index, row in features.iterrows():
    obj = {}
    obj['neg'] = row['neg']
    obj['pos'] = row['neg']
    obj['compound'] = row['compound']
    obj['polarity'] = row['polarity']
    obj['subjectivity'] = row['subjectivity']
    features_format.append((obj,0))

In [21]:
y_pred = classifier.classify_many([i for i, label in features_format])

In [22]:
negatives = []
scores = y_pred
for i in range(0,len(scores)):
    if scores[i]<0.5: negatives.append(i)

In [25]:
len(scores)

313985

In [23]:
len(negatives)

109340

In [24]:
with open('randomforest_negatives.txt','w') as tfile:
    tfile.write(str(negatives))