## Imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from ast import literal_eval
import seaborn as sns
from tqdm import tqdm

from keras.datasets import imdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from keras.datasets import imdb

sns.set_style('whitegrid')
plt.rcParams.update({'font.size': 24})

## Load data

In [None]:
# Load the per-review prediction statistics.
df = pd.read_csv('results.csv', index_col=False)

# Drop the stray index column if the file has one; files saved without it
# no longer raise a KeyError here.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# `target` is stored as the string form of a sequence (presumably a one-hot
# vector -- confirm against the file). Parse it and keep only the position
# of the largest entry as the class label.
target_vectors = np.array(df['target'].apply(literal_eval).tolist())
df['target'] = target_vectors.argmax(axis=1)

# Make sure the prediction statistics are floating point.
stat_columns = ['mean0', 'std0', 'mean1', 'std1']
df[stat_columns] = df[stat_columns].astype('float64')

display(df.head())

In [None]:
def classify(x, threshold):
    '''
    Pick the class with the highest mean prediction, but only
    commit to it when the std of that class does not exceed
    the given threshold. Otherwise return -1 to flag the
    review as uncertain.
    '''
    means = np.array([x['mean0'], x['mean1']])
    stds = np.array([x['std0'], x['std1']])
    best = means.argmax()
    return best if stds[best] <= threshold else -1

In [None]:
# Largest standard deviation in the data. Note that only the `std1` column
# is inspected here; `std0` is not taken into account.
print('Max std')
df['std1'].max()

## Precision and recall trade off
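This cell runs the `classify` function over a range of uncertainty thresholds and plots precision and recall as a function of the threshold. Precision is the accuracy on the reviews for which the model commits to a class; recall is the share of reviews that pass the threshold.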

In [None]:
# Sweep the uncertainty threshold and record, for each value, how accurate
# the committed predictions are (precision) and how many reviews the model
# commits to at all (recall).
thresholds = list(np.arange(0,0.2,0.005))
precisions = []
recalls = []

for t in tqdm(thresholds):
    # Keep the predictions in their own Series so the sweep does not leave a
    # throw-away 'test' column behind on the shared dataframe.
    predicted = df.apply(classify, args=[t], axis=1)
    certain = predicted != -1
    # NaN when no review passes the threshold.
    precisions.append((df.loc[certain, 'target'] == predicted[certain]).mean())
    # Share of reviews that pass the threshold; the denominator comes from
    # the data instead of a hard-coded 12500.
    recalls.append(certain.mean())

# The first threshold (0) is left out of the plot, presumably because
# (almost) no review has a std of exactly 0, which leaves precision undefined.
datf = pd.DataFrame({'threshold':thresholds[1:], 'precision':precisions[1:], 'recall':recalls[1:]})
fig, ax = plt.subplots(figsize=(15,10))
# Label each line explicitly so the legend does not depend on drawing order.
sns.lineplot(data=datf, x='threshold', y='precision', label='precision', ax=ax)
sns.lineplot(data=datf, x='threshold', y='recall', label='recall', ax=ax)
ax.set_title('Precision and Recall with uncertainty threshold')
ax.set_xlabel('Uncertainty threshold')
ax.set_ylabel('Precision/recall')
ax.legend()
plt.show()

Uncomment the lines below to restrict `sortd` to the misclassified reviews, sorted by the model's uncertainty (`std0`, most certain first).

In [None]:
# Default: `sortd` is simply the full dataframe (same object, not a copy).
sortd = df
# Optional: classify every review (threshold 1), sort by `std0` ascending,
# keep only the rows where the prediction differs from the target, and
# renumber the rows.
# df['test'] = df.apply(classify, args=[1], axis=1)
# sortd = df.sort_values(['std0'])
# sortd = sortd[sortd['target'] != sortd['test']]
# sortd = sortd.reset_index(drop=True)
# sortd.head()

## Qualitative analysis
The cells below show the results for one review of the `sortd` dataframe at a time: the review text, followed by an error-bar plot of the mean prediction for each class with bars of two standard deviations. The class drawn in green is the ground truth. Re-run the second cell to step to the next review.

In [None]:
# Position in `sortd` of the next review to show; the next cell increments
# it every time it is run. Re-run this cell to start over from the top.
i = 0

In [None]:
# Show review number `i` of `sortd`: its text plus the predicted mean of
# each class with error bars.
print('Index:',i)
sample = sortd.iloc[i]
print('\nReview:\n')
print(sample['text'].strip())

# The ground-truth class is drawn in green, the other class in red.
colors = ['green', 'red'] if sample['target'] == 0 else ['red', 'green']

plt.figure(figsize=(4,8))
for label in (0, 1):
    # Error bars extend two standard deviations on either side of the mean.
    plt.errorbar(
        [label],
        [sample['mean%d' % label]],
        yerr=[sample['std%d' % label]*2],
        fmt='o',
        capsize=8,
        color=colors[label],
    )

plt.xticks([0, 1], ['negative', 'positive'])
plt.xlim([-0.5, 1.5])
plt.ylim([0,1])
plt.xlabel('Class')
plt.ylabel('Probability')

# Advance the cursor so that re-running this cell shows the next review.
i += 1

## Confusion matrices

In [None]:
# A threshold of 1 is meant to let every prediction through
# (assumes no std exceeds 1 -- confirm against the data).
df['y_pred'] = df.apply(classify, args=[1], axis=1)

# Leave out the reviews that were flagged as uncertain (-1).
tdf = df[df['y_pred'] != -1]

# Pin the label order so the matrix is always 2x2, even when one of the
# classes does not occur in the filtered data.
cm = confusion_matrix(tdf['target'].values, tdf['y_pred'].values, labels=[0, 1])

df_cm = pd.DataFrame(cm, index = ['negative', 'positive'],
                  columns = ['predicted negative', 'predicted positive'])


fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(df_cm, annot=True, fmt='g', cmap='GnBu', ax=ax)
# Relabel the rows and centre the labels vertically on their cells.
plt.yticks(np.arange(2)+0.5,['actual negative', 'actual positive'], va='center')
plt.title('Confusion matrix without an uncertainty threshold')
plt.show()

In [None]:
# Only keep predictions whose std is at most 0.075.
df['y_pred'] = df.apply(classify, args=[0.075], axis=1)

# Leave out the reviews that were flagged as uncertain (-1).
tdf = df[df['y_pred'] != -1]

# Pin the label order so the matrix is always 2x2, even when one of the
# classes does not occur among the reviews that pass the threshold.
cm = confusion_matrix(tdf['target'].values, tdf['y_pred'].values, labels=[0, 1])

df_cm = pd.DataFrame(cm, index = ['negative', 'positive'],
                  columns = ['predicted negative', 'predicted positive'])

fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(df_cm, annot=True, fmt='g', cmap='GnBu', ax=ax)
# Relabel the rows and centre the labels vertically on their cells.
plt.yticks(np.arange(2)+0.5,['actual negative', 'actual positive'], va='center')
plt.title('Confusion matrix with an uncertainty threshold of 0.075')
plt.show()

## Mean TF-IDF per document

In [None]:
# calculate tf-idf

# The review texts as a plain Python list.
X = df['text'].tolist()

# vectorizer = TfidfVectorizer()
# Variant in use: the vectorizer is configured with stop_words='english'.
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the reviews and transform them in one step.
X_tfidf = vectorizer.fit_transform(X)

In [None]:
# compute the mean tf-idf per document

# Average every row over the whole vocabulary in a single vectorised call
# instead of slicing the sparse matrix row by row in a Python loop.
# np.asarray(...).ravel() flattens the (n_documents, 1) result, and tolist()
# keeps `tf_idf_avg` a plain list that can be indexed and sliced as before.
tf_idf_avg = np.asarray(X_tfidf.mean(axis=1)).ravel().tolist()

print(tf_idf_avg[:10])

In [None]:
# plot the tf-idf of the first n documents

n = 50
x = np.arange(n)
y = tf_idf_avg[:n]

fig, ax = plt.subplots()
ax.bar(x, y)
ax.set_xlabel('Document')
ax.set_ylabel('tf-idf')
plt.show()

In [None]:
# compare the average tf-idf of the correctly and incorrectly classified reviews

# use a high threshold
threshold = 0.15

sum_correct = 0
sum_incorrect = 0

tfidf_sum_correct = 0
tfidf_sum_incorrect = 0

# `tf_idf_avg` is ordered by row position, so pair it with the rows
# positionally instead of looking it up through the index labels (which only
# coincide with positions for a default RangeIndex).
for (_, row), doc_tfidf in zip(df.iterrows(), tf_idf_avg):
    target = row['target']
    predicted = classify(row, threshold)
    if predicted == target:
        tfidf_sum_correct += doc_tfidf
        sum_correct += 1
    else:
        # This branch also counts the reviews flagged as uncertain (-1).
        tfidf_sum_incorrect += doc_tfidf
        sum_incorrect += 1

# Report NaN instead of raising ZeroDivisionError when a group is empty.
avg_correct = tfidf_sum_correct/sum_correct if sum_correct else float('nan')
avg_incorrect = tfidf_sum_incorrect/sum_incorrect if sum_incorrect else float('nan')

print("average of mean correctly classified reviews:", avg_correct)
print("average of mean incorrectly classified reviews:", avg_incorrect)

In [None]:
# cluster the documents based on their tf-idf

clusters=2
# KMeans starts from randomly chosen centroids; fixing the seed makes the
# cluster assignment (and the cluster numbering) reproducible across runs.
model = KMeans(n_clusters=clusters, max_iter=100, random_state=0)
model.fit(X_tfidf)

In [None]:
# put the results in a pandas dataframe

# One row per fitted document, holding the cluster label assigned by `model`.
results = pd.DataFrame({
    'cluster': model.labels_
})
results.head()

In [None]:
# For every cluster, list the ten terms with the largest centroid weight.

# Term indices per cluster, sorted by descending centroid weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and only fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Use a dedicated loop variable: `i` is the review cursor of the qualitative
# analysis cells and must not be overwritten here.
for cluster_id in range(clusters):
    print("Cluster %d:" % cluster_id)
    for ind in order_centroids[cluster_id, :10]:
        print(" %s" % terms[ind])

In [None]:
# Check which cluster the fitted model assigns to two hand-written reviews.
examples = [
    ("test 1:", "i liked the characters, it was a good movie"),
    ("test 2:", "bad, really bad, i hated it"),
]

for tag, sentence in examples:
    print(tag, sentence)
    # Vectorise with the already fitted vocabulary, then predict the cluster.
    features = vectorizer.transform([sentence])
    print("prediction", model.predict(features))

## Document length

In [None]:
# save np.load
np_load_old = np.load


def _np_load_allow_pickle(*args, **kwargs):
    '''Call the original np.load with allow_pickle forced to True.'''
    # Set the key instead of passing it as an extra keyword, so a caller that
    # already supplies allow_pickle does not trigger a "multiple values" error.
    kwargs['allow_pickle'] = True
    return np_load_old(*args, **kwargs)


# Temporarily replace np.load so that load_data reads the pickled arrays.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use this with a dataset file from a trusted source.
np.load = _np_load_allow_pickle
try:
    # call load_data with allow_pickle implicitly set to true
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)
finally:
    # Always restore the original function, also when loading fails.
    np.load = np_load_old

In [None]:
# One x position per training review.
x = np.arange(len(x_train))

# Length of every training review.
y = [len(review) for review in x_train]

# fig, ax = plt.subplots()
# ax.bar(x, y)
# plt.xlabel('document')
# plt.ylabel('length')
# plt.show()

In [None]:
# Convert the list of review lengths to an array to use its mean()/std().
y = np.array(y)
print("average length:", y.mean())
print("standard deviation:", y.std())

## Symbolic approach

In [None]:
# sentiment lexicon
# !wget https://gist.githubusercontent.com/bastings/d6f99dcb6c82231b94b013031356ba05/raw/f80a0281eba8621b122012c89c8b5e2200b39fd6/sent_lexicon

In [None]:
def get_pripo_type(path="sent_lexicon"):
    '''
    Read the sentiment lexicon and index it by word.

    Every non-blank line is read by field position, as before: the line is
    split on whitespace into key=value fields, and field 0 supplies the type,
    field 2 the word and field 5 the prior polarity. (The key names are not
    checked -- confirm the layout against the lexicon file.)

    Parameters:
        path: location of the lexicon file; defaults to "sent_lexicon" in
              the working directory.

    Returns:
        (lex_pripo, lex_type): two dicts mapping each word to its prior
        polarity and to its type. When a word occurs on several lines the
        last line wins.
    '''
    lex_pripo = {}
    lex_type = {}

    with open(path, mode="r", encoding="utf-8") as f:
        for line in f:
            # split() also removes the trailing newline, so the polarity is
            # no longer truncated when the last line lacks one.
            fields = line.split()
            if not fields:
                # skip blank lines
                continue
            word = fields[2].split('=', 1)[1]
            lex_pripo[word] = fields[5].split('=', 1)[1]
            lex_type[word] = fields[0].split('=', 1)[1]
    return lex_pripo, lex_type

# Module-level lookup tables (word -> prior polarity, word -> type) used by
# the lexicon-based functions below.
lex_pripo, lex_type = get_pripo_type()

In [None]:
def classify_review(review, lexicon=None, min_score=3):
    '''
    Classify a review with the sentiment lexicon.

    Parameters:
        review: the review text.
        lexicon: dict mapping a word to its polarity; defaults to the
                 module-level `lex_pripo`.
        min_score: the lexicon score has to be strictly greater than this
                   value for the review to count as positive (default 3).

    Returns:
        1 if the lexicon score exceeds `min_score`, otherwise 0.
    '''
    return 1 if count_lex(review, lexicon) > min_score else 0


def count_lex(review, lexicon=None):
    '''
    Compute the lexicon score of a review: +1 for every whitespace-separated
    word listed as 'positive', -1 for every word listed as 'negative'.
    Words that are missing from the lexicon, or that carry any other
    polarity, do not contribute.

    Parameters:
        review: the review text.
        lexicon: dict mapping a word to its polarity; defaults to the
                 module-level `lex_pripo`.

    Returns:
        The integer score.
    '''
    if lexicon is None:
        lexicon = lex_pripo
    score = 0
    for word in review.split():
        # .get() replaces the former bare `except: pass` around the lookup.
        polarity = lexicon.get(word)
        if polarity == 'positive':
            score += 1
        elif polarity == 'negative':
            score -= 1
    return score

In [None]:
def norm_dict(my_dict):
    '''
    Scale the values of my_dict in place so that they sum to 1 and
    return the same dict.

    A dict whose values sum to 0 (empty, or all counts zero) is returned
    unchanged instead of raising ZeroDivisionError.
    '''
    total = sum(my_dict.values())
    if total == 0:
        return my_dict
    factor = 1.0 / total
    for key in my_dict:
        my_dict[key] = my_dict[key] * factor
    return my_dict

# Std above which the model's prediction is treated as uncertain.
threshold = 0.075


all_reviews = 0
symbolic = 0   # reviews for which the lexicon label equals the target

oeps = 0       # lexicon label correct while the model prediction is wrong
joe1 = []      # lexicon scores of uncertain reviews the lexicon got right
joe2 = []      # lexicon scores of uncertain reviews the lexicon got wrong

bad = []       # lexicon scores of reviews the model misclassified
good = []      # lexicon scores of reviews the model classified correctly
# Histograms of lexicon scores, pre-filled with zeros over a fixed range so
# that the bars cover a contiguous range of scores.
bad_dict = dict.fromkeys(range(-22, 35), 0)
good_dict = dict.fromkeys(range(-22, 35), 0)
uncertain_dict = dict.fromkeys(range(-20, 33), 0)

for index, row in df.iterrows():

    all_reviews += 1
    review = row['text']

    target = row['target']

    std = np.array([row['std0'], row['std1']])

    prediction = np.array([row['mean0'], row['mean1']]).argmax()

    sentiment = classify_review(review)
    sent_score = count_lex(review)

    if sentiment == target:
        symbolic += 1

    if (sentiment == target) and (prediction != target):
        oeps += 1

    # .get() lets a score outside the pre-filled range extend the histogram
    # instead of raising KeyError.
    if std[prediction] > threshold:
        uncertain_dict[sent_score] = uncertain_dict.get(sent_score, 0) + 1
        if sentiment == target:
            joe1.append(sent_score)
        else:
            joe2.append(sent_score)

    if prediction != target:
        bad.append(sent_score)
        bad_dict[sent_score] = bad_dict.get(sent_score, 0) + 1
    else:
        good.append(sent_score)
        good_dict[sent_score] = good_dict.get(sent_score, 0) + 1

print('accuracy given by symbolic approach:', symbolic/all_reviews)
# print(oeps)
# Report NaN instead of raising ZeroDivisionError when a group is empty.
print('number of correclty classified by naive approach when model was uncertain', len(joe1))
print('mean sent_score:', sum(joe1)/len(joe1) if joe1 else float('nan'))
print('number of uncorreclty classified by naive approach when model was uncertain', len(joe2))
print('mean sent_score:', sum(joe2)/len(joe2) if joe2 else float('nan'))


bad_dict = norm_dict(bad_dict)
good_dict = norm_dict(good_dict)
uncertain_dict = norm_dict(uncertain_dict)

# fig, ax = plt.subplots(figsize=(15,10))
# plt.bar(range(len(bad_dict)), list(bad_dict.values()), align='center')
# plt.xticks(range(len(bad_dict)), list(bad_dict.keys()), rotation='vertical')
# plt.show()

# fig, ax = plt.subplots(figsize=(15,10))
# plt.bar(range(len(good_dict)), list(good_dict.values()), align='center')
# plt.xticks(range(len(good_dict)), list(good_dict.keys()), rotation='vertical')
# plt.show()


fig, ax = plt.subplots(figsize=(15,10))
# Plot against the scores themselves (sorted) rather than against their
# position in the dict, so every bar sits at its own score even when the
# histogram was extended beyond the pre-filled range.
scores = sorted(uncertain_dict)
plt.bar(scores, [uncertain_dict[score] for score in scores], align='center')

# Label every fifth score. The upper bound is max + 1 so that the largest
# score can receive a tick as well.
x_labels = list(range(min(scores), max(scores) + 1, 5))

plt.xticks(x_labels, x_labels, fontsize=16)

# plt.rc('axes', titlesize=10)
plt.xlabel('Sentiment score', fontsize=22)
# NOTE(review): the plotted values are fractions that sum to 1, not percentages.
plt.ylabel('Percentage of reviews', fontsize=22)
plt.show()