## Data loading

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

### Load data

In [None]:
# Read the results table and drop the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved -- confirm).
df = pd.read_csv('results.csv').drop(columns='Unnamed: 0')

# Preview the first rows.
display(df.head())

In [None]:
def classify(x, threshold):
    """Pick the class with the larger mean, or abstain when it is too uncertain.

    Args:
        x: Mapping-like row providing the keys 'mean0', 'mean1', 'std0'
            and 'std1'.
        threshold: Upper bound (exclusive) on the std of the winning class.

    Returns:
        The index (0 or 1) of the class with the highest mean if that class's
        std is strictly below ``threshold``; otherwise -1.
    """
    means = np.array([x['mean0'], x['mean1']])
    stds = np.array([x['std0'], x['std1']])

    # Index of the class with the largest mean (ties resolve to class 0).
    winner = means.argmax()

    if stds[winner] < threshold:
        return winner
    return -1

In [None]:
# Classify every row with a std threshold of 0.01; -1 marks rows where
# classify() abstains.
df['test'] = df.apply(classify, axis=1, threshold=0.01)

In [None]:
# Preview the first 15 rows of df.
df.head(15)

In [None]:
# Precision: fraction of correct predictions among the rows where the
# classifier did not abstain (test != -1).
is_certain = df['test'] != -1
certain = df[is_certain]
precision = certain['target'].eq(certain['test'])
print('precision', precision.mean())

# "Recall" as used here: fraction of ALL rows predicted correctly, so every
# abstention (-1) counts as a miss.
recall = df['target'].eq(df['test'])
print('recall', recall.mean())

In [None]:
# Sweep the confidence threshold and record precision / recall at each step.
# A confidence of t is translated into a maximum allowed std of 1 - t for
# classify().
thresholds = list(np.arange(0, 1, 0.05))
precisions = []
recalls = []

# Work on a real copy: a plain `testdf = df` would only alias the frame, and
# the sweep would then overwrite the 'test' column computed on df earlier.
testdf = df.copy()

for t in thresholds:
    testdf['test'] = testdf.apply(classify, args=[1 - t], axis=1)

    # Precision only looks at rows where the classifier did not abstain;
    # it is NaN when the classifier abstains on every row.
    certain = testdf[testdf['test'] != -1]
    precision = certain['target'] == certain['test']

    # Recall counts abstentions (-1) as misses.
    recall = testdf['target'] == testdf['test']

    precisions.append(precision.mean())
    recalls.append(recall.mean())

plt.plot(thresholds, precisions)
plt.plot(thresholds, recalls)
plt.title('Precision and Recall with confidence threshold')
plt.legend(['precision', 'recall'])
# pyplot exposes xlabel()/ylabel(); set_xlabel()/set_ylabel() exist only on
# Axes objects and raise AttributeError when called on the plt module.
plt.xlabel('confidence threshold')
plt.ylabel('precision/recall')

In [None]:
# Positional index of the df row to plot; the plotting cell below reads it
# and increments it after each run.
i = 62

In [None]:
# Plot mean +/- std of both classes for row i; the class equal to the row's
# target is drawn in green, the other one in red.
print(i)
s = df.iloc[i]
colors = ['green', 'red'] if s['target'] == 0 else ['red', 'green']

plt.figure(figsize=(2, 4))
for cls, mean_key, std_key in ((0, 'mean0', 'std0'), (1, 'mean1', 'std1')):
    plt.errorbar(
        [cls],
        [s[mean_key]],
        yerr=[s[std_key]],
        fmt='o',
        capsize=8,
        color=colors[cls],
    )

plt.xticks([0, 1])
plt.xlim([-0.5, 1.5])
plt.xlabel('Class')
plt.ylabel('Probability')

# Advance the cursor so re-running this cell shows the next row.
i += 1

# Mean TF-IDF per document

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from keras.datasets import imdb

In [None]:
# credits to:
# https://gist.github.com/prinsherbert/92313f15fc814d6eed1e36ab4df1f92d

# imdb.load_data() reserves the lowest token ids for special markers
# (start-of-sequence, out-of-vocabulary, ...) and shifts every real word id
# from imdb.get_word_index() up by `index_from` (3 by default). The shift has
# to be undone before looking a token up, otherwise every word is decoded as
# a different word.
INDEX_FROM = 3

word_to_index = imdb.get_word_index()
index_to_word = [None] * (max(word_to_index.values()) + 1)

# Dedicated loop names: reusing `i` here would overwrite the notebook-global
# `i` that the plotting cell uses as its row cursor.
for word, word_id in word_to_index.items():
    index_to_word[word_id] = word

(X_train, y_train), _ = imdb.load_data(index_from=INDEX_FROM)


def _decode_review(tokens):
    """Turn one encoded review (sequence of token ids) back into text.

    Special tokens (ids up to INDEX_FROM) and ids outside the vocabulary are
    skipped.
    """
    words = []
    for token in tokens:
        word_id = token - INDEX_FROM
        if 0 < word_id < len(index_to_word):
            words.append(index_to_word[word_id])
    return ' '.join(words)


X_train = [_decode_review(review) for review in X_train]

# The test split can be decoded the same way once it is loaded:
# X_test = [_decode_review(review) for review in X_test]

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the decoded training reviews
# and transform them in the same step; X holds one row per document.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_train)

In [None]:
# Mean TF-IDF value of each document: one .mean() per row of X.
tf_idf_avg = [document.mean() for document in X]

# Show the first ten averages as a sanity check.
print(tf_idf_avg[:10])