In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# notebook can work from a folder stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Move into the project folder on the mounted Drive, printing the working
# directory before and after so the switch is visible in the cell output.
project_dir = '/content/drive/My Drive/1006'
print(os.getcwd())
os.chdir(project_dir)
print(os.getcwd())

/content
/content/drive/My Drive/1006


In [3]:
import pandas as pd
import numpy as np
import ast
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

In [4]:
# Dataset identifier. It is spliced into the file names read below
# ('<dataset_name>_cvec_train.csv', 'indices_<dataset_name>_<embedding>_*.txt', ...).
dataset_name = 'news'

In [5]:
# Embedding names; each one is used both as a component of the index-file
# names and as the prefix of the matching result-dict keys.
embed_types = ['roberta', 'distil', 'glove6B', 'universal', 'tsne10']
# Training-subset sizes: the variance / kld / recon evaluations train on the
# first `c` entries of each index list, for every `c` in this list.
counts = [100, 400, 800, 1200, 1800, 2400, 3000, 4000, 5000]

## Results Dict



In [6]:
# Result keys: three embedding-free baselines followed by one
# '<embedding>_<method>' key per selection method and embedding.
# The per-embedding keys are derived from embed_types (rather than typed out
# by hand) so they cannot drift from the names the evaluation cells build;
# the resulting order is identical to the original hand-written list
# (all kmeans keys, then var, kld, taddy, recon).
selection_methods = ['kmeans', 'var', 'kld', 'taddy', 'recon']
key_list = ['random_noembed', '12topics', '24topics'] + [
    embed+'_'+method for method in selection_methods for embed in embed_types]

# One list of scores per key; the evaluation cells fill these in.
acc_dict = {k: [] for k in key_list}
f1_dict = {k: [] for k in key_list}

## Complete dataset

In [7]:
# Training split: the 'label' column becomes the target vector and every
# remaining column becomes a feature, both stored as int16 arrays.
train_frame = pd.read_csv(dataset_name+'_cvec_train.csv', index_col=0)
y_train = train_frame.pop('label').to_numpy(dtype=np.int16)
X_train = train_frame.to_numpy(dtype=np.int16)
del train_frame  # the arrays are all that is needed from here on

print('Train data done')

# Evaluation split, converted the same way.
test_frame = pd.read_csv(dataset_name+'_cvec_test.csv', index_col=0)
y_test = test_frame.pop('label').to_numpy(dtype=np.int16)
X_test = test_frame.to_numpy(dtype=np.int16)
del test_frame

print('Eval data done')

Train data done
Eval data done


In [8]:
# Baseline: fit Multinomial Naive Bayes on the complete training set and
# report accuracy and weighted F1 on the test set.
# NOTE(review): alpha is 0.01 here but 1e-3 in the subset experiments below --
# confirm the difference is intentional.
mnb = MultinomialNB(alpha=0.01)
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)  # predict once and reuse for both metrics
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))

0.7229047229047229
0.713102140007122


## Random Pick

In [None]:
# Load the random-baseline index subsets: every line of the file is parsed
# with ast.literal_eval into one entry of indices_list.
# The file name is built from dataset_name, like the other index files in this
# notebook, instead of hard-coding 'news'.
with open('indices_'+dataset_name+'_random.txt') as fh:
  indices_list = [ast.literal_eval(line) for line in fh]

In [None]:
# Train one classifier per random subset in indices_list and record its test
# accuracy and weighted F1 under the 'random_noembed' key.
# Scores are collected in fresh lists and assigned at the end, so re-running
# this cell replaces the results instead of appending a duplicate copy.
acc_scores, f1_scores = [], []
for lst in indices_list:
  mnb = MultinomialNB(alpha=1e-3)
  mnb.fit(X_train[lst], y_train[lst])
  y_pred = mnb.predict(X_test)  # predict once and reuse for both metrics
  acc_scores.append(accuracy_score(y_test, y_pred))
  f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
acc_dict['random_noembed'] = acc_scores
f1_dict['random_noembed'] = f1_scores

## K-means Clustering

In [None]:
# For each embedding: read its k-means index file (one literal per line, each
# parsed into one training subset), train a classifier per subset and record
# test accuracy / weighted F1 under '<embedding>_kmeans'.
# Scores are collected in fresh lists and assigned once per embedding, so
# re-running this cell replaces the results instead of appending duplicates.
for embed in embed_types:
  with open('indices_'+dataset_name+'_'+embed+'_kmeans.txt') as fh:
    indices_list = [ast.literal_eval(line) for line in fh]
  acc_scores, f1_scores = [], []
  for lst in indices_list:
    mnb = MultinomialNB(alpha=1e-3)
    mnb.fit(X_train[lst], y_train[lst])
    y_pred = mnb.predict(X_test)  # predict once and reuse for both metrics
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
  acc_dict[embed+'_kmeans'] = acc_scores
  f1_dict[embed+'_kmeans'] = f1_scores

## Variance Thresholding

In [None]:
# Gather the variance-thresholding index lists for all embeddings into a
# single flat list: every line of every file is parsed with ast.literal_eval
# and appended in file order.
indices_list = []
for embed in embed_types:
  with open('indices_'+dataset_name+'_'+embed+'_variance.txt') as fh:
    indices_list.extend(ast.literal_eval(row) for row in fh)

In [None]:
# Evaluate the variance-thresholding selections: for each embedding's index
# list, train on its first `c` entries for every `c` in counts and record test
# accuracy / weighted F1 under '<embedding>_var'.
# The i-th list is attributed to the i-th embedding, so the two sequences must
# line up one-to-one; fail loudly instead of mislabelling results or hitting
# an IndexError part-way through.
assert len(indices_list) == len(embed_types), (
    'expected one index list per embedding, got %d lists for %d embeddings'
    % (len(indices_list), len(embed_types)))
for embed, lst in zip(embed_types, indices_list):
  # Fresh lists, assigned at the end, keep the cell idempotent on re-run.
  acc_scores, f1_scores = [], []
  for c in counts:
    subset = lst[:c]
    mnb = MultinomialNB(alpha=1e-3)
    mnb.fit(X_train[subset], y_train[subset])
    y_pred = mnb.predict(X_test)  # predict once and reuse for both metrics
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
  acc_dict[embed+'_var'] = acc_scores
  f1_dict[embed+'_var'] = f1_scores

## Greedy farthest points based on KL Divergence

In [13]:
# Gather the KL-divergence (greedy farthest point) index lists for all
# embeddings into a single flat list: every line of every file is parsed with
# ast.literal_eval and appended in file order.
indices_list = []
for embed in embed_types:
  with open('indices_'+dataset_name+'_'+embed+'_kld.txt') as fh:
    indices_list.extend(ast.literal_eval(row) for row in fh)

In [14]:
# Evaluate the KL-divergence selections: for each embedding's index list,
# train on its first `c` entries for every `c` in counts and record test
# accuracy / weighted F1 under '<embedding>_kld'.
# The i-th list is attributed to the i-th embedding, so the two sequences must
# line up one-to-one; fail loudly instead of mislabelling results or hitting
# an IndexError part-way through.
assert len(indices_list) == len(embed_types), (
    'expected one index list per embedding, got %d lists for %d embeddings'
    % (len(indices_list), len(embed_types)))
for embed, lst in zip(embed_types, indices_list):
  # Fresh lists, assigned at the end, keep the cell idempotent on re-run.
  acc_scores, f1_scores = [], []
  for c in counts:
    subset = lst[:c]
    mnb = MultinomialNB(alpha=1e-3)
    mnb.fit(X_train[subset], y_train[subset])
    y_pred = mnb.predict(X_test)  # predict once and reuse for both metrics
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
  acc_dict[embed+'_kld'] = acc_scores
  f1_dict[embed+'_kld'] = f1_scores

## Reconstruction Loss Minimization

In [None]:
# Load one reconstruction-loss index array per embedding, converting each
# loaded array to a plain Python list; entries follow the order of embed_types.
indices_list = []
for embed in embed_types:
  loaded = np.load('indices_'+dataset_name+'_'+embed+'_recon.npy')
  indices_list.append(list(loaded))

In [None]:
# Evaluate the reconstruction-loss selections: for each embedding's index
# list, train on its first `c` entries for every `c` in counts and record test
# accuracy / weighted F1 under '<embedding>_recon'.
# The i-th list is attributed to the i-th embedding, so the two sequences must
# line up one-to-one; this also catches a stale indices_list left over from a
# different section of the notebook.
assert len(indices_list) == len(embed_types), (
    'expected one index list per embedding, got %d lists for %d embeddings'
    % (len(indices_list), len(embed_types)))
for embed, lst in zip(embed_types, indices_list):
  # Fresh lists, assigned at the end, keep the cell idempotent on re-run.
  acc_scores, f1_scores = [], []
  for c in counts:
    subset = lst[:c]
    mnb = MultinomialNB(alpha=1e-3)
    mnb.fit(X_train[subset], y_train[subset])
    y_pred = mnb.predict(X_test)  # predict once and reuse for both metrics
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
  acc_dict[embed+'_recon'] = acc_scores
  f1_dict[embed+'_recon'] = f1_scores

## Taddy

In [None]:
# Evaluate the Taddy selections: for each embedding, load its saved entries,
# train one classifier per entry and record test accuracy / weighted F1 under
# '<embedding>_taddy'.
for embed in embed_types:
  # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects;
  # only load index files that come from a trusted source.
  indices_list = np.load('indices_'+dataset_name+'_'+embed+'_taddy.npy', allow_pickle=True)
  # Fresh lists, assigned at the end, keep the cell idempotent on re-run.
  acc_scores, f1_scores = [], []
  for entry in indices_list:
    subset = entry[0]  # element 0 of each entry is what indexes the training rows
    mnb = MultinomialNB(alpha=1e-3)
    mnb.fit(X_train[subset], y_train[subset])
    y_pred = mnb.predict(X_test)  # predict once and reuse for both metrics
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
  acc_dict[embed+'_taddy'] = acc_scores
  f1_dict[embed+'_taddy'] = f1_scores

## Taddy Topics

In [None]:
# Evaluate the topic-based Taddy selections: for each topic setting, load its
# saved entries, train one classifier per entry and record test accuracy /
# weighted F1 under the setting's own key ('12topics' / '24topics').
for embed in ['12topics', '24topics']:
  # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary objects;
  # only load index files that come from a trusted source.
  indices_list = np.load('indices_'+dataset_name+'_'+embed+'.npy', allow_pickle=True)
  # Fresh lists, assigned at the end, keep the cell idempotent on re-run.
  acc_scores, f1_scores = [], []
  for entry in indices_list:
    subset = entry[0]  # element 0 of each entry is what indexes the training rows
    mnb = MultinomialNB(alpha=1e-3)
    mnb.fit(X_train[subset], y_train[subset])
    y_pred = mnb.predict(X_test)  # predict once and reuse for both metrics
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
  acc_dict[embed] = acc_scores
  f1_dict[embed] = f1_scores

## Save dicts to csv

In [21]:
# Show the F1 results with one column per key. Building the frame row-wise
# (orient='index') lets score lists of different lengths coexist, padded with
# NaN; the transpose then turns the keys into columns.
pd.DataFrame.from_dict(f1_dict, orient='index').T

Unnamed: 0,random_noembed,12topics,24topics,roberta_kmeans,distil_kmeans,glove6B_kmeans,universal_kmeans,tsne10_kmeans,roberta_var,distil_var,glove6B_var,universal_var,tsne10_var,roberta_kld,distil_kld,glove6B_kld,universal_kld,tsne10_kld,roberta_taddy,distil_taddy,glove6B_taddy,universal_taddy,tsne10_taddy,roberta_recon,distil_recon,glove6B_recon,universal_recon,tsne10_recon
0,,,,,,,,,,,,,,0.145662,0.178153,0.105168,0.178609,0.231055,,,,,,,,,,
1,,,,,,,,,,,,,,0.302574,0.35688,0.286446,0.390882,0.413088,,,,,,,,,,
2,,,,,,,,,,,,,,0.431728,0.452104,0.390903,0.487374,0.49783,,,,,,,,,,
3,,,,,,,,,,,,,,0.479945,0.499449,0.461836,0.52417,0.530953,,,,,,,,,,
4,,,,,,,,,,,,,,0.545122,0.552253,0.527772,0.574914,0.569572,,,,,,,,,,
5,,,,,,,,,,,,,,0.575209,0.577159,0.580288,0.598683,0.601638,,,,,,,,,,
6,,,,,,,,,,,,,,0.599923,0.606542,0.596276,0.627283,0.61286,,,,,,,,,,
7,,,,,,,,,,,,,,0.622973,0.634115,0.632127,0.658111,0.630368,,,,,,,,,,
8,,,,,,,,,,,,,,0.633425,0.635681,0.651277,0.67599,0.653979,,,,,,,,,,


In [20]:
# Write the accuracy and F1 tables to CSV with one column per result key.
# Building each frame row-wise (orient='index') and transposing lets score
# lists of different lengths coexist, padded with NaN.
# Output names are derived from dataset_name (previously hard-coded to 'news')
# so switching datasets cannot overwrite another dataset's results.
pd.DataFrame.from_dict(acc_dict, orient='index').transpose().to_csv(dataset_name+'_acc_m.csv')
pd.DataFrame.from_dict(f1_dict, orient='index').transpose().to_csv(dataset_name+'_f1_m.csv')