In [None]:
!unzip /content/drive/MyDrive/datasets/glove.zip -d /content/

In [None]:
!apt install libomp-dev
!pip install faiss
!pip install faiss-gpu
import faiss  

In [None]:
!pip install --upgrade xgboost
from xgboost import XGBClassifier

In [None]:
import pandas as pd
import numpy as np
import re
import sys
import json
import csv
import gc
from collections import Counter
import glob
import os.path
import time
import nltk
from random import *
import ast
import os
import matplotlib.pyplot as plt
import math
from google.colab import drive
import requests_oauthlib
from scipy.spatial import distance
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras import datasets, layers, models
import tensorflow as tf
import seaborn as sns

In [None]:
from google.colab import drive
# Mount Google Drive; the dataset paths used below live under /content/drive.
drive.mount('/content/drive')
#drive.flush_and_unmount()
# Relax pandas' display limits so long keyword lists are shown in full.
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 1000)
# None means "never truncate cell contents"; the old -1 sentinel is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Per-word frequency table with (at least) the columns word, pol_freq, total_freq.
df = pd.read_csv("/content/drive/MyDrive/datasets/keywords_freq.csv")
# rel = (pol_freq / sum(pol_freq)) relative to (total_freq / sum(total_freq)),
# written with the same multiply-then-divide order as before.
total_freq_sum = df["total_freq"].sum()
pol_freq_sum = df["pol_freq"].sum()
df["rel"] = (df["pol_freq"]*total_freq_sum)/(df["total_freq"]*pol_freq_sum)

In [None]:
# word -> total_freq lookup used by get_most_common.
wfd = dict(zip(df["word"], df["total_freq"]))
def get_most_common(x, freqs=None):  #get most frequent word from a phrase.
  """Return the word of phrase `x` that has the highest frequency.

  Parameters
  ----------
  x : str
      Keyword phrase; it is split on whitespace.
  freqs : mapping, optional
      Word -> frequency lookup. Defaults to the module-level `wfd` table.

  Returns
  -------
  str
      `x` unchanged when it is shorter than two characters or contains no
      usable word; otherwise the usable word with the largest frequency.
      Ties, and phrases whose words all have zero/unknown frequency, resolve
      to the earliest usable word.
  """
  if freqs is None:
    freqs = wfd
  if len(x)<2:
    return x
  best_word = None
  best_freq = 0
  for word in x.split():
    # The literal tokens "null" and "nan" are never candidates.
    if word in ("null","nan"):
      continue
    if best_word is None:
      # Fallback is the first usable *word* (not the first character of x).
      best_word = word
    # Words missing from the table count as frequency 0 instead of raising.
    word_freq = freqs.get(word, 0)
    if word_freq > best_freq:
      best_word = word
      best_freq = word_freq
  return x if best_word is None else best_word

def split_words(lst):
  """Map every phrase in `lst` to the single word picked by `get_most_common`.

  Returns a new list with one entry per input phrase, in the same order.
  """
  return [get_most_common(phrase) for phrase in lst]

In [None]:
shuffle_seed = 0 # fixed so the shuffled row order is identical on every run
df_main = pd.read_csv("/content/drive/MyDrive/datasets/keywords.csv")
# The keywords and scores columns are parsed with ast.literal_eval (the CSV
# holds them as text); each keyword phrase is then reduced to one word.
df_main["keywords"] = df_main["keywords"].apply(ast.literal_eval).apply(split_words)
df_main["scores"] = df_main["scores"].apply(ast.literal_eval)
# Shuffle all rows; the seed makes this step deterministic for a given CSV.
df_main = df_main.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

In [None]:
split_pos = 0.01 #imbalance test set
split_neg = 0.5
split_seed = 0 # fixed so a given df_main ordering always yields the same split

# Drop the excluded subreddits, then split each class on its own so the
# held-out fraction can differ between label 1 and label 0 rows.
df_main = df_main[~df_main["subreddit"].isin(("badeconomics","askeconomics", "economics","politicalcompassMemes"))]
df_pos = df_main[df_main["label"]==1]
df_neg = df_main[df_main["label"]==0]

df_pos_train,df_pos_test = train_test_split(df_pos, test_size=split_pos, random_state=split_seed)
df_neg_train,df_neg_test = train_test_split(df_neg, test_size=split_neg, random_state=split_seed)

df_train = pd.concat([df_pos_train, df_neg_train], ignore_index=True)
df_test = pd.concat([df_pos_test, df_neg_test], ignore_index=True)

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to each report.
for split_name, split_frame in (
    ("df_pos_train", df_pos_train),
    ("df_pos_test", df_pos_test),
    ("df_neg_train", df_neg_train),
    ("df_neg_test", df_neg_test),
):
  print(f"--- {split_name} ---")
  split_frame.info()

In [None]:
# word -> rel lookup used when turning keywords into features.
rel = dict(zip(df["word"], df["rel"]))

def get_params(x,score,rel_table=None):
  """Return the feature list for one keyword: `[rel value]`, or `[0.0]` if unknown.

  Parameters
  ----------
  x : hashable
      Keyword to look up.
  score : any
      Accepted for interface compatibility; not used by the current features.
  rel_table : mapping, optional
      Keyword -> value lookup. Defaults to the module-level `rel` table.
  """
  table = rel if rel_table is None else rel_table
  try:
    ret = [table[x]] #[0 if rel[x]<1.5 else 1]
  except (KeyError, TypeError):
    # Unknown (or unhashable) keyword: contribute a neutral 0.0 feature.
    ret = [0.0]
  return ret

def get_x(xs,scores,n_features=10,rel_table=None):
  """Build a fixed-width feature vector from a row's keywords.

  The per-keyword values are sorted in descending order, truncated to the
  `n_features` largest and zero-padded on the right, so the result always has
  shape `(n_features,)` and rows can be stacked into a matrix.

  Parameters
  ----------
  xs : sequence
      Keywords of one row.
  scores : sequence
      Per-keyword scores, forwarded to `get_params`; may be shorter than `xs`
      (missing entries are passed as None).
  n_features : int, optional
      Width of the returned vector (default 10).
  rel_table : mapping, optional
      Forwarded to `get_params`; defaults to the module-level `rel` table.
  """
  values = []
  for position, keyword in enumerate(xs):
    score = scores[position] if position < len(scores) else None
    values.extend(get_params(keyword, score, rel_table))
  # Largest values first; keep at most n_features so the width never exceeds it.
  ordered = np.sort(np.asarray(values, dtype=float))[::-1][:n_features]
  return np.pad(ordered, (0, n_features-len(ordered)), 'constant') #[np.sum(ret_val)/10]

In [None]:
# One get_x feature vector per row, stacked into a 2-D matrix; labels as int arrays.
x_train = np.stack([
    get_x(row_keywords, row_scores)
    for row_keywords, row_scores in zip(df_train["keywords"], df_train["scores"])
])
y_train = np.array(df_train["label"].astype(int))

x_test = np.stack([
    get_x(row_keywords, row_scores)
    for row_keywords, row_scores in zip(df_test["keywords"], df_test["scores"])
])
y_test = np.array(df_test["label"].astype(int))

In [None]:
def plot_cm(cf_matrix):
  """Draw a 2x2 confusion matrix as an annotated seaborn heatmap.

  Each cell is annotated with a name, its count and its share of all samples.
  The names are assigned in row-major order: True Neg, False Pos, False Neg,
  True Pos.
  """
  cell_names = ["True Neg","False Pos","False Neg","True Pos"]
  cell_values = cf_matrix.flatten()
  cell_shares = cell_values/np.sum(cf_matrix)
  annotations = []
  for cell_name, cell_value, cell_share in zip(cell_names, cell_values, cell_shares):
    count_text = "{0:0.0f}".format(cell_value)
    share_text = "{0:.2%}".format(cell_share)
    annotations.append(f"{cell_name}\n{count_text}\n{share_text}")
  annotation_grid = np.asarray(annotations).reshape(2,2)
  sns.heatmap(cf_matrix, annot=annotation_grid, fmt='', cmap='Blues')

Logistic Regression

In [None]:
# Logistic-regression baseline: fit, predict once, then report accuracy and
# the confusion matrix from those same predictions.
lr = LogisticRegression(random_state=0,max_iter=200)
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
print(f"Prediction Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
cm = metrics.confusion_matrix(y_test, y_pred)
plot_cm(cm)

In [None]:
# Inspect false negatives (predicted 0, actually 1) of the latest y_pred.
wrong = df_test[(y_pred==0) & (y_test == 1)].keywords
# Never ask for more rows than exist; sample() raises otherwise.
n_show = min(28, len(wrong))
for keywords in wrong.sample(n=n_show):
  # Pair every keyword with its OWN lookup value. Zipping the keywords with the
  # get_x vector would mis-pair them, because get_x sorts its values.
  print({keyword: get_params(keyword, None)[0] for keyword in keywords})

KNN

In [None]:
class FaissKNeighbors:
    """k-nearest-neighbour classifier backed by an exact Faiss L2 index.

    Prediction is a majority vote over the labels of the `k` nearest training
    vectors; ties resolve to the smallest label (np.argmax over np.bincount).
    Labels must be non-negative integers, as required by np.bincount.
    """

    def __init__(self, k=5):
        self.index = None  # Faiss index, created in fit()
        self.y = None      # training labels, aligned with the indexed vectors
        self.k = k         # number of neighbours that vote

    def fit(self, X, y):
        """Index the training vectors `X` (n_samples, n_features) with labels `y`.

        Raises ValueError if `X` and `y` have different lengths. Returns self.
        """
        # Faiss expects C-contiguous float32 input.
        vectors = np.ascontiguousarray(X, dtype=np.float32)
        labels = np.asarray(y)
        if len(labels) != len(vectors):
            raise ValueError("X and y must contain the same number of samples")
        self.index = faiss.IndexFlatL2(vectors.shape[1])
        self.index.add(vectors)
        self.y = labels
        return self

    def predict(self, X):
        """Return the majority-vote label for every row of `X`."""
        queries = np.ascontiguousarray(X, dtype=np.float32)
        distances, indices = self.index.search(queries, k=self.k)
        labels = np.asarray(self.y)
        predictions = []
        for neighbour_ids in indices:
            # Faiss pads with -1 when fewer than k vectors are indexed; without
            # this mask, -1 would silently vote with the last training label.
            valid_ids = neighbour_ids[neighbour_ids >= 0]
            predictions.append(np.argmax(np.bincount(labels[valid_ids])))
        return np.array(predictions)

In [None]:
# Build the Faiss-backed k-NN classifier (k=5) and index the training features.
fknn = FaissKNeighbors(k=5)
fknn.fit(x_train,y_train)

In [None]:
# Evaluate the k-NN classifier: accuracy first, then the confusion matrix.
y_pred = fknn.predict(x_test)
print(metrics.accuracy_score(y_test,y_pred))
cm = metrics.confusion_matrix(y_test, y_pred)
plot_cm(cm)

XGBoost

In [None]:
# Gradient-boosted trees: time training and prediction separately.
xgb = XGBClassifier(n_estimators=100)

training_start = time.perf_counter()
xgb.fit(x_train, y_train)
training_end = time.perf_counter()
xgb_train_time = training_end-training_start

prediction_start = time.perf_counter()
y_pred = xgb.predict(x_test)
prediction_end = time.perf_counter()
xgb_prediction_time = prediction_end-prediction_start

# Accuracy in percent: share of predictions equal to the true label.
n_correct = (y_pred == y_test).sum().astype(float)
acc_xgb = n_correct / len(y_pred)*100

print(f"XGBoost's prediction accuracy is: {acc_xgb}")
print(f"Time consumed for training: {xgb_train_time}")
print(f"Time consumed for prediction: {xgb_prediction_time}")

Random Forest

In [None]:
# Random forest: 50 trees, depth capped at 40.
clf = RandomForestClassifier(n_estimators=50, max_depth=40)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
rf_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", rf_accuracy)

Simple Neural Network

In [None]:
# Small feed-forward binary classifier.
# Dense layers default to activation=None (linear); without a non-linearity the
# hidden layers collapse into a single linear map, so ReLU is set explicitly.
model = Sequential()
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# Train briefly, then report test accuracy (second value returned by evaluate,
# after the loss).
model.fit(x_train, y_train, epochs=3, batch_size=10, verbose=1)
accuracy = model.evaluate(x_test, y_test)[1]
print('Accuracy: %.2f' % (accuracy*100))

Using Embeddings

In [None]:
# Load the GloVe vectors into a {token: float32 vector} dict.
embedding_dim = 300  # vector width of glove.840B.300d
embeddings= {}
with open("/content/glove/glove.840B.300d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        # Split from the right: a few tokens in this file contain spaces
        # themselves, so a plain split(' ') would push pieces of the token into
        # the numeric part and fail the float conversion.
        values = line.rstrip().rsplit(' ', embedding_dim)
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings[word]=vector

In [None]:
def get_embedding(x, vectors=None, dim=300):
  """Return the embedding vector for token `x`, or a zero vector if unknown.

  Parameters
  ----------
  x : hashable
      Token to look up.
  vectors : mapping, optional
      Token -> vector lookup. Defaults to the module-level `embeddings` dict.
  dim : int, optional
      Length of the zero vector returned for unknown tokens (default 300).

  Returns
  -------
  numpy.ndarray
      The stored vector, or `np.zeros(dim, dtype="float32")`. The fallback is
      float32 so it matches the stored vectors and can be passed to Faiss.
  """
  table = embeddings if vectors is None else vectors
  try:
    return table[x]
  except (KeyError, TypeError):
    # Unknown (or unhashable) token: fall back to the zero vector.
    return np.zeros(shape=(dim,), dtype="float32")

In [None]:
# Row i of embeddings_ds is the embedding of the i-th word in df["word"];
# the two dicts translate between words and those row numbers.
vocabulary = list(df["word"])
word_to_ind = {vocab_word: row for row, vocab_word in enumerate(vocabulary)}
ind_to_word = dict(enumerate(vocabulary))
embeddings_ds = np.array(
    [get_embedding(vocab_word) for vocab_word in vocabulary],
    dtype="float32",
)

In [None]:
# Peek at the first ten embedding rows.
embeddings_ds[:10]

In [None]:
# Exact L2 index over the 300-dimensional vocabulary embeddings; ntotal is the
# number of vectors stored in the index.
index = faiss.IndexFlatL2(300)  
index.add(embeddings_ds)
print(index.ntotal)

In [None]:
def get_nearest(x):
  """Return the vocabulary word whose embedding is closest (L2) to that of `x`.

  Serves as a sanity check of the index. Looks `x` up with `get_embedding`,
  searches `index` for the single nearest neighbour and maps its row number
  back to a word through `ind_to_word`.
  """
  # Faiss searches a 2-D float32 batch, so wrap the single vector in a list.
  query = np.array([get_embedding(x)], dtype="float32")
  D, I = index.search(query, 1) # sanity check
  return ind_to_word[I[0][0]]