In [None]:
!unzip "/content/drive/MyDrive/datasets/StumbleUpon/test.tsv.zip"
!unzip "/content/drive/MyDrive/datasets/StumbleUpon/train.tsv.zip"
#!unzip "/content/drive/MyDrive/datasets/glove.42B.300d.zip"

In [None]:
!pip3 install tldextract
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer
from random import *
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import tldextract
import torch
import json
import torch.nn as nn
from torch.optim import SGD
from torch.nn import Linear,Sigmoid
import torch.utils.data as D
import torchsummary
import torchtext
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

In [56]:
stopwords = []
f = open("/content/drive/MyDrive/datasets/stopwords.txt","r")
for word in f.readlines():
  stopwords.append(word.strip())

In [57]:
#confusion matrix
def plot_cm(y_test, y_pred):
  cf_matrix = metrics.confusion_matrix(y_test, y_pred)
  group_names = ["True Neg","False Pos","False Neg","True Pos"]
  group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
  group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
  labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
  labels = np.asarray(labels).reshape(2,2)
  sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='Blues')

In [58]:
#replace category with relative frequency
def replace_word(df,col_names):
  for col_name in col_names:
    rel = {}
    for i in df[col_name].unique():
      df_t = df[df[col_name]==i]
      rel[i] = len(df_t[df_t["label"]==1])/len(df_t)
    df[col_name] = df[col_name].apply(lambda x:rel[x]).astype('float64')

#preprocessing
def get_df(name,sep='\t'):
  df = pd.read_csv(name,sep='\t')
  domainExt = lambda x: pd.Series([tldextract.extract(x["url"]).domain,tldextract.extract(x["url"]).suffix])
  df[['domain', 'suffix']] = df.apply(domainExt, axis=1)
  textExt = lambda x: pd.Series([json.loads(x["boilerplate"]).get("url"),json.loads(x["boilerplate"]).get("title"),json.loads(x["boilerplate"]).get("body")])
  df[['urlText','title', 'body']] = df.apply(textExt, axis=1)
  df.fillna("",inplace=True)
  df["urlText"] = df["urlText"].apply(lambda x: ' '.join([lemmatizer.lemmatize(z) for z in simple_preprocess(x)]))
  df["title"] = df["title"].apply(lambda x: ' '.join([lemmatizer.lemmatize(z) for z in simple_preprocess(x)]))
  df["body"] = df["body"].apply(lambda x: ' '.join([lemmatizer.lemmatize(z) for z in simple_preprocess(x)]))
  df['text'] = df.apply(lambda x:" ".join([x['urlText'],x['urlText'],x['title'],x['title'],x['title'],x['body']]),axis=1)
  df["alchemy_category_score"]= df["alchemy_category_score"].replace('?',0.5).astype('float64')
  df["is_news"]= df["is_news"].replace('?',0).astype('int')
  df["news_front_page"]= df["news_front_page"].replace('?',0).astype('int')
  df["alchemy_category"]= df["alchemy_category"].replace('?',"unknown")
  df.drop(columns=["url", "urlid","boilerplate","framebased"],inplace=True)
  #replace_word(df,["suffix","domain","alchemy_category"])
  return df


In [59]:
#for creating imbalanced test set set unequal values.
split_pos = 0.1 #test size for positive
split_neg = 0.1 #test size for negative

df_main = get_df("./train.tsv")
df_pos = df_main[df_main["label"]==1]
df_neg = df_main[df_main["label"]==0]

df_pos_train,df_pos_test = train_test_split(df_pos, test_size=split_pos)
df_neg_train,df_neg_test = train_test_split(df_neg, test_size=split_neg)

df_train = pd.concat([df_pos_train, df_neg_train], ignore_index=True)
df_test = pd.concat([df_pos_test, df_neg_test], ignore_index=True)

del df_pos,df_neg,df_pos_train,df_neg_train,df_pos_test,df_neg_test

In [None]:
#tfidf vectors used in logistic regression.
urlTFIDF = TfidfVectorizer(use_idf=True,min_df = 5)
urlTFIDF.fit(df_main["urlText"])
titleTFIDF = TfidfVectorizer(use_idf=True,min_df = 5)
titleTFIDF.fit(df_main["title"])
bodyTFIDF = TfidfVectorizer(use_idf=True,min_df = 5)
bodyTFIDF.fit(df_main["body"])

pipe = Pipeline(steps=[('scale', StandardScaler()),('pca', PCA(n_components=0.80))])
pipe.fit(df_main.drop(columns=["label"]).select_dtypes('number').to_numpy())

In [61]:
def get_data(df):
  x1 = pipe.transform(df.drop(columns=["label"]).select_dtypes('number').to_numpy())
  x2 = urlTFIDF.transform(df["urlText"])
  x3 = titleTFIDF.transform(df["title"])
  x4 = bodyTFIDF.transform(df["body"])
  x5 = pd.get_dummies(df_main.alchemy_category, prefix='category',dtype='float').to_numpy()
  y = df["label"].to_numpy()
  return x1,x2,x3,x4,x5,y

In [62]:
def pcaPlot(x):
  pca = PCA()
  pca.fit(x)
  cumsum = np.cumsum(pca.explained_variance_ratio_)*100
  d = [n for n in range(len(cumsum))]
  plt.figure(figsize=(10, 10))
  plt.plot(d,cumsum, color = 'red',label='cumulative explained variance')
  plt.title('Cumulative Explained Variance as a Function of the Number of Components')
  plt.ylabel('Cumulative Explained variance')
  plt.xlabel('Principal components')
  plt.axhline(y = 95, color='k', linestyle='--', label = '95% Explained Variance')
  plt.legend(loc='best')


In [63]:
#sparse matrix to pytorch tensor
def csrToTorch(x):
  coo = x.tocoo()
  values = coo.data
  indices = np.vstack((coo.row, coo.col))
  i = torch.LongTensor(indices)
  v = torch.FloatTensor(values)
  shape = coo.shape
  #print(shape)
  return torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense()

In [64]:
device = 'cpu'
class LogisticLayer(nn.Module):
    def __init__(self, n_inputs):
        super(LogisticLayer, self).__init__()
        self.hidden1 = Linear(n_inputs, 1)
        #self.hidden2 = Linear(3, 1)
        self.act = Sigmoid()

    def forward(self, X):
        X = self.hidden1(X)
        #X = self.hidden2(X)
        X = self.act(X)
        return X

class EnsembleLogistics(nn.Module):
    def __init__(self,input_x1,input_x2,input_x3,input_x4,input_x5):
        super(EnsembleLogistics, self).__init__()

        self.m1 = LogisticLayer(input_x1)
        self.m2 = LogisticLayer(input_x2)
        self.m3 = LogisticLayer(input_x3)
        self.m4 = LogisticLayer(input_x4)
        self.m5 = LogisticLayer(input_x5)
        self.m0 = LogisticLayer(5)

        
    def forward(self, x1, x2, x3,x4,x5):
        x1 = torch.flatten(self.m1(x1))
        x2 = torch.flatten(self.m2(x2))
        x3 = torch.flatten(self.m3(x3))
        x4 = torch.flatten(self.m4(x4))
        x5 = torch.flatten(self.m5(x5))#.reshape(-1,1)
        x = torch.stack((x1, x2, x3,x4,x5),1)
        #print(x)
        #print(x5)
        #x = x5*x
        #print(x)
        x = torch.flatten(self.m0(x))
        return x


class EnsembleDatasets(D.Dataset):
    
    def __init__(self, x1,x2_csr,x3_csr,x4_csr,x5, label, device='cpu'):
        self.device = torch.device(device)
        self.x1 = torch.tensor(x1,dtype = torch.float32)
        self.x2_csr = x2_csr
        self.x3_csr = x3_csr
        self.x4_csr = x4_csr
        self.x5 = torch.tensor(x5,dtype = torch.float32)
        self.label = torch.tensor(label, dtype = torch.float32, device=self.device)
    
    def __len__(self):
        return self.label.shape[0]
    
    def __getitem__(self, index):
        x1 = self.x1[index]
        x2 = csrToTorch(self.x2_csr[index])
        x3 = csrToTorch(self.x3_csr[index])
        x4 = csrToTorch(self.x4_csr[index])
        x5 = self.x5[index]
        return x1,x2,x3,x4,x5,self.label[index]

In [65]:
def train_model(model,x1,x2_csr,x3_csr,x4_csr,x5, label,batchsize,epochs):
    data_tr = EnsembleDatasets(x1,x2_csr,x3_csr,x4_csr,x5,label, device)
    criterion = nn.BCELoss()
    optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
    train_ldr = D.DataLoader(dataset=data_tr, batch_size=batchsize, shuffle=True)

    for epoch in range(epochs):
      print(f"epoch: {epoch}")
      for x1,x2,x3,x4,x5,label in train_ldr:
        optimizer.zero_grad()
        yhat = model(x1,x2,x3,x4,x5)
        loss = criterion(yhat,label)
        loss.backward()
        optimizer.step()
      #yhat = (yhat>0.5).float()
      #correct = (yhat == label).float().sum()
      #print("Accuracy: {:.3f}".format(correct/label.shape[0]))


In [None]:
x1,x2,x3,x4,x5,label = get_data(df_train)
model = EnsembleLogistics(x1.shape[1],x2.shape[1],x3.shape[1],x4.shape[1],x5.shape[1])
train_model(model,x1,x2,x3,x4,x5,label,5,20)

In [69]:
#test
x1,x2,x3,x4,x5,label = get_data(df_test)
data_ldr  = D.DataLoader(dataset=EnsembleDatasets(x1,x2,x3,x4,x5,label, 'cpu'),batch_size=len(df_train))
for x1,x2,x3,x4,x5,label in data_ldr:
      yhat = model(x1,x2,x3,x4,x5)
      yhat = (yhat>0.5).float()
      correct = (yhat == label).float().sum()
      print(yhat.shape[0])
      print("Accuracy: {:.3f}".format(correct/label.shape[0]))

740
Accuracy: 0.818


--------------------------------------------------------------------------------

In [None]:
wordToIdx= {}
embedding_matrix = torch.zeros(size=(1917495,300))
i = 0

with open("./glove.42B.300d.txt", 'r') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        vector = torch.from_numpy(np.asarray(values[1:], "float32"))
        embedding_matrix[i]=vector
        wordToIdx[word] = i
        i+=1

In [None]:
embedding = nn.Embedding.from_pretrained(embedding_matrix)

In [None]:
input = torch.LongTensor([1000])
embedding(input)

In [None]:
list(embeddings.keys())[:10]

In [None]:
class tfidf:
  def __init__(self):
    self.tfidf_vectorizer=TfidfVectorizer(use_idf=True,min_df=3, max_features=None, strip_accents='unicode',  
                        analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 2), 
                        smooth_idf=1,sublinear_tf=1,
                        #stop_words=stopwords.words('english'),
                        )

  def fit(self,docs):
    self.tfidf_vectorizer.fit(docs)
    self.vocab = self.tfidf_vectorizer.get_feature_names()
  def get_keywords(self,docs,count):
    tfidf_vectorizer_vectors = self.tfidf_vectorizer.transform(docs)
    keywords = []
    for vec in tfidf_vectorizer_vectors:
      kws = []
      for i,score in enumerate(list(np.squeeze(np.asarray(vec.T.todense())))):
        if(score>0.0 and self.vocab[i] not in stopwords):
          kws.append([score,self.vocab[i]])
      kws.sort(key = lambda x: x[0],reverse=True) 
      keywords.append([x[1] for x in kws[:count]])
    return keywords