In [None]:
# !pip install stanford-corenlp
# !pip install stanfordnlp
# !pip install pytorch-transformers
# !pip install transformers

import numpy as np
import pandas as pd
import random
import pickle
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
import os,sys,time
import corenlp
import stanfordnlp
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification,BertTokenizer, BertModel, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Make the project-local modules under ./scripts importable; the `event_dataset`
# and `util_sentence_classifier` imports right below depend on this.
sys.path.append('./scripts/') 
# Download the English StanfordNLP resources.
# NOTE(review): this is a top-level call, so it is issued every time the cell
# runs — confirm whether it re-downloads or prompts when the models already exist.
stanfordnlp.download('en')

from event_dataset import EventReader, SentenceReader, Parser
import util_sentence_classifier as util_sent_cf

In [None]:
# Experiment configuration: data locations, model hyper-parameters and run switches.
args = {
    # data locations
    "data_dir": "./data",
    "train_file": "train.ids",
    "dev_file": "dev.ids",
    "test_file": "test.ids",
    "emb_file": "./data/guten.vectors.txt",
    # model / optimisation hyper-parameters
    "batch_size": 16,
    "emb_size": 100,
    "hidden_size": 100,
    "dropout": 0.5,
    "num_epochs": 100,
    "learning_rate": 0.001,
    "bidir": True,
    "seed": 0,
    # run switches
    "do_train": True,
    "do_eval": True,
    "model": "word",
    "save_path": None,
    "suffix": None,
    "num_layers": 3,
    "oov_vocab": None,
}

# Seed every random number generator the notebook relies on.
random.seed(args["seed"])
torch.manual_seed(args["seed"])
np.random.seed(args["seed"])

use_cuda = torch.cuda.is_available()

reader = EventReader()
parser = Parser()

train_sentences, train_events = reader.read_events(args["data_dir"], args["train_file"])
# NOTE(review): the "dev" split below is read from args["test_file"], so every
# later "dev" metric (and the checkpoint selection in the fine-tuning loop) is
# computed on the test data. The original dev line is kept commented out —
# confirm this substitution is intentional before reporting numbers.
# dev_sentences, dev_events = reader.read_events(args["data_dir"], args["dev_file"])
dev_sentences, dev_events = reader.read_events(args["data_dir"], args["test_file"])
test_sentences, test_events = reader.read_events(args["data_dir"], args["test_file"])

# Parse each split, in train / dev / test order.
train_parse, dev_parse, test_parse = (
    parser.parse_sequences(split_sentences)
    for split_sentences in (train_sentences, dev_sentences, test_sentences)
)

In [None]:
# Pair the sentences of each split with their events (train, dev, test order).
train_, dev_, test_ = (
    util_sent_cf.get_sentences_events(split_sentences, split_events)
    for split_sentences, split_events in (
        (train_sentences, train_events),
        (dev_sentences, dev_events),
        (test_sentences, test_events),
    )
)

# Tokenise each split and wrap it in a dataloader.
# NOTE(review): the meaning of the second argument (2) is defined inside
# util_sentence_classifier and is not visible here — confirm.
train_dataloader, dev_dataloader, test_dataloader = (
    util_sent_cf.tokenize_sentences_make_dataloader(split_frame, 2)
    for split_frame in (train_, dev_, test_)
)

In [None]:
def bert_preprocess_data(data,tokenizer,model):
    """Encode every sentence of ``data`` into one vector using ``model``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose ``sentence`` and ``label`` columns. Rows are processed in
        their stored order, whatever the index labels are.
    tokenizer : callable
        Called as ``tokenizer(sentence, return_tensors="pt", truncation=True)``;
        must return a mapping that can be splatted into ``model``.
    model : callable (normally a ``torch.nn.Module``)
        Called as ``model(**inputs)``; the result must expose ``pooler_output``.

    Returns
    -------
    pandas.DataFrame
        Column ``sentence`` holds the first row of ``pooler_output`` as a numpy
        array, column ``label`` the matching label. The index is 0..n-1.
    """
    sentence_vectors = []
    sentence_labels = []

    # Feature extraction must be deterministic: temporarily leave training mode
    # (dropout etc.) if the model is in it, and restore the mode afterwards.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        # No gradients are needed, so do not build an autograd graph per sentence.
        with torch.no_grad():
            # Iterate positionally instead of via data.sentence[index], which is a
            # label lookup and fails when the index is not exactly 0..n-1.
            for sentence, label in zip(data.sentence, data.label):
                # truncation=True keeps over-long sentences within the model's
                # maximum input length instead of failing inside the model.
                inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
                outputs = model(**inputs)
                sentence_vectors.append(outputs.pooler_output.detach().cpu().numpy()[0])
                sentence_labels.append(label)
    finally:
        if was_training:
            model.train()

    df_data = pd.DataFrame({"sentence":sentence_vectors,"label":sentence_labels})
    return df_data

# Tokenizer and plain encoder used to turn sentences into embedding vectors.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model2 = RobertaModel.from_pretrained("roberta-base")

# Rebuild the 2-label classifier and restore the fine-tuned weights from the saved
# checkpoint (tensors are mapped onto the CPU).
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model = RobertaForSequenceClassification.from_pretrained("roberta-base",num_labels = 2)
model.load_state_dict(torch.load("./trained_model/model_sent_2_0.8705583756345178",map_location=torch.device('cpu'))["model_state_dict"])
# model.to("cpu")

# Comment out the next two lines to use RoBERTa without fine-tuning.
# Only the encoder and embeddings weights are copied from the fine-tuned classifier.
# NOTE(review): bert_preprocess_data reads `pooler_output`, and model2's pooler is
# not touched here — confirm the pooler weights that from_pretrained provides are
# meaningful for roberta-base (they may be freshly initialised).
model2.encoder.load_state_dict(model.roberta.encoder.state_dict())
model2.embeddings.load_state_dict(model.roberta.embeddings.state_dict())

# Embed every split once; the classifier cells below reuse these frames.
svm_train_data = bert_preprocess_data(train_,tokenizer,model2)
svm_dev_data = bert_preprocess_data(dev_,tokenizer,model2)
svm_test_data = bert_preprocess_data(test_,tokenizer,model2)

In [None]:
'''
tSNE plot
'''
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

# Stack the per-sentence embedding vectors into one (n_sentences, dim) matrix.
train_vectors = np.array(list(svm_train_data.sentence))
train_labels = np.array(svm_train_data.label)

X_plot = TSNE(n_components=2, learning_rate='auto',init='random').fit_transform(train_vectors)

# Colour the points by whether the embedding SVM misclassifies them.
# The classifier that works on these embeddings is `clf_svm` (fitted in the
# "SVM classifier" cell), not `clf`, which is the n-gram SVM trained on a
# different feature space. When `clf_svm` has not been fitted yet (fresh kernel,
# top-to-bottom run) fall back to colouring by gold label instead of failing.
embedding_clf = globals().get("clf_svm")
if embedding_clf is not None:
    y_pred = embedding_clf.predict(train_vectors)
    error = (y_pred != train_labels).astype(int)  # 1 = misclassified, 0 = correct
    hue_values = error
    hue_title = "misclassified by clf_svm"
else:
    hue_values = train_labels
    hue_title = "gold label"

plt.figure(figsize=(10, 10))
# x / y are passed by keyword: recent seaborn releases reject them positionally.
ax = sns.scatterplot(x=X_plot[:,0], y=X_plot[:,1], hue=hue_values)
ax.set_title("t-SNE of training sentence embeddings (hue: {})".format(hue_title))
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2");

In [None]:
'''
SVM classifier
'''
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import metrics

# RBF-kernel SVM on standardised sentence embeddings.
clf_svm = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", C=2, gamma=0.001),
)
clf_svm.fit(svm_train_data.sentence.tolist(), np.array(svm_train_data.label))

# Dev-set metrics, all reported as percentages.
y_pred = clf_svm.predict(svm_dev_data.sentence.tolist())
for metric_caption, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_caption, metric_fn(np.array(svm_dev_data.label), y_pred)*100)

# Number of support vectors kept by the fitted SVC step of the pipeline.
n_support_vectors = len(clf_svm.named_steps["svc"].support_vectors_)
print("No. of support vectors per data point : ", n_support_vectors/len(train_))
print("No. of support vectors : ", n_support_vectors)

In [None]:
# Export the dev sentences selected by `wrong_index` to a CSV file for manual inspection.
# NOTE(review): `wrong_index` is only assigned in the "Analysis to get intersection"
# cell further down, so on a fresh kernel this cell fails unless that cell ran first.
dataframe = pd.DataFrame(dev_.sentence[wrong_index],copy=True)
# Replace the original row labels with a fresh 0..n-1 index.
dataframe = dataframe.reset_index().drop(columns="index")
dataframe.to_csv("./out_all_1.csv")
# dataframe

In [None]:
# Show every exported sentence, each followed by blank space for readability.
for sentence_text in dataframe.sentence:
    print(sentence_text, "\n")

In [None]:
'''
Random Forest classifier
'''
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Depth-limited random forest on standardised sentence embeddings.
clf_rf = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(max_depth=15, random_state=1),
)
clf_rf.fit(svm_train_data.sentence.tolist(), np.array(svm_train_data.label))

# Dev-set metrics, all reported as percentages.
y_pred = clf_rf.predict(svm_dev_data.sentence.tolist())
for metric_caption, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_caption, metric_fn(np.array(svm_dev_data.label), y_pred)*100)

In [None]:
'''
XGBoost classifier
'''
import xgboost as xgb

# Gradient-boosted trees (16 estimators, logistic objective) on standardised embeddings.
clf_xg = make_pipeline(
    StandardScaler(),
    xgb.XGBClassifier(objective='binary:logistic', n_estimators=16),
)
clf_xg.fit(svm_train_data.sentence.tolist(), np.array(svm_train_data.label))

# Dev-set metrics, all reported as percentages.
y_pred = clf_xg.predict(svm_dev_data.sentence.tolist())
for metric_caption, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_caption, metric_fn(np.array(svm_dev_data.label), y_pred)*100)

In [None]:
'''
Grid Search
'''
# from sklearn.model_selection import GridSearchCV

# parameters = {'kernel':['rbf'], 'C':[2*x for x in range(1,15)],'gamma':[0.01*x for x in range(1,90)]}
# clf_grid = GridSearchCV(SVC(), parameters,verbose=3)
# clf_grid.fit(list(svm_train_data.sentence), np.array(svm_train_data.label))

In [None]:
'''
KNN classifier
'''
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours (k=8) fitted directly on the sentence embeddings
# (unlike the other classifier cells, no StandardScaler step is used here).
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(list(svm_train_data.sentence), np.array(svm_train_data.label))

y_pred = knn.predict(list(svm_dev_data.sentence))
# All three metrics are reported as percentages so they are on the same scale as
# the accuracy line and as the SVM / Random Forest / XGBoost cells.
print("Accuracy:",metrics.accuracy_score(np.array(svm_dev_data.label), y_pred)*100)
print("Precision:",metrics.precision_score(np.array(svm_dev_data.label), y_pred)*100)
print("Recall:",metrics.recall_score(np.array(svm_dev_data.label), y_pred)*100)

In [None]:
'''
linear classifier over Roberta embeddings
'''
from sklearn.linear_model import SGDClassifier

# Linear model trained with SGD on standardised sentence embeddings.
clf2 = make_pipeline(StandardScaler(),SGDClassifier(max_iter=1000, tol=1e-3))
clf2.fit(list(svm_train_data.sentence), np.array(svm_train_data.label))

y_pred = clf2.predict(list(svm_dev_data.sentence))

# All three metrics are reported as percentages so they are on the same scale as
# the accuracy line and as the SVM / Random Forest / XGBoost cells.
print("Accuracy:",metrics.accuracy_score(np.array(svm_dev_data.label), y_pred)*100)
print("Precision:",metrics.precision_score(np.array(svm_dev_data.label), y_pred)*100)
print("Recall:",metrics.recall_score(np.array(svm_dev_data.label), y_pred)*100)

In [None]:
'''
Analysis to get intersection
'''

# Dev-set predictions of every classifier trained on the RoBERTa embeddings,
# keyed by a short model name. Each model needs its own key: storing the KNN
# predictions under "rf" would silently replace the Random Forest entry and the
# intersection below would then cover four models instead of five.
dev_vectors = list(svm_dev_data.sentence)
y_pred = {}
y_pred["svm"] = clf_svm.predict(dev_vectors)
y_pred["rf"] = clf_rf.predict(dev_vectors)
y_pred["xg"] = clf_xg.predict(dev_vectors)
y_pred["knn"] = knn.predict(dev_vectors)
y_pred["linear"] = clf2.predict(dev_vectors)

# Collect the dev examples that EVERY classifier gets wrong, restricted to
# sentences that contain at least one event.
wrong_index = []
for i in range(len(svm_dev_data)):
    # Number of classifiers whose prediction disagrees with the gold label.
    wrong_count = np.sum([y_pred[key][i] != svm_dev_data.label[i] for key in y_pred])
    if(wrong_count==len(y_pred) and dev_.no_of_events[i]>0):
        wrong_index.append(i)
#             count += 1
#     print(j," : ",count,"...",actual_count," fraction = ",(count*100/actual_count))

In [None]:
'''
Roberta-sentence classifier (freeze / non-freeze transformer layers)
'''
# Set to True to freeze every parameter under `model.roberta`, so that only the
# parameters outside it are updated during training.
freeze_transformer_layers = False

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = RobertaForSequenceClassification.from_pretrained("roberta-base",num_labels = 2)

if(freeze_transformer_layers):
    for param in model.roberta.parameters():
        param.requires_grad = False

# Named `optimizer` rather than `optim`: the notebook imports `torch.optim as optim`,
# and rebinding that name here would clobber the module for every later cell.
optimizer = AdamW(model.parameters(), lr=1e-5)

best_acc = 0
model.to(device)
for epoch in range(30):
    epoch_loss=0
    # Re-enter training mode each epoch, after the evaluation call at the end of
    # the previous one.
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        # Each batch is indexed as (input ids, attention mask, labels).
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,labels=b_labels)
        # With `labels` supplied, the first output element is used as the loss.
        loss = outputs[0]
        loss.backward()
        batch_loss=loss.item()
        optimizer.step()
        epoch_loss+=batch_loss
    # Score the model on the dev dataloader (presumably accuracy — see
    # util_sentence_classifier.test) and average the loss over the batches.
    acc = util_sent_cf.test(model,dev_dataloader,device)
    normalized_epoch_loss = epoch_loss/(len(train_dataloader))
    # Checkpoint only when the dev score improves; the file name encodes the
    # epoch number and the score.
    if(best_acc<acc):
        best_acc = acc
        torch.save({
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'loss': normalized_epoch_loss,
            'dev_accuracy': acc
            }, "model_sent_"+str(epoch+1)+"_"+str(acc))
    
    print("Epoch {}".format(epoch+1))
    print("Epoch loss: {} ".format(normalized_epoch_loss))

In [None]:
'''
N-gram data generation
'''
from numpy import linalg as LA

def _ngram_keys(sentence):
    """Return the bigram and unigram feature keys of one sentence string.

    A single trailing '.' is removed and the text is split on single spaces.
    Bigram keys are adjacent token pairs joined by one space; unigram keys are
    the tokens themselves, excluding the last token (same token selection as
    before). Bigram keys are listed first, then unigram keys.
    """
    if sentence.endswith('.'):
        sentence = sentence[:-1]
    tokens = sentence.split(" ")
    bigram_keys = [" ".join(pair) for pair in zip(tokens[:-1], tokens[1:])]
    # A unigram key is the token as-is. Joining a bare string with " " would put
    # a space between its characters ("ab" -> "a b") and could collide with the
    # bigram key of the tokens "a" and "b".
    unigram_keys = tokens[:-1]
    return bigram_keys + unigram_keys


def get_dimensions(train_data,dev_data):
    """Build the n-gram vocabulary over the train and dev frames.

    Parameters
    ----------
    train_data, dev_data : pandas.DataFrame
        Frames with a ``sentence`` column. Rows whose sentence is not a string
        are reported (index and value are printed) and skipped.

    Returns
    -------
    (dict, list)
        ``bigram_to_index`` maps each n-gram key to its column index and
        ``index_to_bigram`` lists the keys in column order. Keys are numbered in
        order of first appearance, train rows first, then dev rows.
    """
    bigram_to_index = {}
    index_to_bigram = []

    for frame in (train_data, dev_data):
        for position, (index, row) in enumerate(frame.iterrows(), start=1):
            sentence = row["sentence"]
            if not isinstance(sentence, str):
                # Same report-and-skip handling for both frames.
                print(index)
                print(sentence)
                continue
            for key in _ngram_keys(sentence):
                if key not in bigram_to_index:
                    bigram_to_index[key] = len(index_to_bigram)
                    index_to_bigram.append(key)
            # Progress is counted by row position so it works for any index type.
            if position % 1000 == 0:
                print("{} examples finished".format(position))

    return bigram_to_index, index_to_bigram


def get_data(data, bigram_to_index):
    """Turn a frame into a binary n-gram indicator matrix plus its labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``sentence`` and ``label`` columns.
    bigram_to_index : dict
        Vocabulary produced by ``get_dimensions``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        A float64 matrix of shape ``(len(data), len(bigram_to_index))`` holding 1
        where the row's sentence contains the n-gram, and a float64 label vector
        of length ``len(data)``. N-grams missing from the vocabulary are ignored,
        and a row whose sentence is not a string stays all-zero, so rows and
        labels always remain aligned.
    """
    n_rows = len(data)
    # Allocate once instead of growing the arrays with np.append on every row,
    # which copies the whole matrix each time.
    trainable_data = np.zeros((n_rows, len(bigram_to_index)))
    labels = np.zeros(n_rows)

    for position, (_, row) in enumerate(data.iterrows()):
        labels[position] = int(row["label"])
        sentence = row["sentence"]
        if isinstance(sentence, str):
            for key in _ngram_keys(sentence):
                column = bigram_to_index.get(key)
                if column is not None:
                    trainable_data[position, column] = 1

        if (position + 1) % 1000 == 0:
            print("{} examples finished".format(position + 1))

    return trainable_data, labels

# Build the n-gram vocabulary from the train and dev frames, then featurise both.
bigram_to_index, index_to_bigram = get_dimensions(train_,dev_)
# Each *_data_final is the (feature matrix, labels) tuple returned by get_data:
# element [0] is the matrix and element [1] the label vector.
train_data_final = get_data(train_,bigram_to_index)
dev_data_final = get_data(dev_,bigram_to_index)

In [None]:
'''
SVM classifier for n-gram
'''
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import metrics

# Linear-kernel SVM on the standardised n-gram indicator features.
# `gamma` is a coefficient for the rbf / poly / sigmoid kernels, so it has no
# effect with kernel="linear"; it is left in place to keep the estimator
# configuration unchanged.
clf = make_pipeline(StandardScaler(), SVC(kernel="linear",gamma='auto',C=24))
clf.fit(train_data_final[0], train_data_final[1])

y_pred = clf.predict(dev_data_final[0])
# All three metrics are reported as percentages so they are on the same scale as
# the accuracy line and as the embedding-based SVM / Random Forest / XGBoost cells.
print("Accuracy:",metrics.accuracy_score(dev_data_final[1], y_pred)*100)
print("Precision:",metrics.precision_score(dev_data_final[1], y_pred)*100)
print("Recall:",metrics.recall_score(dev_data_final[1], y_pred)*100)

# Support vectors kept by the fitted SVC (second step of the pipeline).
print("No. of support vectors per data point : ",len(clf[1].support_vectors_)/len(train_))
print("No. of support vectors : ",len(clf[1].support_vectors_))