In [20]:
import random
import pandas as pd
import nltk
import re
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import torch 
import torch.nn as nn
import torch.nn.functional as F 
from collections import OrderedDict
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch import optim
import numpy as np 
# nltk.download('punkt')
description_df = pd.read_csv('dataset/binary_classifiers/description.csv')
installation_df = pd.read_csv('dataset/binary_classifiers/installation.csv')
invocation_df = pd.read_csv('dataset/binary_classifiers/invocation.csv')
citation_df = pd.read_csv('dataset/binary_classifiers/citation.csv')

In [21]:
def lower_stopwords(x):
    x = re.sub(r'[^a-zA-Z\s]', '', x, re.I|re.A)
    x = x.lower()
    x = x.strip()
    text_tokens = [word for word in word_tokenize(x) if word not in stopwords.words()]
    return " ".join(text_tokens)

In [22]:
print(description_df["excerpt"][6])

The original implementation is based on our internal Mxnet version. There are slight differences in the final accuracy and running time due to the plenty details in platform switch.


In [23]:
neg_quant = int(len(description_df) * .375)
treebank_background = pd.DataFrame(list(map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), neg_quant))), columns=["excerpt"]).assign(description=False)
description_corpus = pd.concat([description_df.assign(description=True), installation_df.sample(neg_quant).assign(description=False), invocation_df.sample(neg_quant).assign(description=False), citation_df.sample(neg_quant).assign(description=False),treebank_background], sort=False)
description_corpus.drop('URL', 1, inplace=True)
description_corpus.dropna(0, inplace=True)
description_corpus.reset_index(drop=True, inplace=True)
description_corpus["excerpt"] = description_corpus["excerpt"].apply(lower_stopwords)

In [24]:
print(description_corpus.groupby(by = "description").count())

             contributor  excerpt
description                      
False                552      552
True                 503      503


In [25]:
X, y = description_corpus.excerpt, description_corpus.description
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

In [27]:
def curtail(array,threshold = 0.0):
    return array * (array>=threshold)

In [28]:
from sklearn.externals import joblib
vectorizer = CountVectorizer()
X_vect_train = vectorizer.fit_transform(X_train).toarray()
joblib.dump(vectorizer, "model/vectorizer.m")
X_vect_test =  vectorizer.transform(X_test).toarray()

y_hot_train = pd.get_dummies(y_train.values)
y_hot_test = pd.get_dummies(y_test.values)

class FFN(nn.Module):
    
    def __init__(self,layer_arch,input_size,output_size,bias = True):
        super(FFN,self).__init__()
        self.layer_arch = layer_arch
        self.input_size = input_size
        self.output_size = output_size
        self.bias = bias
        self.build_model()
        
    def build_model(self):
        model_arch = []
        unit = self.input_size
        for i,num in enumerate(self.layer_arch):
            model_arch.append(("dense_"+str(i), nn.Linear(unit,num,bias = self.bias)))
            model_arch.append(("nonlinear_"+str(i), nn.ReLU()))
            if(i==1):
                model_arch.append(("dropout_"+str(i), nn.Dropout()))
            unit = num
        model_arch.append(("dense_final",nn.Linear(unit,self.output_size,bias=self.bias)))
        model_arch.append(("act_final",nn.Sigmoid()))
        self.model = nn.Sequential(OrderedDict(model_arch))
    def forward(self,inputs):
        return self.model(inputs)
    

        

In [11]:
model = FFN([1024,2048,1024,512,256],X_vect_train.shape[1],2)
epoch_num = 20
bs = 300
train_ds = TensorDataset(torch.tensor(X_vect_train).float(), torch.tensor(y_hot_train.values).float())
train_dl = DataLoader(train_ds, batch_size=bs)
# opt = optim.SGD(model.parameters(),lr = 0.001)
opt = optim.Adam(model.parameters())
Fs = [0]
for i in range(epoch_num):
    for xb,yb in train_dl:
        target = torch.argmax(yb,dim = 1,keepdim=False)
        pred = model(xb)
        loss_fun = nn.CrossEntropyLoss()
        loss = loss_fun(pred,target)
        loss.backward()
        opt.step()
        opt.zero_grad()        
    #### test metrics #####
    test_pred = model(torch.tensor(X_vect_test).float())
    test_pred = torch.argmax(test_pred,dim=1,keepdim=False)
    ground_truth = torch.argmax(torch.tensor(y_hot_test.values).float(),dim=1,keepdim=False)
    tn, fp, fn, tp = confusion_matrix(test_pred,ground_truth).ravel()

    ### precision ###
    precision = (tp/(tp+fp))*100
    ### recall ###
    recall = (tp/(tp+fn))*100
    ### F-measure ###
    F_measure = (2*precision*recall)/(precision+recall)
    ### accuracy ###
    accuracy = (torch.true_divide((torch.sum((test_pred-ground_truth)==0)),test_pred.shape[0]))*100
    if F_measure > max(Fs):
        torch.save(model.state_dict(), 'model/description.pt')
    Fs.append(F_measure)
    print("test accuracy is {}".format(accuracy))
    print("test precision is {}".format(precision))
    print("test recall is {}".format(recall))
    print("test F-measure is {}".format(F_measure))
    print("************************")


test accuracy is 0
test precision is 100.0
test recall is 49.81132075471698
test F-measure is 66.49874055415617
************************
test accuracy is 0
test precision is 89.39393939393939
test recall is 72.39263803680981
test F-measure is 80.0
************************
test accuracy is 0
test precision is 90.15151515151516
test recall is 71.6867469879518
test F-measure is 79.86577181208054
************************
test accuracy is 0
test precision is 87.12121212121212
test recall is 78.2312925170068
test F-measure is 82.43727598566309
************************
test accuracy is 0
test precision is 90.15151515151516
test recall is 73.91304347826086
test F-measure is 81.22866894197952
************************
test accuracy is 0
test precision is 86.36363636363636
test recall is 76.51006711409396
test F-measure is 81.13879003558718
************************
test accuracy is 0
test precision is 89.39393939393939
test recall is 73.75
test F-measure is 80.82191780821918
*********************

In [12]:
print(max(Fs))

84.04669260700389


In [13]:
X_vect_train.shape[1]

2854

# installation model

In [42]:
neg_quant = int(len(installation_df) * .375)
treebank_background = pd.DataFrame(list(map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), neg_quant))), columns=["excerpt"]).assign(installation=False)
# installation_corpus = pd.concat([installation_df.assign(installation=True), description_df.sample(neg_quant,replace = True).assign(installation=False), invocation_df.sample(neg_quant,replace = True).assign(installation=False), citation_df.sample(neg_quant,replace = True).assign(installation=False),treebank_background], sort=False)
installation_corpus = pd.concat([installation_df.assign(installation=True), description_df.sample(neg_quant,replace = True).assign(installation=False), invocation_df.sample(neg_quant,replace = True).assign(installation=False), citation_df.sample(neg_quant,replace = True).assign(installation=False),treebank_background], sort=False)
installation_corpus.drop('URL', 1, inplace=True)
installation_corpus.dropna(0, inplace=True)
installation_corpus.reset_index(drop=True, inplace=True)
installation_corpus["excerpt"] = installation_corpus["excerpt"].apply(lower_stopwords)

In [43]:
print(installation_corpus.groupby(by = "installation").count())

              contributor  excerpt
installation                      
False                1019     1019
True                  929      929


In [44]:
X, y = installation_corpus.excerpt, installation_corpus.installation
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix

In [46]:
def curtail(array,threshold = 0.0):
    return array * (array>=threshold)

In [47]:
from sklearn.externals import joblib
vectorizer = CountVectorizer()
X_vect_train = vectorizer.fit_transform(X_train).toarray()
joblib.dump(vectorizer, "model/vectorizer.m")
X_vect_test =  vectorizer.transform(X_test).toarray()

y_hot_train = pd.get_dummies(y_train.values)
y_hot_test = pd.get_dummies(y_test.values)

In [48]:
model_installation = FFN([1024,2048,1024,512,256],X_vect_train.shape[1],2)
epoch_num = 20
bs = 300
train_ds = TensorDataset(torch.tensor(X_vect_train).float(), torch.tensor(y_hot_train.values).float())
train_dl = DataLoader(train_ds, batch_size=bs)
# opt = optim.SGD(model.parameters(),lr = 0.001)
opt = optim.Adam(model_installation.parameters())
Fs = [0]
for i in range(epoch_num):
    for xb,yb in train_dl:
        target = torch.argmax(yb,dim = 1,keepdim=False)
        pred = model_installation(xb)
        loss_fun = nn.CrossEntropyLoss()
        loss = loss_fun(pred,target)
        loss.backward()
        opt.step()
        opt.zero_grad()        
    #### test metrics #####
    test_pred = model_installation(torch.tensor(X_vect_test).float())
    test_pred = torch.argmax(test_pred,dim=1,keepdim=False)
    ground_truth = torch.argmax(torch.tensor(y_hot_test.values).float(),dim=1,keepdim=False)
    tn, fp, fn, tp = confusion_matrix(test_pred,ground_truth).ravel()

    ### precision ###
    precision = (tp/(tp+fp))*100
    ### recall ###
    recall = (tp/(tp+fn))*100
    ### F-measure ###
    F_measure = (2*precision*recall)/(precision+recall)
    ### accuracy ###
    accuracy = (torch.true_divide((torch.sum((test_pred-ground_truth)==0)),test_pred.shape[0]))*100
    if F_measure > max(Fs):
        torch.save(model.state_dict(), 'model/description.pt')
    Fs.append(F_measure)
    print("test accuracy is {}".format(accuracy))
    print("test precision is {}".format(precision))
    print("test recall is {}".format(recall))
    print("test F-measure is {}".format(F_measure))
    print("************************")




test accuracy is 0
test precision is 0.0
test recall is nan
test F-measure is nan
************************
test accuracy is 0
test precision is 77.82258064516128
test recall is 94.14634146341463
test F-measure is 85.20971302428255
************************
test accuracy is 0
test precision is 87.09677419354838
test recall is 92.3076923076923
test F-measure is 89.62655601659752
************************
test accuracy is 0
test precision is 87.5
test recall is 91.56118143459916
test F-measure is 89.48453608247422
************************
test accuracy is 0
test precision is 82.25806451612904
test recall is 94.88372093023256
test F-measure is 88.12095032397409
************************
test accuracy is 0
test precision is 87.09677419354838
test recall is 92.3076923076923
test F-measure is 89.62655601659752
************************
test accuracy is 0
test precision is 88.70967741935483
test recall is 91.28630705394191
test F-measure is 89.97955010224949
************************
test accuracy 

In [49]:
print(max(Fs))

90.90909090909092


# invocation model

In [62]:
neg_quant = int(len(invocation_df) * .375)
treebank_background = pd.DataFrame(list(map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), neg_quant))), columns=["excerpt"]).assign(invocation=False)
# installation_corpus = pd.concat([installation_df.assign(installation=True), description_df.sample(neg_quant,replace = True).assign(installation=False), invocation_df.sample(neg_quant,replace = True).assign(installation=False), citation_df.sample(neg_quant,replace = True).assign(installation=False),treebank_background], sort=False)
invocation_corpus = pd.concat([invocation_df.assign(invocation=True), description_df.sample(neg_quant,replace = True).assign(invocation=False), installation_df.sample(neg_quant,replace = True).assign(invocation=False), citation_df.sample(neg_quant,replace = True).assign(invocation=False),treebank_background], sort=False)
invocation_corpus.drop('URL', 1, inplace=True)
invocation_corpus.dropna(0, inplace=True)
invocation_corpus.reset_index(drop=True, inplace=True)
invocation_corpus["excerpt"] = invocation_corpus["excerpt"].apply(lower_stopwords)

In [63]:
print(invocation_corpus.groupby(by = "invocation").count())

            contributor  excerpt
invocation                      
False              1251     1251
True               1134     1134


In [64]:
X, y = invocation_corpus.excerpt, invocation_corpus.invocation
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [65]:
from sklearn.externals import joblib
vectorizer = CountVectorizer()
X_vect_train = vectorizer.fit_transform(X_train).toarray()
joblib.dump(vectorizer, "model/vectorizer.m")
X_vect_test =  vectorizer.transform(X_test).toarray()

y_hot_train = pd.get_dummies(y_train.values)
y_hot_test = pd.get_dummies(y_test.values)

In [66]:
model_invocation = FFN([1024,2048,1024,512,256],X_vect_train.shape[1],2)
epoch_num = 20
bs = 300
train_ds = TensorDataset(torch.tensor(X_vect_train).float(), torch.tensor(y_hot_train.values).float())
train_dl = DataLoader(train_ds, batch_size=bs)
# opt = optim.SGD(model.parameters(),lr = 0.001)
opt = optim.Adam(model_invocation.parameters())
Fs = [0]
for i in range(epoch_num):
    for xb,yb in train_dl:
        target = torch.argmax(yb,dim = 1,keepdim=False)
        pred = model_invocation(xb)
        loss_fun = nn.CrossEntropyLoss()
        loss = loss_fun(pred,target)
        loss.backward()
        opt.step()
        opt.zero_grad()        
    #### test metrics #####
    test_pred = model_invocation(torch.tensor(X_vect_test).float())
    test_pred = torch.argmax(test_pred,dim=1,keepdim=False)
    ground_truth = torch.argmax(torch.tensor(y_hot_test.values).float(),dim=1,keepdim=False)
    tn, fp, fn, tp = confusion_matrix(test_pred,ground_truth).ravel()

    ### precision ###
    precision = (tp/(tp+fp))*100
    ### recall ###
    recall = (tp/(tp+fn))*100
    ### F-measure ###
    F_measure = (2*precision*recall)/(precision+recall)
    ### accuracy ###
    accuracy = (torch.true_divide((torch.sum((test_pred-ground_truth)==0)),test_pred.shape[0]))*100
    if F_measure > max(Fs):
        torch.save(model.state_dict(), 'model/description.pt')
    Fs.append(F_measure)
    print("test accuracy is {}".format(accuracy))
    print("test precision is {}".format(precision))
    print("test recall is {}".format(recall))
    print("test F-measure is {}".format(F_measure))
    print("************************")



test accuracy is 0
test precision is 0.0
test recall is nan
test F-measure is nan
************************
test accuracy is 0
test precision is 0.0
test recall is nan
test F-measure is nan
************************
test accuracy is 0
test precision is 86.12099644128114
test recall is 87.05035971223022
test F-measure is 86.58318425760287
************************
test accuracy is 0
test precision is 91.45907473309609
test recall is 85.66666666666667
test F-measure is 88.46815834767642
************************
test accuracy is 0
test precision is 88.61209964412812
test recall is 86.159169550173
test F-measure is 87.36842105263159
************************
test accuracy is 0
test precision is 91.45907473309609
test recall is 83.98692810457517
test F-measure is 87.56388415672915
************************
test accuracy is 0
test precision is 92.17081850533808
test recall is 80.43478260869566
test F-measure is 85.90381426202322
************************
test accuracy is 0
test precision is 77.935

# citation model

In [67]:
neg_quant = int(len(citation_df) * .375)
treebank_background = pd.DataFrame(list(map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), neg_quant))), columns=["excerpt"]).assign(citation=False)
citation_corpus = pd.concat([citation_df.assign(citation=True), description_df.sample(neg_quant,replace = True).assign(citation=False), installation_df.sample(neg_quant,replace = True).assign(citation=False), invocation_df.sample(neg_quant,replace = True).assign(citation=False),treebank_background], sort=False)
citation_corpus.drop('URL', 1, inplace=True)
citation_corpus.dropna(0, inplace=True)
citation_corpus.reset_index(drop=True, inplace=True)
citation_corpus["excerpt"] = citation_corpus["excerpt"].apply(lower_stopwords)

In [68]:
print(citation_corpus.groupby(by = "citation").count())

          contributor  excerpt
citation                      
False             354      354
True              296      296


In [69]:
X, y = citation_corpus.excerpt, citation_corpus.citation
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [70]:
from sklearn.externals import joblib
vectorizer = CountVectorizer()
X_vect_train = vectorizer.fit_transform(X_train).toarray()
joblib.dump(vectorizer, "model/vectorizer.m")
X_vect_test =  vectorizer.transform(X_test).toarray()

y_hot_train = pd.get_dummies(y_train.values)
y_hot_test = pd.get_dummies(y_test.values)

In [71]:
model_citation = FFN([1024,2048,1024,512,256],X_vect_train.shape[1],2)
epoch_num = 20
bs = 300
train_ds = TensorDataset(torch.tensor(X_vect_train).float(), torch.tensor(y_hot_train.values).float())
train_dl = DataLoader(train_ds, batch_size=bs)
# opt = optim.SGD(model.parameters(),lr = 0.001)
opt = optim.Adam(model_citation.parameters())
Fs = [0]
for i in range(epoch_num):
    for xb,yb in train_dl:
        target = torch.argmax(yb,dim = 1,keepdim=False)
        pred = model_citation(xb)
        loss_fun = nn.CrossEntropyLoss()
        loss = loss_fun(pred,target)
        loss.backward()
        opt.step()
        opt.zero_grad()        
    #### test metrics #####
    test_pred = model_citation(torch.tensor(X_vect_test).float())
    test_pred = torch.argmax(test_pred,dim=1,keepdim=False)
    ground_truth = torch.argmax(torch.tensor(y_hot_test.values).float(),dim=1,keepdim=False)
    tn, fp, fn, tp = confusion_matrix(test_pred,ground_truth).ravel()

    ### precision ###
    precision = (tp/(tp+fp))*100
    ### recall ###
    recall = (tp/(tp+fn))*100
    ### F-measure ###
    F_measure = (2*precision*recall)/(precision+recall)
    ### accuracy ###
    accuracy = (torch.true_divide((torch.sum((test_pred-ground_truth)==0)),test_pred.shape[0]))*100
    if F_measure > max(Fs):
        torch.save(model.state_dict(), 'model/description.pt')
    Fs.append(F_measure)
    print("test accuracy is {}".format(accuracy))
    print("test precision is {}".format(precision))
    print("test recall is {}".format(recall))
    print("test F-measure is {}".format(F_measure))
    print("************************")



test accuracy is 0
test precision is 0.0
test recall is nan
test F-measure is nan
************************
test accuracy is 0
test precision is 0.0
test recall is nan
test F-measure is nan
************************
test accuracy is 0
test precision is 0.0
test recall is nan
test F-measure is nan
************************
test accuracy is 0
test precision is 0.0
test recall is nan
test F-measure is nan
************************
test accuracy is 0
test precision is 0.0
test recall is nan
test F-measure is nan
************************
test accuracy is 0
test precision is 0.0
test recall is nan
test F-measure is nan
************************
test accuracy is 0
test precision is 35.064935064935064
test recall is 100.0
test F-measure is 51.92307692307691
************************
test accuracy is 0
test precision is 54.54545454545454
test recall is 100.0
test F-measure is 70.58823529411765
************************
test accuracy is 0
test precision is 57.14285714285714
test recall is 100.0
test F-

In [72]:
print(max(Fs))

93.82716049382715


In [73]:
X_vect_train.shape[1]

1733