## Packages

In [3]:
import csv
import random
import numpy as np
import igraph
from sklearn import svm
from sklearn import model_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
#evaluate performances
from sklearn.metrics import make_scorer, accuracy_score 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel

In [4]:
#nltk.download('punkt') # for tokenization
#nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

with open("testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

In [5]:
with open("training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

with open("node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

IDs = [element[0] for element in node_info]

# compute TFIDF vector of each paper
corpus = [element[5] for element in node_info]
vectorizer = TfidfVectorizer(stop_words="english")
# each row is a node in the order of node_info
features_TFIDF = vectorizer.fit_transform(corpus)

In [8]:
edges

[('9510123', '9502114'),
 ('9707075', '9604178'),
 ('9710020', '9709228'),
 ('9901042', '9510135'),
 ('9705079', '9702201'),
 ('9705061', '9503216'),
 ('207246', '9807137'),
 ('9501144', '9311081'),
 ('110087', '9711200'),
 ('9501105', '9202069'),
 ('9708164', '9510017'),
 ('11109', '3298'),
 ('9601055', '9404188'),
 ('9239', '9203051'),
 ('2094', '9811089'),
 ('9806087', '9308075'),
 ('205131', '204051'),
 ('9308124', '9209023'),
 ('105155', '9806044'),
 ('207104', '102135'),
 ('207160', '5204'),
 ('302049', '9706144'),
 ('9503087', '9409185'),
 ('112217', '9802015'),
 ('203124', '111165'),
 ('9705192', '9601029'),
 ('9902073', '9604156'),
 ('9038', '2237'),
 ('9412228', '9405109'),
 ('9808168', '9403050'),
 ('6216', '9510083'),
 ('211170', '9312104'),
 ('9802121', '9701088'),
 ('9909164', '9909030'),
 ('205034', '9810239'),
 ('1016', '9806087'),
 ('8151', '4056'),
 ('105075', '9205115'),
 ('9502159', '9408096'),
 ('9410030', '9306069'),
 ('10244', '9037'),
 ('6073', '9812230'),
 ('30

## Creating the graph

In [7]:
## the following shows how to construct a graph with igraph
## even though in this baseline we don't use it
## look at http://igraph.org/python/doc/igraph.Graph-class.html for feature ideas

edges = [(element[0],element[1]) for element in training_set if element[2]=="1"]

## some nodes may not be connected to any other node
## hence the need to create the nodes of the graph from node_info.csv,
## not just from the edge list

nodes = IDs

## create empty directed graph
g = igraph.Graph(directed=True)
 
## add vertices
g.add_vertices(nodes)
 
## add edges
g.add_edges(edges)

# for each training example we need to compute features
# in this baseline we will train the model on only 5% of the training set

## Defining our training set

In [5]:
# randomly select 5% of training set

to_keep = random.sample(range(len(training_set)), k=int(round(len(training_set)*0.05)))
training_set_reduced = [training_set[i] for i in to_keep]


# number of overlapping words in title
overlap_title = []

# temporal distance between the papers
temp_diff = []

# number of common authors
comm_auth = []

#Cosine similarity
cosimi= []

#Jaccard Similarity
jacc = []

#Preferential attachment
pref_att=[]

#Adding new features
counter = 0
for i in range(len(training_set_reduced)):
    source = training_set_reduced[i][0]
    target = training_set_reduced[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    
    #SEMANTIC Features
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
    
        #TF-iDF Cosine similarity
    cosimi.append(linear_kernel(features_TFIDF[index_source:index_source+1],
                                features_TFIDF[index_target:index_target+1])[0][0])
    
	# convert to lowercase and tokenize
    source_title = source_info[2].lower().split(" ")
	# remove stopwords
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    overlap_title.append(len(set(source_title).intersection(set(target_title))))
    temp_diff.append(int(source_info[1]) - int(target_info[1]))
    comm_auth.append(len(set(source_auth).intersection(set(target_auth))))
    
    
   #Topological Features
        #Preferential attachment
    deg_source = len(g.neighbors(source,mode = 'OUT'))
    deg_target = len(g.neighbors(target, mode ='IN'))
    pref_att.append(deg_source*deg_target)
        #Jaccard coefficient
    jacc.append(g.similarity_jaccard( vertices = (source, target))[0][1])
    
    counter += 1
    if counter % 1000 == True:
        print(counter, "training examples processsed")

# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
training_features = np.array([jacc,cosimi, pref_att, overlap_title, temp_diff, comm_auth]).T

# scale
training_features = preprocessing.scale(training_features)

# convert labels into integers then into column array
labels = [int(element[2]) for element in training_set_reduced]
labels = list(labels)
labels_array = np.array(labels)

1 training examples processsed
1001 training examples processsed
2001 training examples processsed
3001 training examples processsed
4001 training examples processsed
5001 training examples processsed
6001 training examples processsed
7001 training examples processsed
8001 training examples processsed
9001 training examples processsed
10001 training examples processsed
11001 training examples processsed
12001 training examples processsed
13001 training examples processsed
14001 training examples processsed
15001 training examples processsed
16001 training examples processsed
17001 training examples processsed
18001 training examples processsed
19001 training examples processsed
20001 training examples processsed
21001 training examples processsed
22001 training examples processsed
23001 training examples processsed
24001 training examples processsed
25001 training examples processsed
26001 training examples processsed
27001 training examples processsed
28001 training examples processse

## Defining our test set

In [6]:
#Test processing

overlap_title_test = []
temp_diff_test = []
comm_auth_test = []
pref_att_test = []
cosimi_test= []
jacc_test= []

counter = 0
for i in range(len(testing_set)):
    source = testing_set[i][0]
    target = testing_set[i][1]
    
    index_source = IDs.index(source)
    index_target = IDs.index(target)
    
    
    #SEMANTIC Features
    source_info = [element for element in node_info if element[0]==source][0]
    target_info = [element for element in node_info if element[0]==target][0]
    
    
        #Cosine similarity
    cosimi_test.append(linear_kernel(features_TFIDF[index_source:index_source+1],
                                features_TFIDF[index_target:index_target+1])[0][0])
    
    source_title = source_info[2].lower().split(" ")
    source_title = [token for token in source_title if token not in stpwds]
    source_title = [stemmer.stem(token) for token in source_title]
    
    target_title = target_info[2].lower().split(" ")
    target_title = [token for token in target_title if token not in stpwds]
    target_title = [stemmer.stem(token) for token in target_title]
    
    source_auth = source_info[3].split(",")
    target_auth = target_info[3].split(",")
    
    overlap_title_test.append(len(set(source_title).intersection(set(target_title))))
    temp_diff_test.append(int(source_info[1]) - int(target_info[1]))
    comm_auth_test.append(len(set(source_auth).intersection(set(target_auth))))
   
    #TOPOLOGICAL Features
        #Preferential attachment
    deg_source = len(g.neighbors(source,mode = 'OUT'))
    deg_target = len(g.neighbors(target, mode ='IN'))
    pref_att_test.append(deg_source*deg_target)
        #Jaccard coeficient
    jacc_test.append(g.similarity_jaccard( vertices = (source, target))[0][1])
    
    counter += 1
    if counter % 1000 == True:
        print(counter, "testing examples processsed")
        
# convert list of lists into array
# documents as rows, unique words as columns (i.e., example as rows, features as columns)
testing_features = np.array([jacc_test, cosimi_test, pref_att_test, overlap_title_test, temp_diff_test, comm_auth_test]).T

# scale
testing_features = preprocessing.scale(testing_features)

1 testing examples processsed
1001 testing examples processsed
2001 testing examples processsed
3001 testing examples processsed
4001 testing examples processsed
5001 testing examples processsed
6001 testing examples processsed
7001 testing examples processsed
8001 testing examples processsed
9001 testing examples processsed
10001 testing examples processsed
11001 testing examples processsed
12001 testing examples processsed
13001 testing examples processsed
14001 testing examples processsed
15001 testing examples processsed
16001 testing examples processsed
17001 testing examples processsed
18001 testing examples processsed
19001 testing examples processsed
20001 testing examples processsed
21001 testing examples processsed
22001 testing examples processsed
23001 testing examples processsed
24001 testing examples processsed
25001 testing examples processsed
26001 testing examples processsed
27001 testing examples processsed
28001 testing examples processsed
29001 testing examples proc

In [7]:
#Simple separation between training and testing(validation)
X_train, X_test, y_train, y_test = train_test_split(training_features, labels_array, 
                                                            test_size=0.05, random_state=21, stratify = labels_array)

In [8]:
#Creating cross validation data splits
cv_sets = model_selection.StratifiedShuffleSplit(n_splits = 3, test_size = 0.05, random_state = 5)
cv_sets.get_n_splits(X_train, y_train)

3

## SVM Classifier

In [9]:
linsvc_clf = svm.LinearSVC()

parameters_linsvc = {"fit_intercept": [True, False], "max_iter": [1000, 1500, 2000]}

grid_linsvc = GridSearchCV(linsvc_clf, parameters_linsvc, scoring=make_scorer(accuracy_score), cv = cv_sets)
grid_linsvc.fit(X_train, y_train)

linsvc_clf = grid_linsvc.best_estimator_

linsvc_clf.fit(X_train, y_train)
pred_linsvc = linsvc_clf.predict(X_test)
acc_linsvc = accuracy_score(y_test, pred_linsvc)

print("The Score for LinearSVC is: " + str(acc_linsvc))
grid_linsvc.best_params_



The Score for LinearSVC is: 0.9837556855100714


{'fit_intercept': True, 'max_iter': 1000}

In [33]:
# issue predictions
predictions_SVM = list(linsvc_clf.predict(testing_features))

# write predictions to .csv file suitable for Kaggle (just make sure to add the column names)
predictions_SVM = zip(range(len(testing_set)), predictions_SVM)

with open("improved_predictionsSvm.csv","w") as pred1:
    csv_out = csv.writer(pred1)
    for row in predictions_SVM:
        csv_out.writerow(row)

## XGBoost with grid search

In [27]:
# XGBOOST
from sklearn.model_selection import RandomizedSearchCV,StratifiedKFold,GridSearchCV
from xgboost import XGBClassifier
# A parameter grid for XGBoost which gives us the best parameters below.
params = {
        'min_child_weight': [1],
        'gamma': [0.4,0.6,0.8],
        'subsample': [1.0,1.5],
        'colsample_bytree': [0.6,1],
        'max_depth': [3]
        }
#Params with 5% of training that gave us best results    
#params = {'min_child_weight': [1], 'gamma':[1.5] , 'subsample' : [1.0], 'colsample_bytree' :[0.6] , 'max_depth' : [3]}

folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',silent=True, nthread=1)
grid = GridSearchCV(estimator=xgb, param_grid=params, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3 )
grid.fit(X_train, y_train)


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    4.7s finished


GridSearchCV(cv=<generator object _BaseKFold.split at 0x000001AACA8D1D58>,
       error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=600,
       n_jobs=1, nthread=1, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'min_child_weight': [1], 'gamma': [0.4], 'subsample': [1.0], 'colsample_bytree': [0.6], 'max_depth': [3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=3)

In [24]:
grid.best_params_

{'colsample_bytree': 0.6,
 'gamma': 1.5,
 'max_depth': 3,
 'min_child_weight': 1,
 'subsample': 1.0}

In [28]:
y_pred = grid.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.22%


In [26]:
predictions_XGBOOST2 = list(grid.predict(testing_features))

# write predictions to .csv file suitable for Kaggle (just make sure to add the column names)
predictions_XGBOOST2 = zip(range(len(testing_set)), predictions_XGBOOST2)

with open("improved_predictionsXGboost.csv","w") as pred1:
    csv_out = csv.writer(pred1)
    for row in predictions_XGBOOST2:
        csv_out.writerow(row)

## XGboost with default parameters

In [29]:
# fit model no training data
model=XGBClassifier()

model.fit(X_train, y_train)
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.22%


In [30]:
predictions_XGBOOST = list(model.predict(testing_features))

# write predictions to .csv file suitable for Kaggle (just make sure to add the column names)
predictions_XGBOOST = zip(range(len(testing_set)), predictions_XGBOOST)

with open("improved_predictions.csv","w") as pred1:
    csv_out = csv.writer(pred1)
    for row in predictions_XGBOOST:
        csv_out.writerow(row)

## CatBoost Classifier

In [31]:
from catboost import CatBoostClassifier

clf = CatBoostClassifier(depth=6, iterations=1000, learning_rate=0.1, thread_count=16, logging_level='Silent')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 98.90%


In [32]:
predictions_CatBoost = list(clf.predict(testing_features))

# write predictions to .csv file suitable for Kaggle (just make sure to add the column names)
predictions_CatBoost = zip(range(len(testing_set)), predictions_CatBoost)

with open("improved_predictionsCatboost.csv","w") as pred1:
    csv_out = csv.writer(pred1)
    for row in predictions_CatBoost:
        row_1=str(row[1]).split(".")[0]
        csv_out.writerow((row[0],row_1))