In [None]:
# This is a continuation of citation_model.ipynb

# The commented-out cells below are the one-time preprocessing steps
# (train/test split, scaling, PCA). Their results were saved as .pkl
# objects and are loaded back in further down, so that the preprocessing
# does not have to be repeated -- it takes up a lot of memory. The active
# (uncommented) code therefore starts where the preprocessed .pkl objects
# are loaded.

In [1]:
# import pickle

# X_pathname = "/Volumes/MasterDrive/ML project data/X.pkl"
# with open(X_pathname, 'rb') as infile_X:
#     X = pickle.load(infile_X)
    
# y_pathname = "/Volumes/MasterDrive/ML project data/y.pkl"
# with open(y_pathname, 'rb') as infile_y:
#     y = pickle.load(infile_y)

In [2]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA

# Xtr, Xts, ytr, yts = train_test_split(X, y, test_size = 0.2)

In [3]:
# tr_shape = ((Xtr.shape[0],Xtr.shape[1]*Xtr.shape[2]))
# ts_shape = ((Xts.shape[0],Xts.shape[1]*Xts.shape[2]))

# scaler = StandardScaler()
# Xtr_scaled = scaler.fit_transform(Xtr.reshape(tr_shape))
# Xts_scaled = scaler.transform(Xts.reshape(ts_shape))

In [4]:
# Xtr_pathname = "/Volumes/MasterDrive/ML project data/Xtr_scaled.pkl"
# with open(Xtr_pathname, 'wb') as output_Xtr:
#     pickle.dump(Xtr_scaled, output_Xtr, pickle.HIGHEST_PROTOCOL)
    
# Xts_pathname = "/Volumes/MasterDrive/ML project data/Xts_scaled.pkl"
# with open(Xts_pathname, 'wb') as output_Xts:
#     pickle.dump(Xts_scaled, output_Xts, pickle.HIGHEST_PROTOCOL)

In [5]:
# pca = PCA(n_components=0.95)
# Xtr_reduced = pca.fit_transform(Xtr_scaled.reshape(tr_shape))
# Xts_reduced = pca.transform(Xts_scaled.reshape(ts_shape))

In [6]:
# Xtr_pathname = "/Volumes/MasterDrive/ML project data/Xtr_reduced.pkl"
# with open(Xtr_pathname, 'wb') as output_Xtr_red:
#     pickle.dump(Xtr_reduced, output_Xtr_red, pickle.HIGHEST_PROTOCOL)
    
# Xts_pathname = "/Volumes/MasterDrive/ML project data/Xts_reduced.pkl"
# with open(Xts_pathname, 'wb') as output_Xts_red:
#     pickle.dump(Xts_reduced, output_Xts_red, pickle.HIGHEST_PROTOCOL)

In [7]:
# ytr_pathname = "/Volumes/MasterDrive/ML project data/ytr.pkl"
# with open(ytr_pathname, 'wb') as output_ytr:
#     pickle.dump(ytr, output_ytr, pickle.HIGHEST_PROTOCOL)

# yts_pathname = "/Volumes/MasterDrive/ML project data/yts.pkl"
# with open(yts_pathname, 'wb') as output_yts:
#     pickle.dump(yts, output_yts, pickle.HIGHEST_PROTOCOL)

In [8]:
import os
import pickle

# Xtr and Xts are the processed data that have been modified 
# via scaling and PCA.

def _load_pickle(filename):
    """Unpickle ``filename`` from the current working directory.

    Returns a ``(path, obj)`` pair: the full path that was opened and the
    object that was stored in the file.
    """
    # os.path.join inserts the platform's path separator instead of a
    # hard-coded "/".
    path = os.path.join(os.getcwd(), filename)
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load .pkl files that you produced yourself.
    with open(path, 'rb') as infile:
        return path, pickle.load(infile)


# Feature matrices and labels for the train / test split, read from the
# four .pkl files expected in the working directory.
Xtr_pathname, Xtr_reduced = _load_pickle("Xtr_reduced.pkl")
Xts_pathname, Xts_reduced = _load_pickle("Xts_reduced.pkl")
ytr_pathname, ytr = _load_pickle("ytr.pkl")
yts_pathname, yts = _load_pickle("yts.pkl")

In [9]:
from sklearn.utils import resample
import numpy as np

# since the number of papers with nonzero citations is so low, upsampling could solve the imbalance

# Split the training rows by label: rows labelled 0 are treated as the
# majority class and rows labelled 1 as the minority class.
majority_mask = (ytr == 0)
Xtr_majority = Xtr_reduced[majority_mask]
n_class = Xtr_majority.shape[0]
Xtr_minority = Xtr_reduced[ytr == 1]

# Draw minority rows with replacement until there are as many of them as
# there are majority rows. The draw is seeded so it is repeatable.
Xtr_minority_upsampled = resample(
    Xtr_minority,
    replace=True,
    n_samples=n_class,
    random_state=123,
)

# Every upsampled row carries label 1, so its label vector is all ones.
ytr_majority = ytr[majority_mask]
ytr_minority_upsampled = np.ones(n_class)

Xtr_upsampled = np.concatenate([Xtr_majority, Xtr_minority_upsampled])
ytr_upsampled = np.concatenate([ytr_majority, ytr_minority_upsampled]).astype(int)

# Shuffle features and labels with the same permutation so they stay
# aligned. A seeded generator is used (instead of the global, unseeded
# np.random state) so that re-running the notebook yields the same row order.
permutation = np.random.RandomState(123).permutation(Xtr_upsampled.shape[0])

Xtr_upsampled = Xtr_upsampled[permutation]
ytr_upsampled = ytr_upsampled[permutation]

In [17]:
from sklearn.svm import LinearSVC
import numpy as np
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the linear SVM, searched exhaustively below.
grid_dict_linsvc = {
    # 11 log-spaced regularisation strengths, roughly 1e-5 up to 1e5.
    'C': np.logspace(1, 11, 11) * 1e-6,
    'loss': ['hinge', 'squared_hinge'],
    # Single large iteration cap applied to every candidate.
    'max_iter': [100000],
}
grid_list_linsvc = [grid_dict_linsvc]

linsvm = LinearSVC()

# Candidates are ranked by F1 rather than accuracy: the dataset is
# imbalanced, and F1 remains the more appropriate score even though the
# training set has been upsampled.
# NOTE(review): the upsampled training set was built by resampling with
# replacement, so copies of one row may fall into both the fitting and the
# validation part of a CV fold -- CV scores may be optimistic; confirm.
grid_linsvc = GridSearchCV(
    estimator=linsvm,
    param_grid=grid_list_linsvc,
    scoring='f1',
    cv=5,
)
grid_linsvc.fit(Xtr_upsampled, ytr_upsampled)

# Predictions on the held-out test features from the fitted grid search.
yhat = grid_linsvc.predict(Xts_reduced)



In [26]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Report, one value per line and in this order: F1, precision, recall of the
# test-set predictions.
for _metric in (f1_score, precision_score, recall_score):
    print(_metric(yts, yhat))

0.9041572903673796
0.8615384615384616
0.9512121212121212


In [24]:
# from sklearn.linear_model import LogisticRegression

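# Logistic-regression variant of the grid search above (disabled).
# NOTE: the original note here read "without resampling, but setting the
# class_weight hyperparameter in the LinearSVC class to 'balanced'", but the
# code below fits LogisticRegression on the upsampled training data and never
# sets class_weight.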
# logreg = LogisticRegression()
# grid_dict_logreg = {}
# grid_dict_logreg['C'] = np.logspace(1,11,11) * 1e-6
# grid_dict_logreg['solver'] = ['lbfgs', 'liblinear']
# grid_dict_logreg['max_iter'] = [100000]
# grid_list_logreg = [grid_dict_logreg]

# grid_logreg = GridSearchCV(estimator=logreg, param_grid=grid_list_logreg, scoring='f1', cv=5)
# grid_logreg.fit(Xtr_upsampled, ytr_upsampled)
# yhat = grid_logreg.predict(Xts_reduced)

# F1 score is more appropriate due to imbalanced dataset

In [25]:
# print(f1_score(yts,yhat))
# print(precision_score(yts,yhat))
# print(recall_score(yts,yhat))