In [14]:
import networkx as nx
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
# Read data
X = pd.read_csv('X.csv')
X_test = pd.read_csv('X_test.csv')
y = pd.read_csv('y.csv')

In [4]:
# Feature Lists
print(X.columns)

Index(['similarity_A_B', 'similarity_Aout_B', 'similarity_A_Bin', 'aut_common',
       'n_previously_cited', 'reversed', 'adar', 'jaccard', 'jaccard_weak',
       'adar_weak', 'ID_X1', 'Year_X1', 'betweenness_centrality_X1',
       'eigenvector_centrality_X1', 'in_closeness_centrality_X1',
       'in_degree_centrality_X1', 'out_closeness_centrality_X1',
       'out_degree_centrality_X1', 'pagerank_X1', 'infomap_X1', 'ID_X2',
       'Year_X2', 'betweenness_centrality_X2', 'eigenvector_centrality_X2',
       'in_closeness_centrality_X2', 'in_degree_centrality_X2',
       'out_closeness_centrality_X2', 'out_degree_centrality_X2',
       'pagerank_X2', 'infomap_X2'],
      dtype='object')


In [5]:
# Fill nans
X = X.fillna(-9999)
X_test = X_test.fillna(-9999)

In [5]:
# Xgboost
model = XGBClassifier(n_estimators=500, n_jobs=-1, learning_rate=0.3, silent=True)

In [6]:
# Cross-Validation
cross_validate(model, X, y, cv=5, scoring=['f1'], return_train_score=True, n_jobs=-1)

{'fit_time': array([512.69574189, 453.44581246, 441.85493636, 434.37347817,
        453.05194688]),
 'score_time': array([1.7554512 , 1.75712085, 1.92038798, 1.93858004, 1.76907086]),
 'test_f1': array([0.99912039, 0.99938859, 0.99935878, 0.9991949 , 0.99925442]),
 'train_f1': array([0.99996084, 0.99995151, 0.99995711, 0.99994033, 0.99994033])}

In [6]:
# Extra Trees
model = ExtraTreesClassifier(n_estimators=300, n_jobs=-1)

In [7]:
# Cross-Validation
cross_validate(model, X, y, cv=5, scoring=['f1'], return_train_score=True, n_jobs=-1)

{'fit_time': array([254.99908113, 254.84683943, 254.90521121, 253.9053359 ,
        255.06600571]),
 'score_time': array([7.59480667, 7.68340182, 7.40591455, 7.06640387, 7.61987472]),
 'test_f1': array([0.99755137, 0.99755141, 0.99748478, 0.99737334, 0.99744725]),
 'train_f1': array([1., 1., 1., 1., 1.])}

In [23]:
# Logistic Regression
model = LogisticRegression()

In [24]:
# Cross-Validation
cross_validate(model, X, y, cv=5, scoring=['f1'], return_train_score=True, n_jobs=-1)

{'fit_time': array([12.81600881, 11.8828485 , 11.18613791, 10.58151364, 10.76620865]),
 'score_time': array([0.07106113, 0.07173228, 0.07151747, 0.09582305, 0.07607293]),
 'test_f1': array([0.79389447, 0.79324484, 0.79263419, 0.73264024, 0.79398254]),
 'train_f1': array([0.79330178, 0.79315369, 0.79364184, 0.73053258, 0.79309874])}

In [9]:
# Grid Search
param_grid = {'max_depth': [10, 20, None],
              'max_features': [4, 6, 8]}

search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1).fit(X, y)

  self.best_estimator_.fit(X, y, **fit_params)


In [10]:
search.best_params_

{'max_depth': None, 'max_features': 8}

In [12]:
search.best_score_

0.997507765807982

In [7]:
# Fit the model
model.fit(X, y)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.3, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1, verbosity=1)

In [8]:
# Make predictions
pred = model.predict(X_test)
df_pred = pd.DataFrame({'id':range(len(pred)),
                        'category':pred})
df_pred.to_csv('predictions.csv', index=None)