In [1]:
import os
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.linear_model import Lasso

In [12]:
# --- training set: author ids together with the h-index regression target ---
df_train = pd.read_csv(
    'train.csv',
    dtype={'author': np.int64, 'hindex': np.float32},
)
n_train = len(df_train)

# --- test set: author ids whose h-index is predicted further down ---
df_test = pd.read_csv(
    'test.csv',
    dtype={'author': np.int64},
)
n_test = len(df_test)

# --- co-authorship graph: space-delimited edge list with integer node ids ---
G = nx.read_edgelist('coauthorship.edgelist', delimiter=' ', nodetype=int)
n_nodes, n_edges = G.number_of_nodes(), G.number_of_edges()
print('Number of nodes:', n_nodes)
print('Number of edges:', n_edges)

# structural feature shared by every node: its core number, keyed by node id
core_number = nx.core_number(G)

Number of nodes: 217801
Number of edges: 1718164


In [7]:
# Inspect the loaded test frame. 'hindex' is empty at this point (it is only
# filled in when the predictions are written back further down). The
# 'Unnamed: 0' / 'Unnamed: 0.1' columns look like index columns that were saved
# into test.csv by earlier writes -- TODO confirm and drop them at the source.
df_test

Unnamed: 0.1,Unnamed: 0,author,hindex
0,0,915630815,
1,1,1236455448,
2,2,2694593333,
3,3,2137926699,
4,4,2883694285,
...,...,...,...
43555,43555,2145559725,
43556,43556,2168342616,
43557,43557,2162797290,
43558,43558,294576894,


In [13]:
# Build the feature matrices. Each node is represented as a vector of 2 features:
# (1) its degree, (2) its core number
def _structural_features(authors):
    """Return an (n_authors, 2) float array of [degree, core number] rows.

    `authors` is a sequence of node ids taken from an 'author' column. The ids
    are read straight from the column instead of through `iterrows()`, which
    upcasts every mixed-dtype row to float64, so they stay integers. Rows are
    filled by position, so the result does not depend on the frame's index
    labels. Raises KeyError if an author has no entry in `core_number`.
    """
    features = np.zeros((len(authors), 2))
    for pos, author in enumerate(authors):
        node = int(author)
        # look up the core number first so an unknown author fails with a KeyError
        features[pos, 1] = core_number[node]
        features[pos, 0] = G.degree(node)
    return features


# training matrix and regression target
X_train = _structural_features(df_train['author'].to_numpy())
y_train = df_train['hindex'].to_numpy(dtype=np.float64)

# test matrix, built with exactly the same features as the training matrix
X_test = _structural_features(df_test['author'].to_numpy())

# train a regression model and make predictions
reg = Lasso(alpha=0.1)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [14]:
# write the predictions to file
# np.round replaces np.round_, an alias that was removed in NumPy 2.0.
# The rounded array is assigned directly (by position) rather than wrapped in a
# pd.Series, which would be aligned on df_test's index labels and could leave
# NaNs or shuffle predictions if the index is not 0..n-1.
df_test['hindex'] = np.round(y_pred, decimals=3)

# only the author id and the predicted h-index go into the submission file
df_test.loc[:, ["author", "hindex"]].to_csv('submission.csv', index=False)