In [532]:
import pandas as pd
import networkx as nx
from tqdm import tqdm
import itertools
from sys import maxsize
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder

In [551]:
data = pd.read_csv("data2.csv")

In [514]:
data = data[:].loc[data.Source == 'Climate Dynamics']

# 1. Initiallize your classification set as follows

In [515]:
def graph_assemble(d, year_begin, year_end):
    data = d.copy()
    data["Authors"] = data["Authors"].apply(lambda x: x.split("; "))
    data["Authors"] = data["Authors"].apply(lambda x: x[:200] if len(x) > 200 else x)

    d1 = data["Authors"].loc[(data.Year >= year_begin) & (data.Year <= year_end - 4)].values
    d2 = data["Authors"].loc[(data.Year >= year_end - 4) & (data.Year <= year_end)].values

    coauthors1, coauthors2 = {}, {}

    for art in tqdm(d1):
        for x in itertools.permutations(art, r=2):
            coauthors1[x] = coauthors1.get(x, 0) + 1

    for art in tqdm(d2):
        for x in itertools.permutations(art, r=2):
            coauthors2[x] = coauthors2.get(x, 0) + 1

    G1, G2 = nx.Graph(), nx.Graph()
    G1.add_edges_from([key + tuple([{'weight': value}]) for key,value in coauthors1.items()])
    G2.add_edges_from([key + tuple([{'weight': value}]) for key,value in coauthors2.items()])
    
    return G1, G2

In [542]:
def make_sample(G1, G2, year):
    node2vec = Node2Vec(G1)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
    comp_G1 = nx.complement(G1)
    for v1, v2 in comp_G1.edges():
        if (v1 in G2.nodes() and v2 in G2.nodes()):
            nb = len(list(nx.common_neighbors(G1, v1, v2)))
            jc = list(nx.jaccard_coefficient(G1, [(v1, v2)]))[0][2]
            if ((v1, v2) in G2.edges()):
                A.append({'target':1,
                          'Auth1':v1,
                          'Auth2':v2,
                          'shortest_path_length':1,
                          'common_neighbors':nb,
                          'jaccard_coefficient':jc,
                          'year_of_addition':year,
                          'density':nx.density(G1),
                          'node2vek1':edges_embs[(v1, v2)][0],
                          'node2vek2':edges_embs[(v1, v2)][1],
                          'node2vek3':edges_embs[(v1, v2)][2]})
            else: 
                if (nx.has_path(G1, v1, v2)):
                    length = nx.shortest_path_length(G1, v1, v2)
                else:
                    length =  -1
                A.append({'target':0,
                          'Auth1':v1,
                          'Auth2':v2,
                          'shortest_path_length':length,
                          'common_neighbors':nb,
                          'jaccard_coefficient':jc,
                          'year_of_addition':year,
                          'density':nx.density(G1),
                          'node2vek1':edges_embs[(v1, v2)][0],
                          'node2vek2':edges_embs[(v1, v2)][1],
                          'node2vek3':edges_embs[(v1, v2)][2]})

# 2. Construct feature space

In [549]:
A = []
for year in sorted(data['Year'].unique()):
    if (year == 2010):
        break
    G1, G2 = graph_assemble(data, year, year + 8)
    make_sample(G1, G2, year + 8)



  0%|          | 0/4 [00:00<?, ?it/s][A[A

100%|██████████| 4/4 [00:00<00:00, 2328.23it/s][A[A

  0%|          | 0/19 [00:00<?, ?it/s][A[A

100%|██████████| 19/19 [00:00<00:00, 10154.41it/s][A[A

Computing transition probabilities:   0%|          | 0/6 [00:00<?, ?it/s][A[A

Computing transition probabilities: 100%|██████████| 6/6 [00:00<00:00, 3223.91it/s][A[A

Generating walks (CPU: 1):   0%|          | 0/10 [00:00<?, ?it/s][A[A

Generating walks (CPU: 1):  30%|███       | 3/10 [00:00<00:00, 27.10it/s][A[A

Generating walks (CPU: 1):  60%|██████    | 6/10 [00:00<00:00, 27.60it/s][A[A

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:00<00:00, 29.75it/s][A[A

[A[A

  0%|          | 0/17 [00:00<?, ?it/s][A[A

100%|██████████| 17/17 [00:00<00:00, 6784.32it/s][A[A

  0%|          | 0/37 [00:00<?, ?it/s][A[A

100%|██████████| 37/37 [00:00<00:00, 14315.03it/s][A[A

Computing transition probabilities:   0%|          | 0/58 [00:00<?, ?it/s][A[A

Computin

Generating walks (CPU: 1):  90%|█████████ | 9/10 [00:06<00:00,  1.33it/s][A[A

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:07<00:00,  1.30it/s][A[A

[A[A

  0%|          | 0/74 [00:00<?, ?it/s][A[A

100%|██████████| 74/74 [00:00<00:00, 16818.12it/s][A[A

  0%|          | 0/114 [00:00<?, ?it/s][A[A

100%|██████████| 114/114 [00:00<00:00, 17364.56it/s][A[A
[A
Computing transition probabilities:   0%|          | 0/248 [00:00<?, ?it/s][A
Computing transition probabilities:  23%|██▎       | 58/248 [00:00<00:00, 556.86it/s][A
Computing transition probabilities:  30%|███       | 75/248 [00:00<00:00, 318.27it/s][A
Computing transition probabilities:  89%|████████▉ | 221/248 [00:00<00:00, 657.38it/s][A
Computing transition probabilities: 100%|██████████| 248/248 [00:00<00:00, 709.75it/s][A
Generating walks (CPU: 1):   0%|          | 0/10 [00:00<?, ?it/s][A
Generating walks (CPU: 1):  20%|██        | 2/10 [00:00<00:03,  2.22it/s][A
Generating walks (CPU: 1):  30%|

KeyboardInterrupt: 

In [550]:
df = pd.DataFrame(A)

# 3. Choose at least  3 classification algorithms 

In [547]:
A = []
G1, G2 = graph_assemble(data, 2011, 2019)
node2vec = Node2Vec(G1)
model = node2vec.fit(window=10, min_count=1, batch_words=4)
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
comp_G1 = nx.complement(G1)
for v1, v2 in comp_G1.edges():
    if (v1 in G2.nodes() and v2 in G2.nodes()):
            nb = len(list(nx.common_neighbors(G1, v1, v2)))
            jc = list(nx.jaccard_coefficient(G1, [(v1, v2)]))[0][2]
            if ((v1, v2) in G2.edges()):
                A.append({'target':1,
                          'shortest_path_length':1,
                          'common_neighbors':nb,
                          'jaccard_coefficient':jc,
                          'year_of_addition':year,
                          'density':nx.density(G1),
                          'node2vek1':edges_embs[(v1, v2)][0],
                          'node2vek2':edges_embs[(v1, v2)][1],
                          'node2vek3':edges_embs[(v1, v2)][2]})
            else:
                if (nx.has_path(G1, v1, v2)):
                    length = nx.shortest_path_length(G1, v1, v2)
                else:
                    length =  -1
                A.append({'target':0,
                          'shortest_path_length':length,
                          'common_neighbors':nb,
                          'jaccard_coefficient':jc,
                          'year_of_addition':year,
                          'density':nx.density(G1),
                          'node2vek1':edges_embs[(v1, v2)][0],
                          'node2vek2':edges_embs[(v1, v2)][1],
                          'node2vek3':edges_embs[(v1, v2)][2]})
dfTest = pd.DataFrame(A)
yTest = dfTest.target
XTest = dfTest.drop(['target'], axis = 1)
yTrain = df.target
XTrain = df.drop(['target', 'Auth1', 'Auth2'], axis = 1)



  0%|          | 0/123 [00:00<?, ?it/s][A[A

100%|██████████| 123/123 [00:00<00:00, 7874.52it/s][A[A

  0%|          | 0/24 [00:00<?, ?it/s][A[A

100%|██████████| 24/24 [00:00<00:00, 2842.87it/s][A[A

Computing transition probabilities:   0%|          | 0/540 [00:00<?, ?it/s][A[A

Computing transition probabilities:   9%|▉         | 48/540 [00:00<00:01, 474.32it/s][A[A

Computing transition probabilities:  24%|██▍       | 129/540 [00:00<00:00, 635.52it/s][A[A

Computing transition probabilities:  31%|███       | 166/540 [00:00<00:00, 537.42it/s][A[A

Computing transition probabilities:  45%|████▍     | 241/540 [00:00<00:00, 586.03it/s][A[A

Computing transition probabilities:  53%|█████▎    | 288/540 [00:00<00:00, 558.82it/s][A[A

Computing transition probabilities:  62%|██████▏   | 334/540 [00:01<00:00, 289.06it/s][A[A

Computing transition probabilities:  68%|██████▊   | 369/540 [00:02<00:00, 173.83it/s][A[A

Computing transition probabilities:  73%|███████▎

In [548]:
from sklearn.svm import SVC
from sklearn import model_selection

In [521]:
clasif = SVC() #SVM
clasif.fit(XTrain, yTrain)

ans1 = clasif.predict(XTest)

In [522]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, mean_squared_error, roc_auc_score
print ('SVM recall: ', recall_score(yTest, ans1))
print ('SVM precision: ', precision_score(yTest, ans1))
print ('SVM accuracy: ', accuracy_score(yTest, ans1))
print ('SVM f1_score: ', f1_score(yTest, ans1))
print ('SVM mean_squared_error: ', mean_squared_error(yTest, ans1))
print ('AUC-ROC', roc_auc_score(yTest, ans1))
print ("GINI = {}".format(round(2 * roc_auc_score(yTest, ans1) - 1, 2)))

SVM recall:  1.0
SVM precision:  1.0
SVM accuracy:  1.0
SVM f1_score:  1.0
SVM mean_squared_error:  0.0
AUC-ROC 1.0
GINI = 1.0


In [523]:
from sklearn.neighbors import KNeighborsClassifier

clasif2 = KNeighborsClassifier(n_neighbors=3) #k-NN
clasif2.fit(XTrain, yTrain)

ans2 = clasif2.predict(XTest)

In [524]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, mean_squared_error
print ('k-NN recall: ', recall_score(yTest, ans2))
print ('k-NN precision: ', precision_score(yTest, ans2))
print ('k-NN accuracy: ', accuracy_score(yTest, ans2))
print ('k-NN f1_score: ', f1_score(yTest, ans2))
print ('k-NN mean_squared_error: ', mean_squared_error(yTest, ans2))
print ('AUC-ROC', roc_auc_score(yTest, ans2))
print ("GINI = {}".format(round(2 * roc_auc_score(yTest, ans2) - 1, 2)))

k-NN recall:  1.0
k-NN precision:  1.0
k-NN accuracy:  1.0
k-NN f1_score:  1.0
k-NN mean_squared_error:  0.0
AUC-ROC 1.0
GINI = 1.0


In [525]:
from sklearn.tree import DecisionTreeClassifier

clasif3 = DecisionTreeClassifier() #Decision Tree
clasif3.fit(XTrain, yTrain)

ans3 = clasif3.predict(XTest)

In [526]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, mean_squared_error
print ('Decision Tree recall: ', recall_score(yTest, ans3))
print ('Decision Tree precision: ', precision_score(yTest, ans3))
print ('Decision Tree accuracy: ', accuracy_score(yTest, ans3))
print ('Decision Tree f1_score: ', f1_score(yTest, ans3))
print ('Decision Tree mean_squared_error: ', mean_squared_error(yTest, ans3))
print ('AUC-ROC', roc_auc_score(yTest, ans3))
print ("GINI = {}".format(round(2 * roc_auc_score(yTest, ans3) - 1, 2)))

Decision Tree recall:  1.0
Decision Tree precision:  1.0
Decision Tree accuracy:  1.0
Decision Tree f1_score:  1.0
Decision Tree mean_squared_error:  0.0
AUC-ROC 1.0
GINI = 1.0


In [527]:
from sklearn import linear_model
clasif4 = linear_model.LogisticRegression(solver = 'lbfgs')
clasif4.fit(XTrain, yTrain)

ans4 = clasif4.predict(XTest)

In [528]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, mean_squared_error
print ('Logistic Regression recall: ', recall_score(yTest, ans4))
print ('Logistic Regression precision: ', precision_score(yTest, ans4))
print ('Logistic Regression accuracy: ', accuracy_score(yTest, ans4))
print ('Logistic Regression f1_score: ', f1_score(yTest, ans4))
print ('Logistic Regression mean_squared_error: ', mean_squared_error(yTest, ans4))
print ('AUC-ROC', roc_auc_score(yTest, ans4))
print ("GINI = {}".format(round(2 * roc_auc_score(yTest, ans4) - 1, 2)))

Logistic Regression recall:  0.0
Logistic Regression precision:  0.0
Logistic Regression accuracy:  0.9976019184652278
Logistic Regression f1_score:  0.0
Logistic Regression mean_squared_error:  0.002398081534772182
AUC-ROC 0.5
GINI = 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
