# Link prediction with machine learning

In [20]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import operator
import re
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix

In [3]:
# Load the link-prediction tables, one CSV per city and one per actor type.
# The directory is defined once so the notebook can be pointed at another
# location by editing a single line instead of nine.
DATA_DIR = '/Users/juliencarbonnell/Desktop/Thèse/DONNÉES/1.Twitter/SNA/link prediction files'

# City tables
taipei_edge = pd.read_csv(f'{DATA_DIR}/taipei_edge.csv')
telaviv_edge = pd.read_csv(f'{DATA_DIR}/telaviv_edge.csv')
tallinn_edge = pd.read_csv(f'{DATA_DIR}/tallinn_edge.csv')

# Actor-type tables
public_edge = pd.read_csv(f'{DATA_DIR}/public_edge.csv')
corpo_edge = pd.read_csv(f'{DATA_DIR}/corpo_edge.csv')
startup_edge = pd.read_csv(f'{DATA_DIR}/startup_edge.csv')
academic_edge = pd.read_csv(f'{DATA_DIR}/academic_edge.csv')
civil_edge = pd.read_csv(f'{DATA_DIR}/civil_edge.csv')
media_edge = pd.read_csv(f'{DATA_DIR}/media_edge.csv')

# 1. Link Prediction

# Taipei

In [92]:
# Separate the Taipei table into target and features: 'isedge?' is the label;
# the '0' and '1' columns (presumably the two node identifiers - TODO confirm)
# and the label itself are excluded from the feature matrix.
y_tp = taipei_edge['isedge?']
X_tp = taipei_edge.drop(columns=['0', '1', 'isedge?'])

In [93]:
y_tp.value_counts()
# Highly unbalanced: label 0 vastly outnumbers label 1 (see the counts below),
# so the majority class is undersampled before training.

0    39599685
1        9812
Name: isedge?, dtype: int64

In [94]:
# Balance the Taipei classes by randomly undersampling the majority class.
# The sampler is seeded (same seed as the train/test split) so the same rows
# are drawn on every run and the reported metrics are reproducible.
rus = RandomUnderSampler(random_state=35)
X_tpspl, y_tpspl = rus.fit_resample(X_tp, y_tp)

In [95]:
# Class counts after undersampling; both labels should now be the same size.
y_tpspl.value_counts()

0    9812
1    9812
Name: isedge?, dtype: int64

In [18]:
# Hold out 30% of the balanced Taipei sample for testing (fixed seed).
X_tptrain, X_tptest, y_tptrain, y_tptest = train_test_split(
    X_tpspl,
    y_tpspl,
    test_size=0.3,
    random_state=35,
)

In [21]:
# Fit a random forest on the Taipei training split and predict hard 0/1 labels
# for the test split. random_state is fixed so the forest is reproducible
# across runs.
rf = RandomForestClassifier(random_state=35)
rf.fit(X_tptrain, y_tptrain)
tp_pred = rf.predict(X_tptest)

In [23]:
# Evaluate the Taipei predictions on the held-out split. Printed in order:
# accuracy, F1, ROC AUC, recall, precision, MAE, MSE, RMSE, R^2, 1 / (1 - R^2).
# NOTE(review): roc_auc_score is fed hard 0/1 predictions rather than class
# probabilities; consider scoring predict_proba output instead - confirm intent.
for metric in (accuracy_score, f1_score, roc_auc_score,
               recall_score, precision_score, mean_absolute_error):
    print(metric(y_tptest, tp_pred))

# Compute MSE and R^2 once and reuse them for the derived quantities.
tp_mse = mean_squared_error(y_tptest, tp_pred)
tp_r2 = r2_score(y_tptest, tp_pred)
print(tp_mse)
print(np.sqrt(tp_mse))
print(tp_r2)
# A perfect fit gives R^2 == 1, which would divide by zero; report inf instead.
print(1 / (1 - tp_r2) if tp_r2 != 1 else np.inf)

0.9308763586956522
0.9305816135084428
0.9309091320264286
0.9228687415426252
0.9384244926040591
0.06912364130434782
0.06912364130434782
0.26291375259645094
0.7235008408915111
3.6166475269736136


# Tel Aviv

In [32]:
# Separate the Tel Aviv table into target and features: 'isedge?' is the label;
# the '0' and '1' columns (presumably the two node identifiers - TODO confirm)
# and the label itself are excluded from the feature matrix.
y_tlv = telaviv_edge['isedge?']
X_tlv = telaviv_edge.drop(columns=['0', '1', 'isedge?'])

In [33]:
y_tlv.value_counts()
# Highly unbalanced: label 0 vastly outnumbers label 1 (see the counts below),
# so the majority class is undersampled before training.

0    214770053
1        23942
Name: isedge?, dtype: int64

In [34]:
# Balance the Tel Aviv classes by randomly undersampling the majority class.
# The sampler is seeded (same seed as the train/test split) so the same rows
# are drawn on every run and the reported metrics are reproducible.
rus = RandomUnderSampler(random_state=35)
X_tlvspl, y_tlvspl = rus.fit_resample(X_tlv, y_tlv)

In [35]:
# Class counts after undersampling; both labels should now be the same size.
y_tlvspl.value_counts()

0    23942
1    23942
Name: isedge?, dtype: int64

In [36]:
# Hold out 30% of the balanced Tel Aviv sample for testing (fixed seed).
X_tlvtrain, X_tlvtest, y_tlvtrain, y_tlvtest = train_test_split(
    X_tlvspl,
    y_tlvspl,
    test_size=0.3,
    random_state=35,
)

In [39]:
# Fit a random forest on the Tel Aviv training split and predict hard 0/1
# labels for the test split. random_state is fixed so the forest is
# reproducible across runs.
rf = RandomForestClassifier(random_state=35)
rf.fit(X_tlvtrain, y_tlvtrain)
tlv_pred = rf.predict(X_tlvtest)

In [40]:
# Evaluate the Tel Aviv predictions on the held-out split. Printed in order:
# accuracy, F1, ROC AUC, recall, precision, MAE, MSE, RMSE, R^2, 1 / (1 - R^2).
# NOTE(review): roc_auc_score is fed hard 0/1 predictions rather than class
# probabilities; consider scoring predict_proba output instead - confirm intent.
for metric in (accuracy_score, f1_score, roc_auc_score,
               recall_score, precision_score, mean_absolute_error):
    print(metric(y_tlvtest, tlv_pred))

# Compute MSE and R^2 once and reuse them for the derived quantities.
tlv_mse = mean_squared_error(y_tlvtest, tlv_pred)
tlv_r2 = r2_score(y_tlvtest, tlv_pred)
print(tlv_mse)
print(np.sqrt(tlv_mse))
print(tlv_r2)
# A perfect fit gives R^2 == 1, which would divide by zero; report inf instead.
print(1 / (1 - tlv_r2) if tlv_r2 != 1 else np.inf)

0.9401364332451622
0.9394792399718508
0.9401740393696758
0.9427966101694916
0.9361851332398317
0.05986356675483781
0.05986356675483781
0.24467032258702281
0.7604964865673818
4.175304093321117


# Tallinn

In [41]:
# Separate the Tallinn table into target and features: 'isedge?' is the label;
# the '0' and '1' columns (presumably the two node identifiers - TODO confirm)
# and the label itself are excluded from the feature matrix.
y_tl = tallinn_edge['isedge?']
X_tl = tallinn_edge.drop(columns=['0', '1', 'isedge?'])

In [42]:
y_tl.value_counts()
# Highly unbalanced: label 0 vastly outnumbers label 1 (see the counts below),
# so the majority class is undersampled before training.

0    25441924
1        8669
Name: isedge?, dtype: int64

In [43]:
# Balance the Tallinn classes by randomly undersampling the majority class.
# A fresh, seeded sampler is created here so this cell neither depends on a
# `rus` object left over from an earlier cell nor varies between runs.
rus = RandomUnderSampler(random_state=35)
X_tlspl, y_tlspl = rus.fit_resample(X_tl, y_tl)

In [44]:
# Class counts after undersampling; both labels should now be the same size.
y_tlspl.value_counts()

0    8669
1    8669
Name: isedge?, dtype: int64

In [45]:
# Hold out 30% of the balanced Tallinn sample for testing (fixed seed).
X_tltrain, X_tltest, y_tltrain, y_tltest = train_test_split(
    X_tlspl,
    y_tlspl,
    test_size=0.3,
    random_state=35,
)

In [46]:
# Fit a random forest on the Tallinn training split and predict hard 0/1
# labels for the test split. random_state is fixed so the forest is
# reproducible across runs.
rf = RandomForestClassifier(random_state=35)
rf.fit(X_tltrain, y_tltrain)
tl_pred = rf.predict(X_tltest)

In [47]:
# Evaluate the Tallinn predictions on the held-out split. Printed in order:
# accuracy, F1, ROC AUC, recall, precision, MAE, MSE, RMSE, R^2, 1 / (1 - R^2).
# NOTE(review): roc_auc_score is fed hard 0/1 predictions rather than class
# probabilities; consider scoring predict_proba output instead - confirm intent.
for metric in (accuracy_score, f1_score, roc_auc_score,
               recall_score, precision_score, mean_absolute_error):
    print(metric(y_tltest, tl_pred))

# Compute MSE and R^2 once and reuse them for the derived quantities.
tl_mse = mean_squared_error(y_tltest, tl_pred)
tl_r2 = r2_score(y_tltest, tl_pred)
print(tl_mse)
print(np.sqrt(tl_mse))
print(tl_r2)
# A perfect fit gives R^2 == 1, which would divide by zero; report inf instead.
print(1 / (1 - tl_r2) if tl_r2 != 1 else np.inf)

0.9361783929257977
0.9358578052550233
0.9362213799528962
0.926903941829315
0.944986344127975
0.06382160707420223
0.06382160707420223
0.25262938679853186
0.7447081377141389
3.9170852962021083


# Public

In [48]:
# Separate the public table into target and features: 'isedge?' is the label;
# the '0' and '1' columns (presumably the two node identifiers - TODO confirm)
# and the label itself are excluded from the feature matrix.
y_pub = public_edge['isedge?']
X_pub = public_edge.drop(columns=['0', '1', 'isedge?'])

In [49]:
y_pub.value_counts()
# Highly unbalanced: label 0 vastly outnumbers label 1 (see the counts below),
# so the majority class is undersampled before training.

0    2338189
1       2179
Name: isedge?, dtype: int64

In [50]:
# Balance the public classes by randomly undersampling the majority class.
# A fresh, seeded sampler is created here so this cell neither depends on a
# `rus` object left over from an earlier cell nor varies between runs.
rus = RandomUnderSampler(random_state=35)
X_pubspl, y_pubspl = rus.fit_resample(X_pub, y_pub)

In [51]:
# Class counts after undersampling; both labels should now be the same size.
y_pubspl.value_counts()

0    2179
1    2179
Name: isedge?, dtype: int64

In [52]:
# Hold out 30% of the balanced public sample for testing (fixed seed).
X_pubtrain, X_pubtest, y_pubtrain, y_pubtest = train_test_split(
    X_pubspl,
    y_pubspl,
    test_size=0.3,
    random_state=35,
)

In [53]:
# Fit a random forest on the public training split and predict hard 0/1
# labels for the test split. random_state is fixed so the forest is
# reproducible across runs.
rf = RandomForestClassifier(random_state=35)
rf.fit(X_pubtrain, y_pubtrain)
pub_pred = rf.predict(X_pubtest)

In [54]:
# Evaluate the public predictions on the held-out split. Printed in order:
# accuracy, F1, ROC AUC, recall, precision, MAE, MSE, RMSE, R^2, 1 / (1 - R^2).
# NOTE(review): roc_auc_score is fed hard 0/1 predictions rather than class
# probabilities; consider scoring predict_proba output instead - confirm intent.
for metric in (accuracy_score, f1_score, roc_auc_score,
               recall_score, precision_score, mean_absolute_error):
    print(metric(y_pubtest, pub_pred))

# Compute MSE and R^2 once and reuse them for the derived quantities.
pub_mse = mean_squared_error(y_pubtest, pub_pred)
pub_r2 = r2_score(y_pubtest, pub_pred)
print(pub_mse)
print(np.sqrt(pub_mse))
print(pub_r2)
# A perfect fit gives R^2 == 1 (as recorded for this table), which would
# divide by zero; report inf explicitly instead of triggering a warning/error.
print(1 / (1 - pub_r2) if pub_r2 != 1 else np.inf)

1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
inf


  print(1/(1-r2_score(y_pubtest, pub_pred)))


# Corpo

In [56]:
# Separate the corpo table into target and features: 'isedge?' is the label;
# the '0' and '1' columns (presumably the two node identifiers - TODO confirm)
# and the label itself are excluded from the feature matrix.
y_crp = corpo_edge['isedge?']
X_crp = corpo_edge.drop(columns=['0', '1', 'isedge?'])

In [57]:
y_crp.value_counts()
# Highly unbalanced: label 0 vastly outnumbers label 1 (see the counts below),
# so the majority class is undersampled before training.

0    31406711
1        7993
Name: isedge?, dtype: int64

In [58]:
# Balance the corpo classes by randomly undersampling the majority class.
# A fresh, seeded sampler is created here so this cell neither depends on a
# `rus` object left over from an earlier cell nor varies between runs.
rus = RandomUnderSampler(random_state=35)
X_crpspl, y_crpspl = rus.fit_resample(X_crp, y_crp)

In [59]:
# Class counts after undersampling; both labels should now be the same size.
y_crpspl.value_counts()

0    7993
1    7993
Name: isedge?, dtype: int64

In [60]:
# Hold out 30% of the balanced corpo sample for testing (fixed seed).
X_crptrain, X_crptest, y_crptrain, y_crptest = train_test_split(
    X_crpspl,
    y_crpspl,
    test_size=0.3,
    random_state=35,
)

In [61]:
# Fit a random forest on the corpo training split and predict hard 0/1
# labels for the test split. random_state is fixed so the forest is
# reproducible across runs.
rf = RandomForestClassifier(random_state=35)
rf.fit(X_crptrain, y_crptrain)
crp_pred = rf.predict(X_crptest)

In [62]:
# Evaluate the corpo predictions on the held-out split. Printed in order:
# accuracy, F1, ROC AUC, recall, precision, MAE, MSE, RMSE, R^2, 1 / (1 - R^2).
# NOTE(review): roc_auc_score is fed hard 0/1 predictions rather than class
# probabilities; consider scoring predict_proba output instead - confirm intent.
for metric in (accuracy_score, f1_score, roc_auc_score,
               recall_score, precision_score, mean_absolute_error):
    print(metric(y_crptest, crp_pred))

# Compute MSE and R^2 once and reuse them for the derived quantities.
crp_mse = mean_squared_error(y_crptest, crp_pred)
crp_r2 = r2_score(y_crptest, crp_pred)
print(crp_mse)
print(np.sqrt(crp_mse))
print(crp_r2)
# A perfect fit gives R^2 == 1, which would divide by zero; report inf instead.
print(1 / (1 - crp_r2) if crp_r2 != 1 else np.inf)

0.999791492910759
0.9997918834547346
0.9997911445279866
1.0
0.9995838535164377
0.0002085070892410342
0.0002085070892410342
0.014439774556447694
0.9991659693224179
1198.9966638865167


# Startup

In [63]:
# Separate the startup table into target and features: 'isedge?' is the label;
# the '0' and '1' columns (presumably the two node identifiers - TODO confirm)
# and the label itself are excluded from the feature matrix.
y_str = startup_edge['isedge?']
X_str = startup_edge.drop(columns=['0', '1', 'isedge?'])

In [64]:
y_str.value_counts()
# Highly unbalanced: label 0 vastly outnumbers label 1 (see the counts below),
# so the majority class is undersampled before training.

0    32687551
1        8193
Name: isedge?, dtype: int64

In [65]:
# Balance the startup classes by randomly undersampling the majority class.
# A fresh, seeded sampler is created here so this cell neither depends on a
# `rus` object left over from an earlier cell nor varies between runs.
rus = RandomUnderSampler(random_state=35)
X_strspl, y_strspl = rus.fit_resample(X_str, y_str)

In [66]:
# Class counts after undersampling; both labels should now be the same size.
y_strspl.value_counts()

0    8193
1    8193
Name: isedge?, dtype: int64

In [67]:
# Hold out 30% of the balanced startup sample for testing (fixed seed).
X_strtrain, X_strtest, y_strtrain, y_strtest = train_test_split(
    X_strspl,
    y_strspl,
    test_size=0.3,
    random_state=35,
)

In [68]:
# Fit a random forest on the startup training split and predict hard 0/1
# labels for the test split. random_state is fixed so the forest is
# reproducible across runs.
rf = RandomForestClassifier(random_state=35)
rf.fit(X_strtrain, y_strtrain)
str_pred = rf.predict(X_strtest)

In [69]:
# Evaluate the startup predictions on the held-out split. Printed in order:
# accuracy, F1, ROC AUC, recall, precision, MAE, MSE, RMSE, R^2, 1 / (1 - R^2).
# NOTE(review): roc_auc_score is fed hard 0/1 predictions rather than class
# probabilities; consider scoring predict_proba output instead - confirm intent.
for metric in (accuracy_score, f1_score, roc_auc_score,
               recall_score, precision_score, mean_absolute_error):
    print(metric(y_strtest, str_pred))

# Compute MSE and R^2 once and reuse them for the derived quantities.
str_mse = mean_squared_error(y_strtest, str_pred)
str_r2 = r2_score(y_strtest, str_pred)
print(str_mse)
print(np.sqrt(str_mse))
print(str_r2)
# A perfect fit gives R^2 == 1, which would divide by zero; report inf instead.
print(1 / (1 - str_r2) if str_r2 != 1 else np.inf)

0.9997965825874695
0.9997980206018987
0.9997951659156084
1.0
0.9995961227786753
0.0002034174125305126
0.0002034174125305126
0.014262447634628238
0.9991862914271763
1228.9412123677607


# Academic

In [70]:
# Separate the academic table into target and features: 'isedge?' is the label;
# the '0' and '1' columns (presumably the two node identifiers - TODO confirm)
# and the label itself are excluded from the feature matrix.
y_aca = academic_edge['isedge?']
X_aca = academic_edge.drop(columns=['0', '1', 'isedge?'])

In [71]:
y_aca.value_counts()
# Highly unbalanced: label 0 vastly outnumbers label 1 (see the counts below),
# so the majority class is undersampled before training.

0    9016344
1       4287
Name: isedge?, dtype: int64

In [72]:
# Balance the academic classes by randomly undersampling the majority class.
# A fresh, seeded sampler is created here so this cell neither depends on a
# `rus` object left over from an earlier cell nor varies between runs.
rus = RandomUnderSampler(random_state=35)
X_acaspl, y_acaspl = rus.fit_resample(X_aca, y_aca)

In [73]:
# Class counts after undersampling; both labels should now be the same size.
y_acaspl.value_counts()

0    4287
1    4287
Name: isedge?, dtype: int64

In [74]:
# Hold out 30% of the balanced academic sample for testing (fixed seed).
X_acatrain, X_acatest, y_acatrain, y_acatest = train_test_split(
    X_acaspl,
    y_acaspl,
    test_size=0.3,
    random_state=35,
)

In [75]:
# Fit a random forest on the academic training split and predict hard 0/1
# labels for the test split. random_state is fixed so the forest is
# reproducible across runs.
rf = RandomForestClassifier(random_state=35)
rf.fit(X_acatrain, y_acatrain)
aca_pred = rf.predict(X_acatest)

In [76]:
# Evaluate the academic predictions on the held-out split. Printed in order:
# accuracy, F1, ROC AUC, recall, precision, MAE, MSE, RMSE, R^2, 1 / (1 - R^2).
# NOTE(review): roc_auc_score is fed hard 0/1 predictions rather than class
# probabilities; consider scoring predict_proba output instead - confirm intent.
for metric in (accuracy_score, f1_score, roc_auc_score,
               recall_score, precision_score, mean_absolute_error):
    print(metric(y_acatest, aca_pred))

# Compute MSE and R^2 once and reuse them for the derived quantities.
aca_mse = mean_squared_error(y_acatest, aca_pred)
aca_r2 = r2_score(y_acatest, aca_pred)
print(aca_mse)
print(np.sqrt(aca_mse))
print(aca_r2)
# A perfect fit gives R^2 == 1, which would divide by zero; report inf instead.
print(1 / (1 - aca_r2) if aca_r2 != 1 else np.inf)

0.9992226972405752
0.9992163009404388
0.9992295839753467
1.0
0.9984338292873923
0.000777302759424796
0.000777302759424796
0.027880149917545206
0.9968905404997129
321.599300427511


# Civil

In [77]:
# Separate the civil table into target and features: 'isedge?' is the label;
# the '0' and '1' columns (presumably the two node identifiers - TODO confirm)
# and the label itself are excluded from the feature matrix.
y_civ = civil_edge['isedge?']
X_civ = civil_edge.drop(columns=['0', '1', 'isedge?'])

In [78]:
y_civ.value_counts()
# Highly unbalanced: label 0 vastly outnumbers label 1 (see the counts below),
# so the majority class is undersampled before training.

0    3071474
1       2489
Name: isedge?, dtype: int64

In [79]:
# Balance the civil classes by randomly undersampling the majority class.
# A fresh, seeded sampler is created here so this cell neither depends on a
# `rus` object left over from an earlier cell nor varies between runs.
rus = RandomUnderSampler(random_state=35)
X_civspl, y_civspl = rus.fit_resample(X_civ, y_civ)

In [80]:
# Class counts after undersampling; both labels should now be the same size.
y_civspl.value_counts()

0    2489
1    2489
Name: isedge?, dtype: int64

In [81]:
# Hold out 30% of the balanced civil sample for testing (fixed seed).
X_civtrain, X_civtest, y_civtrain, y_civtest = train_test_split(
    X_civspl,
    y_civspl,
    test_size=0.3,
    random_state=35,
)

In [82]:
# Fit a random forest on the civil training split and predict hard 0/1
# labels for the test split. random_state is fixed so the forest is
# reproducible across runs.
rf = RandomForestClassifier(random_state=35)
rf.fit(X_civtrain, y_civtrain)
civ_pred = rf.predict(X_civtest)

In [83]:
# Evaluate the civil predictions on the held-out split. Printed in order:
# accuracy, F1, ROC AUC, recall, precision, MAE, MSE, RMSE, R^2, 1 / (1 - R^2).
# NOTE(review): roc_auc_score is fed hard 0/1 predictions rather than class
# probabilities; consider scoring predict_proba output instead - confirm intent.
for metric in (accuracy_score, f1_score, roc_auc_score,
               recall_score, precision_score, mean_absolute_error):
    print(metric(y_civtest, civ_pred))

# Compute MSE and R^2 once and reuse them for the derived quantities.
civ_mse = mean_squared_error(y_civtest, civ_pred)
civ_r2 = r2_score(y_civtest, civ_pred)
print(civ_mse)
print(np.sqrt(civ_mse))
print(civ_r2)
# A perfect fit gives R^2 == 1, which would divide by zero; report inf instead.
print(1 / (1 - civ_r2) if civ_r2 != 1 else np.inf)

0.9973226238286479
0.9972565157750343
0.9973210683450335
0.9972565157750343
0.9972565157750343
0.002677376171352075
0.002677376171352075
0.05174336838042219
0.9892842733801339
93.32078313252985


# Media

In [84]:
# Separate the media table into target and features: 'isedge?' is the label;
# the '0' and '1' columns (presumably the two node identifiers - TODO confirm)
# and the label itself are excluded from the feature matrix.
y_med = media_edge['isedge?']
X_med = media_edge.drop(columns=['0', '1', 'isedge?'])

In [85]:
y_med.value_counts()
# Highly unbalanced: label 0 vastly outnumbers label 1 (see the counts below),
# so the majority class is undersampled before training.

0    3350150
1       2607
Name: isedge?, dtype: int64

In [86]:
# Balance the media classes by randomly undersampling the majority class.
# A fresh, seeded sampler is created here so this cell neither depends on a
# `rus` object left over from an earlier cell nor varies between runs.
rus = RandomUnderSampler(random_state=35)
X_medspl, y_medspl = rus.fit_resample(X_med, y_med)

In [87]:
# Class counts after undersampling; both labels should now be the same size.
y_medspl.value_counts()

0    2607
1    2607
Name: isedge?, dtype: int64

In [88]:
# Hold out 30% of the balanced media sample for testing (fixed seed).
X_medtrain, X_medtest, y_medtrain, y_medtest = train_test_split(
    X_medspl,
    y_medspl,
    test_size=0.3,
    random_state=35,
)

In [89]:
# Fit a random forest on the media training split and predict hard 0/1
# labels for the test split. random_state is fixed so the forest is
# reproducible across runs.
rf = RandomForestClassifier(random_state=35)
rf.fit(X_medtrain, y_medtrain)
med_pred = rf.predict(X_medtest)

In [90]:
# Evaluate the media predictions on the held-out split. Printed in order:
# accuracy, F1, ROC AUC, recall, precision, MAE, MSE, RMSE, R^2, 1 / (1 - R^2).
# NOTE(review): roc_auc_score is fed hard 0/1 predictions rather than class
# probabilities; consider scoring predict_proba output instead - confirm intent.
for metric in (accuracy_score, f1_score, roc_auc_score,
               recall_score, precision_score, mean_absolute_error):
    print(metric(y_medtest, med_pred))

# Compute MSE and R^2 once and reuse them for the derived quantities.
med_mse = mean_squared_error(y_medtest, med_pred)
med_r2 = r2_score(y_medtest, med_pred)
print(med_mse)
print(np.sqrt(med_mse))
print(med_r2)
# A perfect fit gives R^2 == 1, which would divide by zero; report inf instead.
print(1 / (1 - med_r2) if med_r2 != 1 else np.inf)

0.9987220447284345
0.9987228607918263
0.9987228607918264
1.0
0.9974489795918368
0.0012779552715654952
0.0012779552715654952
0.03574850027015812
0.9948881768266193
195.62492012779444
