# 4. feature engineering for link prediction

### extract link prediction measures from subgraphs

In [280]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import operator
import re
import matplotlib.pyplot as plt

from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
# import subgraph files
# Folder holding the exported GraphML subgraphs.  This is the only place to
# edit when the data lives somewhere else; every file below is read from it.
SUBGRAPH_DIR = '/Users/juliencarbonnell/Desktop/Thèse/DONNÉES/1.Twitter/SNA/subgraph files'


def load_subgraph(name):
    """Read ``<SUBGRAPH_DIR>/<name>_lstsub.graphml`` and return the graph.

    Parameters
    ----------
    name : str
        Prefix of the file name, e.g. ``'taipei'`` for ``taipei_lstsub.graphml``.

    Returns
    -------
    The graph object produced by ``nx.read_graphml`` for that file.
    """
    return nx.read_graphml(f'{SUBGRAPH_DIR}/{name}_lstsub.graphml')


# per-city subgraphs
taipei_lstsub = load_subgraph('taipei')
telaviv_lstsub = load_subgraph('telaviv')
tallinn_lstsub = load_subgraph('tallinn')

# subgraphs named after account categories (naming taken from the file names)
public_lstsub = load_subgraph('public')
corpo_lstsub = load_subgraph('corpo')
startup_lstsub = load_subgraph('startup')
academic_lstsub = load_subgraph('academic')
civil_lstsub = load_subgraph('civil')
media_lstsub = load_subgraph('media')

# 1. Link prediction

### Build a dataframe of edges and non-edges of each network

In [229]:
# taipei
# Labelled pair list: every unconnected node pair gets isedge? = 0 and every
# existing edge gets isedge? = 1.  Everything is built in this single cell so
# that re-running it can neither append the edge rows a second time nor reset
# the labels of an already concatenated frame.
taipei_non_edge = pd.DataFrame(list(nx.non_edges(taipei_lstsub)))
taipei_non_edge['isedge?'] = 0
taipei_edge2 = pd.DataFrame(list(taipei_lstsub.edges()))
taipei_edge2['isedge?'] = 1
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
taipei_edge = pd.concat([taipei_non_edge, taipei_edge2], axis=0)
taipei_edge

Unnamed: 0,0,1,isedge?
0,corverstag,BarryJenkins,0
1,corverstag,SolihinNizam,0
2,corverstag,carnett_jeff,0
3,corverstag,tjhanour,0
4,corverstag,halmarsh5,0
...,...,...,...
9807,SinicaFans,YitHengChooi,1
9808,SinicaFans,yitang_ucla,1
9809,SinicaFans,GranularLab,1
9810,SinicaFans,apcrs2018,1


In [232]:
# tel aviv
# Labelled pair list: every unconnected node pair gets isedge? = 0 and every
# existing edge gets isedge? = 1.  Everything is built in this single cell so
# that re-running it can neither append the edge rows a second time nor reset
# the labels of an already concatenated frame.
telaviv_non_edge = pd.DataFrame(list(nx.non_edges(telaviv_lstsub)))
telaviv_non_edge['isedge?'] = 0
telaviv_edge2 = pd.DataFrame(list(telaviv_lstsub.edges()))
telaviv_edge2['isedge?'] = 1
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
telaviv_edge = pd.concat([telaviv_non_edge, telaviv_edge2], axis=0)
telaviv_edge

Unnamed: 0,0,1,isedge?
0,rothem,irmolicr7,0
1,rothem,MarleeMatlin,0
2,rothem,ChrisBarrett,0
3,rothem,yoavshaham,0
4,rothem,Fredoev,0
...,...,...,...
23937,moovit,cluedont,1
23938,moovit,transantiago,1
23939,moovit,C_Fesen,1
23940,moovit,SophieB_94,1


In [235]:
# tallinn
# Labelled pair list: every unconnected node pair gets isedge? = 0 and every
# existing edge gets isedge? = 1.  Everything is built in this single cell so
# that re-running it can neither append the edge rows a second time nor reset
# the labels of an already concatenated frame.
tallinn_non_edge = pd.DataFrame(list(nx.non_edges(tallinn_lstsub)))
tallinn_non_edge['isedge?'] = 0
tallinn_edge2 = pd.DataFrame(list(tallinn_lstsub.edges()))
tallinn_edge2['isedge?'] = 1
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
tallinn_edge = pd.concat([tallinn_non_edge, tallinn_edge2], axis=0)
tallinn_edge

Unnamed: 0,0,1,isedge?
0,israelmazda,FilippovEU,0
1,israelmazda,ahtih,0
2,israelmazda,normis,0
3,israelmazda,KIC_Madeleine,0
4,israelmazda,GCCowgirl,0
...,...,...,...
8664,Adcash,CasualConnect,1
8665,Adcash,NewZooHQ,1
8666,Adcash,comScore,1
8667,Adcash,24heuresdumans,1


In [238]:
# public
# Labelled pair list: every unconnected node pair gets isedge? = 0 and every
# existing edge gets isedge? = 1.  Everything is built in this single cell so
# that re-running it can neither append the edge rows a second time nor reset
# the labels of an already concatenated frame.
public_non_edge = pd.DataFrame(list(nx.non_edges(public_lstsub)))
public_non_edge['isedge?'] = 0
public_edge2 = pd.DataFrame(list(public_lstsub.edges()))
public_edge2['isedge?'] = 1
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
public_edge = pd.concat([public_non_edge, public_edge2], axis=0)
public_edge

Unnamed: 0,0,1,isedge?
0,FilippovEU,Nutiteq,0
1,FilippovEU,ahtih,0
2,FilippovEU,SATUlaboratory,0
3,FilippovEU,TimHarcourt,0
4,FilippovEU,CREAcademy,0
...,...,...,...
2174,startupestonia,BalticStartup,1
2175,startupestonia,kuratcom,1
2176,startupestonia,aaltoes,1
2177,startupestonia,business_design,1


In [241]:
# corpo
# Labelled pair list: every unconnected node pair gets isedge? = 0 and every
# existing edge gets isedge? = 1.  Everything is built in this single cell so
# that re-running it can neither append the edge rows a second time nor reset
# the labels of an already concatenated frame.
corpo_non_edge = pd.DataFrame(list(nx.non_edges(corpo_lstsub)))
corpo_non_edge['isedge?'] = 0
corpo_edge2 = pd.DataFrame(list(corpo_lstsub.edges()))
corpo_edge2['isedge?'] = 1
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
corpo_edge = pd.concat([corpo_non_edge, corpo_edge2], axis=0)
corpo_edge

Unnamed: 0,0,1,isedge?
0,israelmazda,amitaiz,0
1,israelmazda,normis,0
2,israelmazda,heisebusiness,0
3,israelmazda,CIOevent,0
4,israelmazda,NetanelBenSimon,0
...,...,...,...
7988,boltapp,rudolfosman,1
7989,boltapp,avepihlak,1
7990,boltapp,Olljum,1
7991,boltapp,taxipal,1


In [244]:
# startup
# Labelled pair list: every unconnected node pair gets isedge? = 0 and every
# existing edge gets isedge? = 1.  Everything is built in this single cell so
# that re-running it can neither append the edge rows a second time nor reset
# the labels of an already concatenated frame.
startup_non_edge = pd.DataFrame(list(nx.non_edges(startup_lstsub)))
startup_non_edge['isedge?'] = 0
startup_edge2 = pd.DataFrame(list(startup_lstsub.edges()))
startup_edge2['isedge?'] = 1
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
startup_edge = pd.concat([startup_non_edge, startup_edge2], axis=0)
startup_edge

Unnamed: 0,0,1,isedge?
0,corverstag,TiuMc,0
1,corverstag,Ecroaker,0
2,corverstag,RuthCelinaMD,0
3,corverstag,mattyglesias,0
4,corverstag,irmolicr7,0
...,...,...,...
8188,Adcash,CasualConnect,1
8189,Adcash,NewZooHQ,1
8190,Adcash,comScore,1
8191,Adcash,24heuresdumans,1


In [247]:
# academic
# Labelled pair list: every unconnected node pair gets isedge? = 0 and every
# existing edge gets isedge? = 1.  Everything is built in this single cell so
# that re-running it can neither append the edge rows a second time nor reset
# the labels of an already concatenated frame.
academic_non_edge = pd.DataFrame(list(nx.non_edges(academic_lstsub)))
academic_non_edge['isedge?'] = 0
academic_edge2 = pd.DataFrame(list(academic_lstsub.edges()))
academic_edge2['isedge?'] = 1
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
academic_edge = pd.concat([academic_non_edge, academic_edge2], axis=0)
academic_edge

Unnamed: 0,0,1,isedge?
0,amitaiz,gideonamichay,0
1,amitaiz,tipikas,0
2,amitaiz,federalreserve,0
3,amitaiz,ACM_President,0
4,amitaiz,IsraelinSK,0
...,...,...,...
4282,TallinnTech,IgorFelc,1
4283,TallinnTech,amjadrikzan,1
4284,TallinnTech,neemekorv,1
4285,TallinnTech,foursquare,1


In [250]:
# civil
# Labelled pair list: every unconnected node pair gets isedge? = 0 and every
# existing edge gets isedge? = 1.  Everything is built in this single cell so
# that re-running it can neither append the edge rows a second time nor reset
# the labels of an already concatenated frame.
civil_non_edge = pd.DataFrame(list(nx.non_edges(civil_lstsub)))
civil_non_edge['isedge?'] = 0
civil_edge2 = pd.DataFrame(list(civil_lstsub.edges()))
civil_edge2['isedge?'] = 1
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
civil_edge = pd.concat([civil_non_edge, civil_edge2], axis=0)
civil_edge

Unnamed: 0,0,1,isedge?
0,amitaiz,rothem,0
1,amitaiz,RACISMinISRAEL,0
2,amitaiz,KIC_Madeleine,0
3,amitaiz,muliepstein,0
4,amitaiz,yoavshaham,0
...,...,...,...
2484,CleantechForEst,autodesk,1
2485,CleantechForEst,testlio,1
2486,CleantechForEst,TransferWise,1
2487,CleantechForEst,TeleportInc,1


In [253]:
# media
# Labelled pair list: every unconnected node pair gets isedge? = 0 and every
# existing edge gets isedge? = 1.  Everything is built in this single cell so
# that re-running it can neither append the edge rows a second time nor reset
# the labels of an already concatenated frame.
media_non_edge = pd.DataFrame(list(nx.non_edges(media_lstsub)))
media_non_edge['isedge?'] = 0
media_edge2 = pd.DataFrame(list(media_lstsub.edges()))
media_edge2['isedge?'] = 1
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
media_edge = pd.concat([media_non_edge, media_edge2], axis=0)
media_edge

Unnamed: 0,0,1,isedge?
0,corverstag,EmreHistory,0
1,corverstag,SapienAgenda,0
2,corverstag,jenkers_en,0
3,corverstag,MaimonidesQuote,0
4,corverstag,dt910189,0
...,...,...,...
2602,IsraelHayomEng,GMMivs,1
2603,IsraelHayomEng,SFJewishFilm,1
2604,IsraelHayomEng,USWNT,1
2605,IsraelHayomEng,standamericanow,1


#### datasets are balanced between edges and non-edges

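A classification algorithm will be trained on these edge features to predict links. (To verify: `nx.non_edges` yields every unconnected node pair, so confirm the class balance stated above, e.g. with `value_counts()` on the `isedge?` column, before training.)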

# 1. Common Neighbors

In [169]:
# taipei
# Number of common neighbours of every candidate pair: all non-edges first,
# then all existing edges.  Both parts are computed and stacked in this one
# cell so re-running it cannot append the edge rows a second time.
taipei_neighbors_non_edge = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(taipei_lstsub, u, v))))
    for u, v in nx.non_edges(taipei_lstsub)
)
taipei_neighbors2 = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(taipei_lstsub, u, v))))
    for u, v in nx.edges(taipei_lstsub)
)
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
taipei_neighbors = pd.concat([taipei_neighbors_non_edge, taipei_neighbors2], axis=0)
taipei_neighbors

Unnamed: 0,0,1,2
0,corverstag,BarryJenkins,0
1,corverstag,SolihinNizam,0
2,corverstag,carnett_jeff,1
3,corverstag,tjhanour,1
4,corverstag,halmarsh5,0
...,...,...,...
9807,SinicaFans,YitHengChooi,0
9808,SinicaFans,yitang_ucla,0
9809,SinicaFans,GranularLab,0
9810,SinicaFans,apcrs2018,0


In [171]:
# tel aviv
# Number of common neighbours of every candidate pair: all non-edges first,
# then all existing edges.  Both parts are computed and stacked in this one
# cell so re-running it cannot append the edge rows a second time.
telaviv_neighbors_non_edge = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(telaviv_lstsub, u, v))))
    for u, v in nx.non_edges(telaviv_lstsub)
)
telaviv_neighbors2 = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(telaviv_lstsub, u, v))))
    for u, v in nx.edges(telaviv_lstsub)
)
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
telaviv_neighbors = pd.concat([telaviv_neighbors_non_edge, telaviv_neighbors2], axis=0)
telaviv_neighbors

Unnamed: 0,0,1,2
0,rothem,irmolicr7,0
1,rothem,MarleeMatlin,0
2,rothem,ChrisBarrett,0
3,rothem,yoavshaham,1
4,rothem,Fredoev,0
...,...,...,...
23937,moovit,cluedont,0
23938,moovit,transantiago,0
23939,moovit,C_Fesen,0
23940,moovit,SophieB_94,0


In [173]:
# tallinn
# Number of common neighbours of every candidate pair: all non-edges first,
# then all existing edges.  Both parts are computed and stacked in this one
# cell so re-running it cannot append the edge rows a second time.
tallinn_neighbors_non_edge = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(tallinn_lstsub, u, v))))
    for u, v in nx.non_edges(tallinn_lstsub)
)
tallinn_neighbors2 = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(tallinn_lstsub, u, v))))
    for u, v in nx.edges(tallinn_lstsub)
)
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
tallinn_neighbors = pd.concat([tallinn_neighbors_non_edge, tallinn_neighbors2], axis=0)
tallinn_neighbors

Unnamed: 0,0,1,2
0,israelmazda,FilippovEU,0
1,israelmazda,ahtih,0
2,israelmazda,normis,1
3,israelmazda,KIC_Madeleine,0
4,israelmazda,GCCowgirl,0
...,...,...,...
8664,Adcash,CasualConnect,0
8665,Adcash,NewZooHQ,0
8666,Adcash,comScore,0
8667,Adcash,24heuresdumans,0


In [175]:
# public
# Number of common neighbours of every candidate pair: all non-edges first,
# then all existing edges.  Both parts are computed and stacked in this one
# cell so re-running it cannot append the edge rows a second time.
public_neighbors_non_edge = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(public_lstsub, u, v))))
    for u, v in nx.non_edges(public_lstsub)
)
public_neighbors2 = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(public_lstsub, u, v))))
    for u, v in nx.edges(public_lstsub)
)
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
public_neighbors = pd.concat([public_neighbors_non_edge, public_neighbors2], axis=0)
public_neighbors

Unnamed: 0,0,1,2
0,FilippovEU,Nutiteq,1
1,FilippovEU,ahtih,1
2,FilippovEU,SATUlaboratory,1
3,FilippovEU,TimHarcourt,0
4,FilippovEU,CREAcademy,1
...,...,...,...
2174,startupestonia,BalticStartup,0
2175,startupestonia,kuratcom,0
2176,startupestonia,aaltoes,0
2177,startupestonia,business_design,0


In [177]:
# corpo
# Number of common neighbours of every candidate pair: all non-edges first,
# then all existing edges.  Both parts are computed and stacked in this one
# cell so re-running it cannot append the edge rows a second time.
corpo_neighbors_non_edge = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(corpo_lstsub, u, v))))
    for u, v in nx.non_edges(corpo_lstsub)
)
corpo_neighbors2 = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(corpo_lstsub, u, v))))
    for u, v in nx.edges(corpo_lstsub)
)
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
corpo_neighbors = pd.concat([corpo_neighbors_non_edge, corpo_neighbors2], axis=0)
corpo_neighbors

Unnamed: 0,0,1,2
0,israelmazda,amitaiz,0
1,israelmazda,normis,1
2,israelmazda,heisebusiness,0
3,israelmazda,CIOevent,0
4,israelmazda,NetanelBenSimon,0
...,...,...,...
7988,boltapp,rudolfosman,0
7989,boltapp,avepihlak,0
7990,boltapp,Olljum,0
7991,boltapp,taxipal,0


In [179]:
# startup
# Number of common neighbours of every candidate pair: all non-edges first,
# then all existing edges.  Both parts are computed and stacked in this one
# cell so re-running it cannot append the edge rows a second time.
startup_neighbors_non_edge = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(startup_lstsub, u, v))))
    for u, v in nx.non_edges(startup_lstsub)
)
startup_neighbors2 = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(startup_lstsub, u, v))))
    for u, v in nx.edges(startup_lstsub)
)
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
startup_neighbors = pd.concat([startup_neighbors_non_edge, startup_neighbors2], axis=0)
startup_neighbors

Unnamed: 0,0,1,2
0,corverstag,TiuMc,1
1,corverstag,Ecroaker,0
2,corverstag,RuthCelinaMD,0
3,corverstag,mattyglesias,0
4,corverstag,irmolicr7,0
...,...,...,...
8188,Adcash,CasualConnect,0
8189,Adcash,NewZooHQ,0
8190,Adcash,comScore,0
8191,Adcash,24heuresdumans,0


In [181]:
# academic
# Number of common neighbours of every candidate pair: all non-edges first,
# then all existing edges.  Both parts are computed and stacked in this one
# cell so re-running it cannot append the edge rows a second time.
academic_neighbors_non_edge = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(academic_lstsub, u, v))))
    for u, v in nx.non_edges(academic_lstsub)
)
academic_neighbors2 = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(academic_lstsub, u, v))))
    for u, v in nx.edges(academic_lstsub)
)
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
academic_neighbors = pd.concat([academic_neighbors_non_edge, academic_neighbors2], axis=0)
academic_neighbors

Unnamed: 0,0,1,2
0,amitaiz,gideonamichay,1
1,amitaiz,tipikas,0
2,amitaiz,federalreserve,1
3,amitaiz,ACM_President,1
4,amitaiz,IsraelinSK,1
...,...,...,...
4282,TallinnTech,IgorFelc,0
4283,TallinnTech,amjadrikzan,0
4284,TallinnTech,neemekorv,0
4285,TallinnTech,foursquare,0


In [183]:
# civil
# Number of common neighbours of every candidate pair: all non-edges first,
# then all existing edges.  Both parts are computed and stacked in this one
# cell so re-running it cannot append the edge rows a second time.
civil_neighbors_non_edge = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(civil_lstsub, u, v))))
    for u, v in nx.non_edges(civil_lstsub)
)
civil_neighbors2 = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(civil_lstsub, u, v))))
    for u, v in nx.edges(civil_lstsub)
)
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
civil_neighbors = pd.concat([civil_neighbors_non_edge, civil_neighbors2], axis=0)
civil_neighbors

Unnamed: 0,0,1,2
0,amitaiz,rothem,1
1,amitaiz,RACISMinISRAEL,1
2,amitaiz,KIC_Madeleine,0
3,amitaiz,muliepstein,1
4,amitaiz,yoavshaham,1
...,...,...,...
2484,CleantechForEst,autodesk,0
2485,CleantechForEst,testlio,0
2486,CleantechForEst,TransferWise,0
2487,CleantechForEst,TeleportInc,0


In [185]:
# media
# Number of common neighbours of every candidate pair: all non-edges first,
# then all existing edges.  Both parts are computed and stacked in this one
# cell so re-running it cannot append the edge rows a second time.
media_neighbors_non_edge = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(media_lstsub, u, v))))
    for u, v in nx.non_edges(media_lstsub)
)
media_neighbors2 = pd.DataFrame(
    (u, v, len(list(nx.common_neighbors(media_lstsub, u, v))))
    for u, v in nx.edges(media_lstsub)
)
# Row labels are left as concat produces them (they restart at 0 for the edge
# rows), i.e. the same index layout as the other per-network measure frames.
media_neighbors = pd.concat([media_neighbors_non_edge, media_neighbors2], axis=0)
media_neighbors

Unnamed: 0,0,1,2
0,corverstag,EmreHistory,1
1,corverstag,SapienAgenda,1
2,corverstag,jenkers_en,1
3,corverstag,MaimonidesQuote,0
4,corverstag,dt910189,1
...,...,...,...
2602,IsraelHayomEng,GMMivs,0
2603,IsraelHayomEng,SFJewishFilm,0
2604,IsraelHayomEng,USWNT,0
2605,IsraelHayomEng,standamericanow,0


# 2. jaccard coefficient

In [167]:
# taipei
# Jaccard coefficient of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
taipei_jaccard_non_edge = pd.DataFrame(nx.jaccard_coefficient(taipei_lstsub))
taipei_jaccard2 = pd.DataFrame(nx.jaccard_coefficient(taipei_lstsub, ebunch=taipei_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
taipei_jaccard = pd.concat([taipei_jaccard_non_edge, taipei_jaccard2], axis=0)
taipei_jaccard

Unnamed: 0,0,1,2
0,corverstag,BarryJenkins,0.000000
1,corverstag,SolihinNizam,0.000000
2,corverstag,carnett_jeff,0.333333
3,corverstag,tjhanour,0.333333
4,corverstag,halmarsh5,0.000000
...,...,...,...
9807,SinicaFans,YitHengChooi,0.000000
9808,SinicaFans,yitang_ucla,0.000000
9809,SinicaFans,GranularLab,0.000000
9810,SinicaFans,apcrs2018,0.000000


In [112]:
# tel aviv
# Jaccard coefficient of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
telaviv_jaccard_non_edge = pd.DataFrame(nx.jaccard_coefficient(telaviv_lstsub))
telaviv_jaccard2 = pd.DataFrame(nx.jaccard_coefficient(telaviv_lstsub, ebunch=telaviv_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
telaviv_jaccard = pd.concat([telaviv_jaccard_non_edge, telaviv_jaccard2], axis=0)
telaviv_jaccard

Unnamed: 0,0,1,2
0,rothem,irmolicr7,0.0
1,rothem,MarleeMatlin,0.0
2,rothem,ChrisBarrett,0.0
3,rothem,yoavshaham,1.0
4,rothem,Fredoev,0.0
...,...,...,...
23937,moovit,cluedont,0.0
23938,moovit,transantiago,0.0
23939,moovit,C_Fesen,0.0
23940,moovit,SophieB_94,0.0


In [198]:
# tallinn
# Jaccard coefficient of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
tallinn_jaccard_non_edge = pd.DataFrame(nx.jaccard_coefficient(tallinn_lstsub))
tallinn_jaccard2 = pd.DataFrame(nx.jaccard_coefficient(tallinn_lstsub, ebunch=tallinn_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
tallinn_jaccard = pd.concat([tallinn_jaccard_non_edge, tallinn_jaccard2], axis=0)
tallinn_jaccard

Unnamed: 0,0,1,2
0,israelmazda,FilippovEU,0.0
1,israelmazda,ahtih,0.0
2,israelmazda,normis,1.0
3,israelmazda,KIC_Madeleine,0.0
4,israelmazda,GCCowgirl,0.0
...,...,...,...
8664,Adcash,CasualConnect,0.0
8665,Adcash,NewZooHQ,0.0
8666,Adcash,comScore,0.0
8667,Adcash,24heuresdumans,0.0


In [118]:
# public
# Jaccard coefficient of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
public_jaccard_non_edge = pd.DataFrame(nx.jaccard_coefficient(public_lstsub))
public_jaccard2 = pd.DataFrame(nx.jaccard_coefficient(public_lstsub, ebunch=public_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
public_jaccard = pd.concat([public_jaccard_non_edge, public_jaccard2], axis=0)
public_jaccard

Unnamed: 0,0,1,2
0,FilippovEU,Nutiteq,1.0
1,FilippovEU,ahtih,1.0
2,FilippovEU,SATUlaboratory,1.0
3,FilippovEU,TimHarcourt,0.0
4,FilippovEU,CREAcademy,1.0
...,...,...,...
2174,startupestonia,BalticStartup,0.0
2175,startupestonia,kuratcom,0.0
2176,startupestonia,aaltoes,0.0
2177,startupestonia,business_design,0.0


In [120]:
# corpo
# Jaccard coefficient of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
corpo_jaccard_non_edge = pd.DataFrame(nx.jaccard_coefficient(corpo_lstsub))
corpo_jaccard2 = pd.DataFrame(nx.jaccard_coefficient(corpo_lstsub, ebunch=corpo_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
corpo_jaccard = pd.concat([corpo_jaccard_non_edge, corpo_jaccard2], axis=0)
corpo_jaccard

Unnamed: 0,0,1,2
0,israelmazda,amitaiz,0.0
1,israelmazda,normis,1.0
2,israelmazda,heisebusiness,0.0
3,israelmazda,CIOevent,0.0
4,israelmazda,NetanelBenSimon,0.0
...,...,...,...
7988,boltapp,rudolfosman,0.0
7989,boltapp,avepihlak,0.0
7990,boltapp,Olljum,0.0
7991,boltapp,taxipal,0.0


In [122]:
# startup
# Jaccard coefficient of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
startup_jaccard_non_edge = pd.DataFrame(nx.jaccard_coefficient(startup_lstsub))
startup_jaccard2 = pd.DataFrame(nx.jaccard_coefficient(startup_lstsub, ebunch=startup_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
startup_jaccard = pd.concat([startup_jaccard_non_edge, startup_jaccard2], axis=0)
startup_jaccard

Unnamed: 0,0,1,2
0,corverstag,TiuMc,1.0
1,corverstag,Ecroaker,0.0
2,corverstag,RuthCelinaMD,0.0
3,corverstag,mattyglesias,0.0
4,corverstag,irmolicr7,0.0
...,...,...,...
8188,Adcash,CasualConnect,0.0
8189,Adcash,NewZooHQ,0.0
8190,Adcash,comScore,0.0
8191,Adcash,24heuresdumans,0.0


In [124]:
# academic
# Jaccard coefficient of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
academic_jaccard_non_edge = pd.DataFrame(nx.jaccard_coefficient(academic_lstsub))
academic_jaccard2 = pd.DataFrame(nx.jaccard_coefficient(academic_lstsub, ebunch=academic_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
academic_jaccard = pd.concat([academic_jaccard_non_edge, academic_jaccard2], axis=0)
academic_jaccard

Unnamed: 0,0,1,2
0,amitaiz,gideonamichay,1.0
1,amitaiz,tipikas,0.0
2,amitaiz,federalreserve,1.0
3,amitaiz,ACM_President,1.0
4,amitaiz,IsraelinSK,1.0
...,...,...,...
4282,TallinnTech,IgorFelc,0.0
4283,TallinnTech,amjadrikzan,0.0
4284,TallinnTech,neemekorv,0.0
4285,TallinnTech,foursquare,0.0


In [126]:
# civil
# Jaccard coefficient of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
civil_jaccard_non_edge = pd.DataFrame(nx.jaccard_coefficient(civil_lstsub))
civil_jaccard2 = pd.DataFrame(nx.jaccard_coefficient(civil_lstsub, ebunch=civil_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
civil_jaccard = pd.concat([civil_jaccard_non_edge, civil_jaccard2], axis=0)
civil_jaccard

Unnamed: 0,0,1,2
0,amitaiz,rothem,1.0
1,amitaiz,RACISMinISRAEL,1.0
2,amitaiz,KIC_Madeleine,0.0
3,amitaiz,muliepstein,1.0
4,amitaiz,yoavshaham,1.0
...,...,...,...
2484,CleantechForEst,autodesk,0.0
2485,CleantechForEst,testlio,0.0
2486,CleantechForEst,TransferWise,0.0
2487,CleantechForEst,TeleportInc,0.0


In [128]:
# media
# Jaccard coefficient of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
media_jaccard_non_edge = pd.DataFrame(nx.jaccard_coefficient(media_lstsub))
media_jaccard2 = pd.DataFrame(nx.jaccard_coefficient(media_lstsub, ebunch=media_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
media_jaccard = pd.concat([media_jaccard_non_edge, media_jaccard2], axis=0)
media_jaccard

Unnamed: 0,0,1,2
0,corverstag,EmreHistory,1.0
1,corverstag,SapienAgenda,1.0
2,corverstag,jenkers_en,1.0
3,corverstag,MaimonidesQuote,0.0
4,corverstag,dt910189,1.0
...,...,...,...
2602,IsraelHayomEng,GMMivs,0.0
2603,IsraelHayomEng,SFJewishFilm,0.0
2604,IsraelHayomEng,USWNT,0.0
2605,IsraelHayomEng,standamericanow,0.0


# 3. resource allocation index

In [130]:
# taipei
# Resource allocation index of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
taipei_resource_non_edge = pd.DataFrame(nx.resource_allocation_index(taipei_lstsub))
taipei_resource2 = pd.DataFrame(nx.resource_allocation_index(taipei_lstsub, ebunch=taipei_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
taipei_resource = pd.concat([taipei_resource_non_edge, taipei_resource2], axis=0)
taipei_resource

Unnamed: 0,0,1,2
0,corverstag,BarryJenkins,0.000000
1,corverstag,SolihinNizam,0.000000
2,corverstag,carnett_jeff,0.000560
3,corverstag,tjhanour,0.000471
4,corverstag,halmarsh5,0.000000
...,...,...,...
9807,SinicaFans,YitHengChooi,0.000000
9808,SinicaFans,yitang_ucla,0.000000
9809,SinicaFans,GranularLab,0.000000
9810,SinicaFans,apcrs2018,0.000000


In [132]:
# telaviv
# Resource allocation index of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
telaviv_resource_non_edge = pd.DataFrame(nx.resource_allocation_index(telaviv_lstsub))
telaviv_resource2 = pd.DataFrame(nx.resource_allocation_index(telaviv_lstsub, ebunch=telaviv_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
telaviv_resource = pd.concat([telaviv_resource_non_edge, telaviv_resource2], axis=0)
telaviv_resource

Unnamed: 0,0,1,2
0,rothem,irmolicr7,0.000000
1,rothem,MarleeMatlin,0.000000
2,rothem,ChrisBarrett,0.000000
3,rothem,yoavshaham,0.000554
4,rothem,Fredoev,0.000000
...,...,...,...
23937,moovit,cluedont,0.000000
23938,moovit,transantiago,0.000000
23939,moovit,C_Fesen,0.000000
23940,moovit,SophieB_94,0.000000


In [134]:
# tallinn
# Resource allocation index of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
tallinn_resource_non_edge = pd.DataFrame(nx.resource_allocation_index(tallinn_lstsub))
tallinn_resource2 = pd.DataFrame(nx.resource_allocation_index(tallinn_lstsub, ebunch=tallinn_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
tallinn_resource = pd.concat([tallinn_resource_non_edge, tallinn_resource2], axis=0)
tallinn_resource

Unnamed: 0,0,1,2
0,israelmazda,FilippovEU,0.000000
1,israelmazda,ahtih,0.000000
2,israelmazda,normis,0.000393
3,israelmazda,KIC_Madeleine,0.000000
4,israelmazda,GCCowgirl,0.000000
...,...,...,...
8664,Adcash,CasualConnect,0.000000
8665,Adcash,NewZooHQ,0.000000
8666,Adcash,comScore,0.000000
8667,Adcash,24heuresdumans,0.000000


In [136]:
# public
# Resource allocation index of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
public_resource_non_edge = pd.DataFrame(nx.resource_allocation_index(public_lstsub))
public_resource2 = pd.DataFrame(nx.resource_allocation_index(public_lstsub, ebunch=public_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
public_resource = pd.concat([public_resource_non_edge, public_resource2], axis=0)
public_resource

Unnamed: 0,0,1,2
0,FilippovEU,Nutiteq,0.000572
1,FilippovEU,ahtih,0.000572
2,FilippovEU,SATUlaboratory,0.000572
3,FilippovEU,TimHarcourt,0.000000
4,FilippovEU,CREAcademy,0.000572
...,...,...,...
2174,startupestonia,BalticStartup,0.000000
2175,startupestonia,kuratcom,0.000000
2176,startupestonia,aaltoes,0.000000
2177,startupestonia,business_design,0.000000


In [138]:
# corpo
# Resource allocation index of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
corpo_resource_non_edge = pd.DataFrame(nx.resource_allocation_index(corpo_lstsub))
corpo_resource2 = pd.DataFrame(nx.resource_allocation_index(corpo_lstsub, ebunch=corpo_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
corpo_resource = pd.concat([corpo_resource_non_edge, corpo_resource2], axis=0)
corpo_resource

Unnamed: 0,0,1,2
0,israelmazda,amitaiz,0.000000
1,israelmazda,normis,0.000393
2,israelmazda,heisebusiness,0.000000
3,israelmazda,CIOevent,0.000000
4,israelmazda,NetanelBenSimon,0.000000
...,...,...,...
7988,boltapp,rudolfosman,0.000000
7989,boltapp,avepihlak,0.000000
7990,boltapp,Olljum,0.000000
7991,boltapp,taxipal,0.000000


In [140]:
# startup
# Resource allocation index of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
startup_resource_non_edge = pd.DataFrame(nx.resource_allocation_index(startup_lstsub))
startup_resource2 = pd.DataFrame(nx.resource_allocation_index(startup_lstsub, ebunch=startup_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
startup_resource = pd.concat([startup_resource_non_edge, startup_resource2], axis=0)
startup_resource

Unnamed: 0,0,1,2
0,corverstag,TiuMc,0.000471
1,corverstag,Ecroaker,0.000000
2,corverstag,RuthCelinaMD,0.000000
3,corverstag,mattyglesias,0.000000
4,corverstag,irmolicr7,0.000000
...,...,...,...
8188,Adcash,CasualConnect,0.000000
8189,Adcash,NewZooHQ,0.000000
8190,Adcash,comScore,0.000000
8191,Adcash,24heuresdumans,0.000000


In [142]:
# academic
# Resource allocation index of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
academic_resource_non_edge = pd.DataFrame(nx.resource_allocation_index(academic_lstsub))
academic_resource2 = pd.DataFrame(nx.resource_allocation_index(academic_lstsub, ebunch=academic_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
academic_resource = pd.concat([academic_resource_non_edge, academic_resource2], axis=0)
academic_resource

Unnamed: 0,0,1,2
0,amitaiz,gideonamichay,0.000275
1,amitaiz,tipikas,0.000000
2,amitaiz,federalreserve,0.000275
3,amitaiz,ACM_President,0.000275
4,amitaiz,IsraelinSK,0.000275
...,...,...,...
4282,TallinnTech,IgorFelc,0.000000
4283,TallinnTech,amjadrikzan,0.000000
4284,TallinnTech,neemekorv,0.000000
4285,TallinnTech,foursquare,0.000000


In [144]:
# civil
# Resource allocation index of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
civil_resource_non_edge = pd.DataFrame(nx.resource_allocation_index(civil_lstsub))
civil_resource2 = pd.DataFrame(nx.resource_allocation_index(civil_lstsub, ebunch=civil_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
civil_resource = pd.concat([civil_resource_non_edge, civil_resource2], axis=0)
civil_resource

Unnamed: 0,0,1,2
0,amitaiz,rothem,0.000554
1,amitaiz,RACISMinISRAEL,0.000554
2,amitaiz,KIC_Madeleine,0.000000
3,amitaiz,muliepstein,0.000554
4,amitaiz,yoavshaham,0.000554
...,...,...,...
2484,CleantechForEst,autodesk,0.000000
2485,CleantechForEst,testlio,0.000000
2486,CleantechForEst,TransferWise,0.000000
2487,CleantechForEst,TeleportInc,0.000000


In [146]:
# media
# Resource allocation index of every candidate pair.  Called without ebunch,
# networkx scores the non-existent edges; the existing edges are scored by
# passing them explicitly.  Both parts are stacked in this same cell so a
# re-run cannot append the edge rows twice.
media_resource_non_edge = pd.DataFrame(nx.resource_allocation_index(media_lstsub))
media_resource2 = pd.DataFrame(nx.resource_allocation_index(media_lstsub, ebunch=media_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
media_resource = pd.concat([media_resource_non_edge, media_resource2], axis=0)
media_resource

Unnamed: 0,0,1,2
0,corverstag,EmreHistory,0.000561
1,corverstag,SapienAgenda,0.000561
2,corverstag,jenkers_en,0.000561
3,corverstag,MaimonidesQuote,0.000000
4,corverstag,dt910189,0.000561
...,...,...,...
2602,IsraelHayomEng,GMMivs,0.000000
2603,IsraelHayomEng,SFJewishFilm,0.000000
2604,IsraelHayomEng,USWNT,0.000000
2605,IsraelHayomEng,standamericanow,0.000000


# 4. preferential attachment

In [148]:
# taipei
# Preferential attachment score of every candidate pair.  Called without
# ebunch, networkx scores the non-existent edges; the existing edges are
# scored by passing them explicitly.  Both parts are stacked in this same
# cell so a re-run cannot append the edge rows twice.
taipei_preferential_non_edge = pd.DataFrame(nx.preferential_attachment(taipei_lstsub))
taipei_preferential2 = pd.DataFrame(nx.preferential_attachment(taipei_lstsub, ebunch=taipei_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
taipei_preferential = pd.concat([taipei_preferential_non_edge, taipei_preferential2], axis=0)
taipei_preferential

Unnamed: 0,0,1,2
0,corverstag,BarryJenkins,3
1,corverstag,SolihinNizam,3
2,corverstag,carnett_jeff,3
3,corverstag,tjhanour,3
4,corverstag,halmarsh5,3
...,...,...,...
9807,SinicaFans,YitHengChooi,312
9808,SinicaFans,yitang_ucla,312
9809,SinicaFans,GranularLab,312
9810,SinicaFans,apcrs2018,312


In [150]:
# telaviv
# Preferential attachment score of every candidate pair.  Called without
# ebunch, networkx scores the non-existent edges; the existing edges are
# scored by passing them explicitly.  Both parts are stacked in this same
# cell so a re-run cannot append the edge rows twice.
telaviv_preferential_non_edge = pd.DataFrame(nx.preferential_attachment(telaviv_lstsub))
telaviv_preferential2 = pd.DataFrame(nx.preferential_attachment(telaviv_lstsub, ebunch=telaviv_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
telaviv_preferential = pd.concat([telaviv_preferential_non_edge, telaviv_preferential2], axis=0)
telaviv_preferential

Unnamed: 0,0,1,2
0,rothem,irmolicr7,1
1,rothem,MarleeMatlin,1
2,rothem,ChrisBarrett,1
3,rothem,yoavshaham,1
4,rothem,Fredoev,1
...,...,...,...
23937,moovit,cluedont,5567
23938,moovit,transantiago,5567
23939,moovit,C_Fesen,5567
23940,moovit,SophieB_94,5567


In [153]:
# tallinn
# Preferential attachment score of every candidate pair.  Called without
# ebunch, networkx scores the non-existent edges; the existing edges are
# scored by passing them explicitly.  Both parts are stacked in this same
# cell so a re-run cannot append the edge rows twice.
tallinn_preferential_non_edge = pd.DataFrame(nx.preferential_attachment(tallinn_lstsub))
tallinn_preferential2 = pd.DataFrame(nx.preferential_attachment(tallinn_lstsub, ebunch=tallinn_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
tallinn_preferential = pd.concat([tallinn_preferential_non_edge, tallinn_preferential2], axis=0)
tallinn_preferential

Unnamed: 0,0,1,2
0,israelmazda,FilippovEU,1
1,israelmazda,ahtih,1
2,israelmazda,normis,1
3,israelmazda,KIC_Madeleine,1
4,israelmazda,GCCowgirl,1
...,...,...,...
8664,Adcash,CasualConnect,506
8665,Adcash,NewZooHQ,506
8666,Adcash,comScore,506
8667,Adcash,24heuresdumans,506


In [155]:
# public
# Preferential attachment score of every candidate pair.  Called without
# ebunch, networkx scores the non-existent edges; the existing edges are
# scored by passing them explicitly.  Both parts are stacked in this same
# cell so a re-run cannot append the edge rows twice.
public_preferential_non_edge = pd.DataFrame(nx.preferential_attachment(public_lstsub))
public_preferential2 = pd.DataFrame(nx.preferential_attachment(public_lstsub, ebunch=public_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
public_preferential = pd.concat([public_preferential_non_edge, public_preferential2], axis=0)
public_preferential

Unnamed: 0,0,1,2
0,FilippovEU,Nutiteq,1
1,FilippovEU,ahtih,1
2,FilippovEU,SATUlaboratory,1
3,FilippovEU,TimHarcourt,1
4,FilippovEU,CREAcademy,1
...,...,...,...
2174,startupestonia,BalticStartup,1748
2175,startupestonia,kuratcom,1748
2176,startupestonia,aaltoes,1748
2177,startupestonia,business_design,1748


In [157]:
# corpo
# Preferential attachment score of every candidate pair.  Called without
# ebunch, networkx scores the non-existent edges; the existing edges are
# scored by passing them explicitly.  Both parts are stacked in this same
# cell so a re-run cannot append the edge rows twice.
corpo_preferential_non_edge = pd.DataFrame(nx.preferential_attachment(corpo_lstsub))
corpo_preferential2 = pd.DataFrame(nx.preferential_attachment(corpo_lstsub, ebunch=corpo_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
corpo_preferential = pd.concat([corpo_preferential_non_edge, corpo_preferential2], axis=0)
corpo_preferential

Unnamed: 0,0,1,2
0,israelmazda,amitaiz,1
1,israelmazda,normis,1
2,israelmazda,heisebusiness,1
3,israelmazda,CIOevent,1
4,israelmazda,NetanelBenSimon,1
...,...,...,...
7988,boltapp,rudolfosman,2542
7989,boltapp,avepihlak,2542
7990,boltapp,Olljum,2542
7991,boltapp,taxipal,2542


In [159]:
# startup
# Preferential attachment score of every candidate pair.  Called without
# ebunch, networkx scores the non-existent edges; the existing edges are
# scored by passing them explicitly.  Both parts are stacked in this same
# cell so a re-run cannot append the edge rows twice.
startup_preferential_non_edge = pd.DataFrame(nx.preferential_attachment(startup_lstsub))
startup_preferential2 = pd.DataFrame(nx.preferential_attachment(startup_lstsub, ebunch=startup_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
startup_preferential = pd.concat([startup_preferential_non_edge, startup_preferential2], axis=0)
startup_preferential

Unnamed: 0,0,1,2
0,corverstag,TiuMc,1
1,corverstag,Ecroaker,1
2,corverstag,RuthCelinaMD,1
3,corverstag,mattyglesias,1
4,corverstag,irmolicr7,1
...,...,...,...
8188,Adcash,CasualConnect,506
8189,Adcash,NewZooHQ,506
8190,Adcash,comScore,506
8191,Adcash,24heuresdumans,506


In [161]:
# academic
# Preferential attachment score of every candidate pair.  Called without
# ebunch, networkx scores the non-existent edges; the existing edges are
# scored by passing them explicitly.  Both parts are stacked in this same
# cell so a re-run cannot append the edge rows twice.
academic_preferential_non_edge = pd.DataFrame(nx.preferential_attachment(academic_lstsub))
academic_preferential2 = pd.DataFrame(nx.preferential_attachment(academic_lstsub, ebunch=academic_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
academic_preferential = pd.concat([academic_preferential_non_edge, academic_preferential2], axis=0)
academic_preferential

Unnamed: 0,0,1,2
0,amitaiz,gideonamichay,1
1,amitaiz,tipikas,1
2,amitaiz,federalreserve,1
3,amitaiz,ACM_President,1
4,amitaiz,IsraelinSK,1
...,...,...,...
4282,TallinnTech,IgorFelc,347
4283,TallinnTech,amjadrikzan,347
4284,TallinnTech,neemekorv,347
4285,TallinnTech,foursquare,347


In [163]:
# civil
# Preferential attachment score of every candidate pair.  Called without
# ebunch, networkx scores the non-existent edges; the existing edges are
# scored by passing them explicitly.  Both parts are stacked in this same
# cell so a re-run cannot append the edge rows twice.
civil_preferential_non_edge = pd.DataFrame(nx.preferential_attachment(civil_lstsub))
civil_preferential2 = pd.DataFrame(nx.preferential_attachment(civil_lstsub, ebunch=civil_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
civil_preferential = pd.concat([civil_preferential_non_edge, civil_preferential2], axis=0)
civil_preferential

Unnamed: 0,0,1,2
0,amitaiz,rothem,1
1,amitaiz,RACISMinISRAEL,1
2,amitaiz,KIC_Madeleine,1
3,amitaiz,muliepstein,1
4,amitaiz,yoavshaham,1
...,...,...,...
2484,CleantechForEst,autodesk,577
2485,CleantechForEst,testlio,577
2486,CleantechForEst,TransferWise,577
2487,CleantechForEst,TeleportInc,577


In [165]:
# media
# Preferential attachment score of every candidate pair.  Called without
# ebunch, networkx scores the non-existent edges; the existing edges are
# scored by passing them explicitly.  Both parts are stacked in this same
# cell so a re-run cannot append the edge rows twice.
media_preferential_non_edge = pd.DataFrame(nx.preferential_attachment(media_lstsub))
media_preferential2 = pd.DataFrame(nx.preferential_attachment(media_lstsub, ebunch=media_lstsub.edges()))
# Row labels are left as concat produces them (restart at 0 for the edge rows).
media_preferential = pd.concat([media_preferential_non_edge, media_preferential2], axis=0)
media_preferential

Unnamed: 0,0,1,2
0,corverstag,EmreHistory,1
1,corverstag,SapienAgenda,1
2,corverstag,jenkers_en,1
3,corverstag,MaimonidesQuote,1
4,corverstag,dt910189,1
...,...,...,...
2602,IsraelHayomEng,GMMivs,825
2603,IsraelHayomEng,SFJewishFilm,825
2604,IsraelHayomEng,USWNT,825
2605,IsraelHayomEng,standamericanow,825


# merge dataframes

In [187]:
# Assemble the final feature table of every network in one pass (replaces the
# per-network copy-pasted cells).  For each network the common-neighbour frame
# is renamed in place and receives the three other measures plus the 0/1
# 'isedge?' label as additional columns.
_network_frames = [
    # (feature table, jaccard, resource allocation, preferential attachment, labels)
    (taipei_neighbors, taipei_jaccard, taipei_resource, taipei_preferential, taipei_edge),
    (telaviv_neighbors, telaviv_jaccard, telaviv_resource, telaviv_preferential, telaviv_edge),
    (tallinn_neighbors, tallinn_jaccard, tallinn_resource, tallinn_preferential, tallinn_edge),
    (public_neighbors, public_jaccard, public_resource, public_preferential, public_edge),
    (corpo_neighbors, corpo_jaccard, corpo_resource, corpo_preferential, corpo_edge),
    (startup_neighbors, startup_jaccard, startup_resource, startup_preferential, startup_edge),
    (academic_neighbors, academic_jaccard, academic_resource, academic_preferential, academic_edge),
    (civil_neighbors, civil_jaccard, civil_resource, civil_preferential, civil_edge),
    (media_neighbors, media_jaccard, media_resource, media_preferential, media_edge),
]


def _aligned_values(source, column, features):
    """Return ``source[column]`` as an array, after checking row alignment.

    The measure frames are built independently and carry duplicated row
    labels (each is a concat of a non-edge part and an edge part), so the
    columns are copied by position.  To make that safe this helper verifies
    that ``source`` has as many rows as ``features`` and that its node pair
    (columns ``0`` and ``1``) equals ``features[['node', 'target']]`` row by
    row.

    Parameters
    ----------
    source : pandas.DataFrame
        Frame holding the pair in columns ``0`` and ``1`` and the values to
        copy in ``column``.
    column : hashable
        Label of the column of ``source`` to return.
    features : pandas.DataFrame
        Target feature table, already renamed to ``node`` / ``target``.

    Returns
    -------
    numpy.ndarray
        The values of ``source[column]`` in row order.

    Raises
    ------
    ValueError
        If the row counts differ or any node pair does not match.
    """
    if len(source) != len(features):
        raise ValueError(
            f'cannot merge column {column!r}: {len(source)} rows vs {len(features)} rows in the feature table'
        )
    pairs_match = (source[[0, 1]].to_numpy() == features[['node', 'target']].to_numpy()).all()
    if not pairs_match:
        raise ValueError(f'cannot merge column {column!r}: node pairs are not in the same row order')
    return source[column].to_numpy()


for _features, _jaccard, _resource, _preferential, _labels in _network_frames:
    # In-place rename keeps the existing *_neighbors variables pointing at the
    # finished tables; it is a no-op when the cell is run again.
    _features.rename(columns={0: 'node',
                              1: 'target',
                              2: 'common_neigh'},
                     inplace=True)
    _features['jaccard_coef'] = _aligned_values(_jaccard, 2, _features)
    _features['ressource_alloc'] = _aligned_values(_resource, 2, _features)
    _features['pref_attachment'] = _aligned_values(_preferential, 2, _features)
    _features['isedge?'] = _aligned_values(_labels, 'isedge?', _features)