In [1]:
import psycopg2

import pandas as pd
import numpy as np

from copy import deepcopy

import ast
import random
import networkx as nx
import time, unicodedata
import itertools

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from joblib import Parallel, delayed

In [2]:
def clean(name, min_len=5, junk_replacement=''):
    try:
        cleaned = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').lower().decode("ascii")
    except TypeError:
        return junk_replacement
    if len(cleaned) < min_len:
        return junk_replacement
    return cleaned

def get_matches_edit_distance(item, choices, limit, scorer=fuzz.WRatio):
    return process.extract(item, choices, limit=limit, scorer=scorer)
counter = 0
def get_sehir_twitter_matches(twitter_users, sehir_directory, limit=1):
    global fullnames, counter
    twitter_user_by_screen_name = twitter_users.set_index('screen_name')
    start = time.time()
    for screen_name in twitter_users['screen_name']:
        twitter_name = twitter_user_by_screen_name.loc[screen_name]['name']
        match_name = get_matches_edit_distance(twitter_name, fullnames, limit)
        counter += 1
#         if counter %100 == 0:
#             print(counter, "out of ", len(twitter_users))
#             start_ = time.time()
#             print(start_-start, "seconds")
#             start = start_
        yield (screen_name, match_name)
        
def filter_matches_by_threshold(matches_dict, threshold=0):
    filtered_dict = dict()
    for screen_name, matches in matches_dict.items():
        filtered = [(match, score) for match, score in matches if score > threshold]
        
        if filtered:
            filtered_dict[screen_name] = filtered
        
    return filtered_dict

def get_matches_dataframe(twitter_users, sehir_directory, threshold=0, limit=1):
    matches = {screen_name : match_name for screen_name, match_name in 
               get_sehir_twitter_matches(twitter_users, sehir_directory, limit=limit)}
    
    filtered_matches = filter_matches_by_threshold(matches, threshold=threshold)
    screen_names = filtered_matches.keys()
    return pd.DataFrame({'screen_name': list(screen_names),
                         'match_name': [filtered_matches[screen_name] for screen_name in screen_names]})

In [3]:
connection = psycopg2.connect('dbname=link_formation host=localhost user=postgres password=1_sehir_1')

twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)

user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

In [4]:
twitter_users.sample(5)

Unnamed: 0,id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
7266,1520474768,eda yucesoy,eda_yucesoy,en,Eda Yucesoy,91,11,9
18106,626391676,Esma_K,prenses_86,tr,Svs -,60,23,69
9306,414895747,Duygu Mira,sensordiye,en-gb,! SEP,60,260,690
1229,948142276685062145,mukaddes,mukadde46455362,en,Kadem,72,0,93
12696,3081688500,–Ω√º—ï—î—ÉiŒ∑ √∂z—Ç√º—è–∫,hozturk534,tr,Beytullah Ozturk,70,963,144


In [5]:
no_sehir = lambda x:"sehir" in clean(x)

In [6]:
twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)
twitter_users = twitter_users.where(twitter_users.match_name.str.len()>6)\
                             .dropna().set_index("id")
#         .where(twitter_users.match_ratio>85)
# .where(~twitter_users.name.str.contains("(?i)sehir"))\
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7.735326e+17,Utkan Kutlu,utkan_kutlu,sv,Furkan Kulcu,70.0,95.0,3288.0
172274300.0,Fatih K√º√ß√ºk,fkucuktwt,en,Mehmet Akif Kucuk,68.0,93.0,296.0
532616700.0,MEHMET AKƒ∞F ERGƒ∞N,makif85,tr,M.Akif Kayapinar,64.0,65.0,602.0
9.340795e+17,Selahattin Er,SelahattinEr12,tr,Selahattin Husmen,71.0,4.0,76.0
9.406698e+17,Sinanuluda≈ü,snanuludas,tr,Sena Nas,67.0,2.0,84.0


In [7]:
twitter_users["is_org"] = twitter_users.name.apply(no_sehir)
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8.949316e+17,Kevin,haykevinn,tr,Suedanur Haykiran,64.0,12.0,85.0,False
319081100.0,Taha Erdoƒüan,erdogan_taha,tr,Mustafa Erdogan,70.0,124.0,119.0,False
8.54081e+17,MustafaKemal,AtamOlmedi,tr,! Some,68.0,568.0,1039.0,False
6.954063e+17,Baran fazlƒ±oglu,baran_fazl,tr,Baran Altun,67.0,112.0,1106.0,False
9.48534e+17,Haydar,Haydar79376817,en,! DMARC,54.0,0.0,83.0,False


In [8]:
sehir_orgs = twitter_users[twitter_users.is_org==True].drop(labels=["is_org"], axis=1)
sehir_users = twitter_users[twitter_users.is_org==False].drop(labels=["is_org"], axis=1)

In [9]:
sehir_orgs.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
704118800.0,≈ûehir Binicilik,sehirbnclk,tr,Sehir Dance,67.0,18.0,5.0
3826578000.0,≈ûehirSavunmaSporlarƒ±,sehirunimartial,en,Sehir Mun,70.0,1.0,2.0
9.423668e+17,sehirdekiilkel,sehirdekiilkel,tr,Sehir Destek,62.0,58.0,154.0
7.810512e+17,≈ûehir Spor Merkezi,SehirUniSpor,tr,! Sehir Spor,82.0,113.0,4.0
8.569113e+17,Sehir WIB Club,sehirwibclub,tr,! Clubs,80.0,61.0,140.0


In [10]:
sehir_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
600201700.0,selahattin,_selahattin,tr,Selahattin Gultekin,82.0,114.0,899.0
8.480759e+17,ŸÖÿµÿ∑ŸÅŸâ ÿπŸÑŸä,Mustasfaali,tr,Mustafa Bal,82.0,0.0,286.0
7.434051e+17,duvaR,gazeteduvar,tr,Gamze Basturk,58.0,145605.0,38.0
3354128000.0,M. Said Alkƒ±≈ü,msaidalkis,en,Adnan Alkis,67.0,74.0,551.0
31880420.0,Engin Esen,EnginEsen,tr,Engin Deniz Akarli,70.0,1858.0,2384.0


In [11]:
truncate = lambda x: int(str(int(x))[:9])

In [12]:
resetted = twitter_users.reset_index()
index = resetted.id.apply(truncate)
resetted.rename(columns={"id":"tw_id"}, inplace=True)
resetted.index = index

twitter_users = resetted
twitter_users.sample(5)

Unnamed: 0_level_0,tw_id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
157759898,157759900.0,nilgun,fndgleskici,tr,Nagehan Sakizci,62.0,114.0,607.0,False
320006352,3200064000.0,≈ûiirdeki Hayat,CefakarMm,tr,Efe Akar,71.0,107.0,389.0,False
982161549,9.821615e+17,Cavit Oktay Elvan,ElvanCavit,tr,Elvan Cati,80.0,5.0,22.0,False
740839141,7.408391e+17,Elif Ertem,ElifErtem12,tr,Elif Erdem,76.0,9.0,48.0,False
167506940,1675069000.0,AdliTƒ±pAdliBilimler,TMdanismanlik,tr,Osman Senlik,64.0,233.0,1468.0,False


In [13]:
sehir_directory = pd.read_csv('../datasets/contacts.csv', 
                               encoding = "ISO-8859-1", 
                               usecols=['First Name', 'Last Name', 'Primary Email'])
sehir_directory.replace(np.nan, '', regex=True, inplace=True)

In [14]:
fullnames = [' '.join(first_last_name).lower() 
                 for first_last_name in sehir_directory[['First Name', 'Last Name']].values]

In [15]:
start = time.time()
sehir_matches = Parallel(n_jobs=-1)(delayed(get_matches_dataframe)(
    twitter_users[int(i*(len(twitter_users)/8)):int((i+1)*(len(twitter_users)/8))],
    sehir_directory) for i in range(8))
print("took: ", time.time()-start)





took:  1959.82204246521


In [16]:
sehir_matches_df = pd.concat(sehir_matches)
sehir_matches_df.index = range(len(sehir_matches_df))
print("There are {} matches".format(len(sehir_matches_df)))
sehir_matches_df.sample(5)

There are 20674 matches


Unnamed: 0,match_name,screen_name
11836,"[( ismail kara, 90)]",ismail63206146
12481,"[( global, 67)]",Gzllba
18973,"[(ihsan f. i. albittar albittar, 86)]",ismaillugur
12728,"[( global, 90)]",urbanhist
20494,"[(ieee kulubu, 86)]",gtuieee


In [17]:
sehir_matches_df['match_ratio'] = sehir_matches_df.match_name.apply(lambda x: x[0][1])
sehir_matches_df.match_name = sehir_matches_df.match_name.apply(lambda x: x[0][0])
sehir_matches_df.sample(5)

Unnamed: 0,match_name,screen_name,match_ratio
6500,fatma neslihan tutuncu,FDurgungoz,86
7351,deniz haj abrahim,muhammathamza,86
18454,muge akarsu,bngisuab,90
16646,edanur saluk,eaudeluna,82
9375,eda kurt,Emineedahasta,90


In [18]:
tu=twitter_users.drop(labels=["match_name","match_ratio"], axis=1)

In [19]:
twitter_users = sehir_matches_df.merge(tu, on="screen_name")
index = twitter_users.tw_id.apply(truncate)
twitter_users["id"] = index
twitter_users.set_index("id", inplace=True)
twitter_users.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
345917380,mehmet baran85,hyrlisibeglm,67,345917400.0,EMBaran,tr,1410.0,211.0,False
95451420,rukiye ozturk,yesimgizer,68,95451420.0,ye≈üo,tr,473.0,1637.0,False
748087265,zulal icoz,ilci_zulal,74,7.480873e+17,Z√ºlal Ilci,en,9.0,61.0,False
388385729,fatma eslem akbiyik,eslemuzunkaya,86,388385700.0,eslem yƒ±ldƒ±z,tr,8.0,245.0,False
294990622,yasemin atagul,YSMNSLK,90,294990600.0,yasemin,tr,162.0,605.0,False


In [20]:
twitter_users.to_csv("../datasets/twitter_users.csv", index_label="id")

In [15]:
twitter_users = pd.read_csv("../datasets/twitter_users.csv", index_col="id")
twitter_users.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
256178973,busra ozzengin,busraozzngn,100,2561790000.0,Busra Ozzengin,tr,26.0,34.0,False
963898159,ahmet seba,ahmetSBA,100,9.638982e+17,Ahmet SebA,en,48.0,432.0,False
736098205,hasan esenkoylu,HasanEsenkoylu,93,7.360982e+17,Hasan ESENK√ñYL√ú,tr,2.0,72.0,False
133943907,yunus demir,AvYunusDemirBey,100,1339439000.0,Yunus Demir,tr,95.0,151.0,False
141622845,merve duman,merveduman294,100,1416228000.0,merve duman,tr,3.0,16.0,False


In [16]:
twitter_users.loc[291122559]

match_name            ammar rasid
screen_name        AmmarRashed_MB
match_ratio                    87
tw_id                 2.91123e+08
name                 Ammar Rashed
lang                           en
followers_count               385
friends_count                 335
is_org                      False
Name: 291122559, dtype: object

In [17]:
filtered_twu = twitter_users[twitter_users.match_ratio>90]
filtered_twu = filtered_twu.append(twitter_users.loc[291122559])
filtered_twu.to_csv("../datasets/twitter_users.csv", index_label="id")

In [18]:
len(twitter_users), len(filtered_twu)

(1597, 1597)

In [19]:
filtered_twu[filtered_twu.is_org==True]

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
111082356,sehir sehir,sehirlibrary,95,1110824000.0,Sehir Library,tr,911.0,43.0,True
106086098,√•¬ûehir √£¬úniversitesi,SehirUniversite,94,106086100.0,≈ûEHƒ∞R √úniversitesi,tr,12195.0,166.0,True
819269230,sehir sehir,sehir_alumni,95,8.192692e+17,Sehir Alumni,en,275.0,5.0,True
841682908,! sehir kariyer fest,sehircareerfest,94,8.416829e+17,≈ûehir Kariyer Fest,tr,241.0,220.0,True
847741204,sehir cycling club,SehirCycling,94,8.477412e+17,≈ûehir Cycling Club,tr,123.0,92.0,True
174415744,sehir sehir,sehiredebiyat,95,174415700.0,sehir edebiyat,en,2070.0,287.0,True
856911339,sehir sehir,sehirwibclub,95,8.569113e+17,Sehir WIB Club,tr,61.0,140.0,True
803581588,sehir dance club,SehirDance,100,8.035816e+17,Sehir Dance Club,tr,38.0,219.0,True
283216865,√•¬ûehir meydan√§¬±,sehirmeydani,92,2832169000.0,≈ûehir Meydanƒ±,tr,66.0,41.0,True
271718939,√•¬ûehir √£¬úniversitesi,SehirDedikodu,97,2717189000.0,≈ûehir √úniversitesi,tr,150.0,789.0,True


In [20]:
filtered_twu.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
141108970,aynur yilmaz,aynuryl66097645,92,1411090000.0,aynur yƒ±lmaz,tr,12.0,18.0,False
338613789,elif sezer,el_sezer,100,3386138000.0,Elif Sezer,en,56.0,60.0,False
286270314,zeynep demir,zeyy_demir,100,2862703000.0,Zeynep Demir,tr,170.0,119.0,False
350791386,ceylin yagmur can,ceylinyagmurcan,94,350791400.0,Ceylin Yaƒümur Can,tr,344.0,89.0,False
867510112,ahmet mursel dogan,AhmetMrselDGN42,91,8.675101e+17,Ahmet M√ºrsel DOƒûAN,tr,5.0,46.0,False


In [20]:
filtered_twu[filtered_twu.screen_name.str.contains("kral")]

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
14668733,ahmet bulut,kral,100,14668733.0,Ahmet Bulut,en,243.0,62.0,False


# Random Connections

In [21]:
from datetime import datetime

In [22]:
def present_in_date(changes_dates, queried_date):
    """
    changes_dates = {d1:True, d2:False, d3:True} connection added or removed
    """
    if changes_dates:
        str2date = lambda strdate: datetime.strptime(strdate, '%Y.%m.%d')  # 2018.05.08
        changes = sorted(changes_dates,key=lambda d: str2date(d))
        queried_date = datetime.strptime(queried_date, '%Y.%m.%d')
        present = False
        for d in changes:
            if queried_date < str2date(d):
                break
            present = changes_dates[d]
        return present
    else:
        return False

In [23]:
def connections_dict2df(cons):
    d = {i:[k[0],k[1], cons[k]] for i,k in enumerate(cons)}
    df = pd.DataFrame.from_dict(d, orient='index')
    df.columns=["from_user_id","to_user_id","formation"]
    return df

In [25]:
def generate_cons_dict(nodes, edges_per_date=None, portion=0.5, dates=["2018.05.%02d"%x for x in range(1,31)], remove_edge_prob=0.5):
    dates = sorted(dates)
    start = time.time()
    if edges_per_date is None:
        edges_per_date = int(len(filtered_twu)*portion)
        
    random_connections = dict()  # {(from, to):{date1:True, date2:False, ...}}
    connection_indices = {}
    
    for d in dates:
        for i in range(edges_per_date):
            from_ = nodes.sample(1).index[0]
            to = nodes.sample(1).index[0]
            if from_!=to:
                random_connections.setdefault((from_, to), {})
                das = random_connections[(from_, to)]
                pres = present_in_date(das, d)
                if pres:
                    if np.random.random()>remove_edge_prob:
                        das[d] = False
                else:
                    das[d] = True
                random_connections[(from_, to)] = das
    
    print("took: ",time.time()-start)
    return random_connections

In [26]:
cons_dict = generate_cons_dict(filtered_twu, portion=1)

took:  29.168699026107788


In [27]:
len(cons_dict)

47392

In [28]:
start = time.time()
random_connections = connections_dict2df(cons_dict)
print("took:", time.time()-start)
random_connections.to_csv("../datasets/random_connections.csv")
random_connections.head(5)

took: 0.23282909393310547


Unnamed: 0,from_user_id,to_user_id,formation
0,874031650,736998781,{'2018.05.18': True}
1,625979067,634113441,{'2018.05.12': True}
2,353008893,328899293,{'2018.05.06': True}
3,79819842,201160937,{'2018.05.13': True}
4,162427198,236914275,{'2018.05.16': True}


In [31]:
random_connections.loc[266]

from_user_id                                    274074220
to_user_id                                       88899411
formation       {'2018.05.05': True, '2018.05.18': False}
Name: 266, dtype: object

## Construct the network

In [32]:
user_connections.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
7462,2968756413,106086098,{'2018.05.08': True}
11423,77451159,106086098,{'2018.05.08': True}
11330,551437764,106086098,{'2018.05.08': True}
39753,3120550907,2360133031,{'2018.05.08': True}
8820,1022461759,106086098,{'2018.05.08': True}


In [33]:
random_connections["first_date"] = random_connections.formation.apply(
    lambda dates: present_in_date(dates, "2018.05.01"))
random_connections.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation,first_date
3892,952632696,852552092,{'2018.05.29': True},False
41511,540087524,784689086,{'2018.05.25': True},False
29090,928367738,293976465,{'2018.05.17': True},False
25072,758889025,271521207,{'2018.05.19': True},False
45353,274892363,782227273,{'2018.05.21': True},False


In [34]:
G = nx.DiGraph()
# for _, row in user_connections.iterrows():
for _, row in random_connections[random_connections.first_date==True].iterrows():    
    from_ = truncate(row["from_user_id"])
    to = truncate(row["to_user_id"])
    if from_ in filtered_twu.index and to in filtered_twu.index:
        G.add_edge(from_, to)

In [35]:
augs = ["name", "screen_name","match_name", "followers_count","friends_count", "lang"]
for node in G.nodes():
    user = twitter_users.loc[node]
    for aug in augs:
        if type(user[aug])==str:
            m = clean(user[aug])
        else:
            m = user[aug]
        G.nodes[node][aug] = m

In [36]:
len(G.nodes())

1389

In [37]:
len(G.edges())

1597

In [38]:
for ix,deg in G.degree(G.nodes()):
    G.node[ix]['degree'] = deg
    G.node[ix]['parity'] = (1-deg%2)
    
for ix,in_deg in G.in_degree(G.nodes()):
    G.node[ix]['in_degree'] = in_deg
    
for ix,out_deg in G.out_degree(G.nodes()):
    G.node[ix]['out_degree'] = out_deg

In [39]:
evc = nx.eigenvector_centrality(G, max_iter=500)
closeness = nx.closeness_centrality(G)
betweenness = nx.betweenness_centrality(G)
pagerank = nx.pagerank(G)

In [40]:
metrics = {"eigenvector_centrality":evc,
           "closeness_centrality":closeness,
          "betweenness":betweenness,
          "pagerank":pagerank}

In [41]:
for metric_name, metric in metrics.items():
    for ix,v in metric.items():
        G.nodes[ix][metric_name] = v

In [42]:
G.nodes[754361184]

{'betweenness': 0.0,
 'closeness_centrality': 0.0028818443804034585,
 'degree': 2,
 'eigenvector_centrality': 2.2940130705341977e-96,
 'followers_count': 0.0,
 'friends_count': 53.0,
 'in_degree': 2,
 'lang': '',
 'match_name': 'merve yilmaz',
 'name': 'merve ylmaz',
 'out_degree': 0,
 'pagerank': 0.0016856160998238972,
 'parity': 1,
 'screen_name': 'mrv_ylmaz_'}

In [43]:
list(G.nodes(data=True))[0]

(825982985,
 {'betweenness': 3.27246207580061e-05,
  'closeness_centrality': 0.0038812718927992702,
  'degree': 3,
  'eigenvector_centrality': 4.4930069967025043e-85,
  'followers_count': 195.0,
  'friends_count': 184.0,
  'in_degree': 2,
  'lang': '',
  'match_name': 'zeliha yuksel',
  'name': 'zeliha yuksel',
  'out_degree': 1,
  'pagerank': 0.0007026104872075536,
  'parity': 0,
  'screen_name': 'zelisyksl6153'})

In [44]:
import json
from networkx.readwrite import json_graph
data = nx.node_link_data(G)
with open('../REST/static/networks/twitter_users_graph2.json', 'w') as f:
    json.dump(data, f, indent=4)

## Calculating Homophily

In [45]:
def homophily(nw, metric="lang"):
    langs_probs = dict()
    for n in nw.nodes():
        user = nw.nodes[n]
        langs_probs.setdefault(user[metric], 0)
        langs_probs[user[metric]] += 1
    heterogeneity_fraction_norm = 1 - sum(
        [(float(i)/len(nw.nodes()))**2 for i in langs_probs.values()])
    cross_edges = sum(
        [int(nw.nodes[f][metric] != nw.nodes[t][metric] ) for f,t in nw.edges()])
    cross_metric_ratio = cross_edges/float(len(nw.edges()))
    print("cross-metric edges ratio: ", cross_metric_ratio)
    print("Heterogeneity Fraction Norm", heterogeneity_fraction_norm)
    return cross_metric_ratio < heterogeneity_fraction_norm

In [46]:
homophily(G)

cross-metric edges ratio:  0.007514088916718848
Heterogeneity Fraction Norm 0.0071735081927786215


False