In [1]:
import psycopg2

import pandas as pd
import numpy as np

from copy import deepcopy

import ast

import networkx as nx
import time, unicodedata
import itertools

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from joblib import Parallel, delayed

In [2]:
def clean(name, min_len=5, junk_replacement=''):
    """Fold `name` to lower-case ASCII, or return `junk_replacement`.

    Parameters
    ----------
    name : str
        Text to normalise. Any non-string value is treated as junk.
    min_len : int
        Cleaned strings shorter than this are treated as junk.
    junk_replacement : str
        Value returned for junk input.

    Returns
    -------
    str
        The accent-stripped, lower-cased ASCII form of `name`, or
        `junk_replacement`.
    """
    if not isinstance(name, str):
        return junk_replacement
    # NFKD has no decomposition for the Turkish dotless i (U+0131), so the
    # ASCII encode below would silently delete it ("Kıtay" -> "ktay").
    # Map it to a plain "i" before normalising.
    folded = name.replace('\u0131', 'i')
    try:
        cleaned = unicodedata.normalize('NFKD', folded).encode('ascii', 'ignore').lower().decode("ascii")
    except TypeError:
        return junk_replacement
    if len(cleaned) < min_len:
        return junk_replacement
    return cleaned

def get_matches_edit_distance(item, choices, limit, scorer=fuzz.token_sort_ratio):
    """Fuzzy-match `item` against `choices` via fuzzywuzzy's ``process.extract``.

    At most `limit` results are requested, scored with `scorer`
    (``fuzz.token_sort_ratio`` unless overridden). The result of
    ``process.extract`` is returned unchanged.
    """
    best_matches = process.extract(item, choices, scorer=scorer, limit=limit)
    return best_matches
# Module-level tally of how many Twitter names have been matched so far.
counter = 0
def get_sehir_twitter_matches(twitter_users, sehir_directory, limit=1):
    """Yield ``(screen_name, matches)`` for every row of `twitter_users`.

    Parameters
    ----------
    twitter_users : pandas.DataFrame
        Must provide ``screen_name`` and ``name`` columns.
    sehir_directory : object
        Accepted for interface compatibility; not read here. The candidate
        names come from the module-level ``fullnames`` list instead.
    limit : int
        Maximum number of matches requested per Twitter name.

    Yields
    ------
    tuple
        ``(screen_name, matches)`` where ``matches`` is whatever
        ``get_matches_edit_distance`` returns for that row's ``name``.

    Side effects
    ------------
    Increments the module-level ``counter`` once per processed row.
    """
    global fullnames, counter
    # Walk the two columns in lock-step rather than re-indexing by
    # screen_name and doing a .loc lookup per row: a repeated screen_name
    # would make that lookup return a Series instead of one name.
    name_pairs = zip(twitter_users['screen_name'], twitter_users['name'])
    for screen_name, twitter_name in name_pairs:
        match_name = get_matches_edit_distance(twitter_name, fullnames, limit)
        counter += 1
        yield (screen_name, match_name)
        
def filter_matches_by_threshold(matches_dict, threshold=0):
    """Keep only the matches whose score is strictly above `threshold`.

    `matches_dict` maps a screen name to a list of ``(match, score)`` pairs.
    Screen names left with no surviving pair are omitted from the result.
    """
    survivors = (
        (screen_name, [(match, score) for match, score in matches if score > threshold])
        for screen_name, matches in matches_dict.items()
    )
    return {screen_name: kept for screen_name, kept in survivors if kept}

def get_matches_dataframe(twitter_users, sehir_directory, threshold=0, limit=1):
    """Match every Twitter user and tabulate the matches above `threshold`.

    Collects the ``(screen_name, matches)`` pairs produced by
    ``get_sehir_twitter_matches``, filters them with
    ``filter_matches_by_threshold`` and returns a DataFrame with one row per
    surviving screen name and the columns ``screen_name`` and ``match_name``
    (the list of surviving ``(match, score)`` pairs).
    """
    raw_matches = dict(
        get_sehir_twitter_matches(twitter_users, sehir_directory, limit=limit))
    kept = filter_matches_by_threshold(raw_matches, threshold=threshold)

    return pd.DataFrame({'screen_name': list(kept),
                         'match_name': list(kept.values())})

In [3]:
# The database password is deliberately NOT written in the notebook.
# libpq (which psycopg2 wraps) fills in a missing password from the
# PGPASSWORD environment variable or from the ~/.pgpass file, so export one
# of those before running this cell.
connection = psycopg2.connect('dbname=link_formation host=localhost user=postgres')

# Every row of the twitter_user table.
twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)

# Every row of the twitter_connection table, minus its "id" column.
user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

In [4]:
# Preview five randomly chosen rows of the freshly loaded twitter_users table.
twitter_users.sample(5)

Unnamed: 0,id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
6945,821611340,Şeyma Şahin,seymaasahinn,tr,Seyma Sahin,87,190,188
5311,1633657506,Beyz4 Özdemir,BeyzaOzd,tr,Beyza Goz,82,115,194
9291,1638050228,atılım dersaneleri,atlmdersaneleri,tr,Esra Ileri,63,10,15
1448,881386223029407749,Mustafa Metin,MstfMetin42,tr,Ahmet Metin,64,644,2063
12626,19529403,Necati Özkan,NecatiOzkan,tr,Hatice Ozkan,70,8772,1789


In [5]:
def no_sehir(x):
    """Return True when the cleaned form of `x` contains the substring "sehir".

    NOTE: despite its name, this predicate is True for names that DO
    mention "sehir"; its result is stored in the ``is_org`` column.
    """
    return "sehir" in clean(x)

In [6]:
# Reload the user table and keep only rows whose stored match_name is longer
# than six characters. `.where` blanks the rejected rows to NaN and `.dropna`
# then removes them, along with any row that already contained a NaN.
# (Filters on match_ratio > 85 and on names containing "sehir" were tried
# here earlier and are left disabled.)
all_users = pd.read_sql("SELECT * FROM twitter_user", connection)
has_long_match = all_users.match_name.str.len() > 6
twitter_users = all_users.where(has_long_match).dropna().set_index("id")
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3182932000.0,Marul.com,marulcomtr,tr,Ismail Comert,61.0,9006.0,2693.0
2347433000.0,özgürcee,izindeyizzz,tr,Fatih Celalettin Deniz,54.0,2.0,37.0
381177700.0,Haşmet Babaoğlu,HasmetBABA,tr,Samet Akbaba,73.0,179361.0,1248.0
2234937000.0,Mustafa Şenel,MusenelSENEL,tr,Merve Senel,70.0,11.0,18.0
2179560000.0,ernekalan,ernekalan,tr,Enes Kaplan,80.0,85.0,327.0


In [7]:
# Flag accounts whose cleaned display name contains "sehir".
twitter_users["is_org"] = twitter_users["name"].map(no_sehir)
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
54954460.0,Melih Cılga,melihcilga,en,Melihcan Dodurgali,67.0,478.0,300.0,False
1944557000.0,nevin dermanlı,nevindermanl,tr,Nevin Durmaz,75.0,53.0,400.0,False
2615032000.0,Sarra-Ketevan Potova,SPotova,en,Ermioni Apostolaki,64.0,48.0,271.0,False
3130168000.0,Mete Kurt,MeteMk2,tr,Meltem Kilic,66.0,114.0,705.0,False
8.050941e+17,Paris Rehberim,paris_rehberim,tr,! SEPHER,60.0,176.0,981.0,False


In [9]:
# Split the table on the is_org flag; the flag column itself is dropped from
# both halves.
org_rows = twitter_users.is_org == True
user_rows = twitter_users.is_org == False
sehir_orgs = twitter_users[org_rows].drop(labels=["is_org"], axis=1)
sehir_users = twitter_users[user_rows].drop(labels=["is_org"], axis=1)

In [10]:
# Spot-check five random accounts that were flagged is_org == True.
sehir_orgs.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1222491000.0,Şehir FM,sehir_fm,tr,Sehir FM,88.0,1018.0,85.0
7.633393e+17,Şehir'16 Girişliler,Sehir16Giris,tr,Sehir Go,68.0,46.0,129.0
7.963099e+17,ŞehirMYO,SehirMYO,tr,Sehir FM,75.0,127.0,35.0
2362373000.0,isim şehir insan,isi_radyo,tr,! Isif,68.0,80.0,89.0
8.355337e+17,IEEE Şehir WIE,IEEESEHIRWIE,tr,Aisec Sehir,61.0,31.0,29.0


In [11]:
# Spot-check five random accounts that were flagged is_org == False.
sehir_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1543669000.0,Erkan Ahiska,erkan_ahiska,tr,Furkan Ahiska,80.0,1842.0,1925.0
7.74255e+17,Serhan Doğrusöz,SerhanDogrusoz,tr,Serhan Doner,69.0,0.0,99.0
193458500.0,U.,usamevahap,tr,! Some,68.0,309.0,1289.0
430180500.0,Emre Bıyık,embiyoo,tr,! MYO Akademi,61.0,11.0,633.0
238491400.0,sıla ulutaş,silaulutas,tr,Ilayda Ulubas,70.0,39.0,294.0


In [12]:
def truncate(x):
    """Return the first nine characters of ``str(int(x))`` as an int.

    Values whose integer form has nine characters or fewer are returned
    unchanged (as int).
    """
    digits = str(int(x))
    return int(digits[:9])

In [13]:
# Re-key the frame by the truncated id; the original id survives as the
# "tw_id" column.
resetted = twitter_users.reset_index().rename(columns={"id": "tw_id"})
index = resetted.tw_id.apply(truncate).rename("id")
resetted.index = index

twitter_users = resetted
twitter_users.sample(5)

Unnamed: 0_level_0,tw_id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
269320199,2693202000.0,ben sen,MuthisBen,tr,Umut Mise,67.0,23.0,382.0,False
71517426,71517430.0,geeseam,geeseam,tr,Enes Elamri,67.0,49.0,637.0,False
995432227,995432200.0,NGBEDE EDACHE GREAT,NGBEDEDACH,en,Fiona Gedeon Achi,63.0,8.0,39.0,False
215551583,2155516000.0,selcukyeni,selcukyeni1,en,Selcuk Yasan,70.0,212.0,1334.0,False
353565345,353565300.0,Resul Bozyel,ResulBozyel1,tr,Resul Oguz,64.0,73.0,819.0,False


In [14]:
# Load the three contact columns used below and turn missing cells into
# empty strings.
sehir_directory = pd.read_csv(
    '../datasets/contacts.csv',
    encoding="ISO-8859-1",
    usecols=['First Name', 'Last Name', 'Primary Email'],
).replace(np.nan, '', regex=True)

In [15]:
# Lower-cased "first last" strings, one per directory row; these are the
# candidates the fuzzy matcher searches.
first_names = sehir_directory['First Name']
last_names = sehir_directory['Last Name']
fullnames = [' '.join((first, last)).lower()
             for first, last in zip(first_names, last_names)]

In [16]:
# Cut twitter_users into 8 contiguous row slices and run
# get_matches_dataframe on each slice in parallel (n_jobs=-1).
start = time.time()
chunk_len = len(twitter_users) / 8
chunks = (twitter_users[int(i * chunk_len):int((i + 1) * chunk_len)]
          for i in range(8))
sehir_matches = Parallel(n_jobs=-1)(
    delayed(get_matches_dataframe)(chunk, sehir_directory) for chunk in chunks)
print("took: ", time.time()-start)





took:  479.7198348045349


In [17]:
# Stitch the per-slice results into one frame with a fresh 0..n-1 index.
sehir_matches_df = pd.concat(sehir_matches).reset_index(drop=True)
match_count = len(sehir_matches_df)
print("There are {} matches".format(match_count))
sehir_matches_df.sample(5)

There are 20674 matches


Unnamed: 0,match_name,screen_name
1611,"[(rfaat al saman, 64)]",AsemAljaradat
10366,"[(se test, 56)]",SinefestoTV
16347,"[(yasin salmaz, 59)]",palomakitap
12368,"[( bilgi edinme, 59)]",bilgidt
13808,"[(narin top, 62)]",NATO


In [18]:
# Each match_name cell holds a list of (name, score) pairs; keep the first
# pair only and split it into a match_ratio column and a plain match_name.
# Not re-runnable: after this cell match_name holds strings, not lists.
best_pairs = sehir_matches_df.match_name.apply(lambda pairs: pairs[0])
sehir_matches_df['match_ratio'] = best_pairs.apply(lambda pair: pair[1])
sehir_matches_df.match_name = best_pairs.apply(lambda pair: pair[0])
sehir_matches_df.sample(5)

Unnamed: 0,match_name,screen_name,match_ratio
5038,ahmet sahin,sahinatli49,67
19753,! cs-search,switchcasenet,60
3456,vehbi metin demir,EgitimSinavRehb,57
9372,ayse nur alkan,aysenurakca,85
14232,nur eksi,golenarr,62


In [19]:
# Drop the match columns that came from the database.
stale_match_columns = ["match_name", "match_ratio"]
tu = twitter_users.drop(labels=stale_match_columns, axis=1)

In [20]:
# Join the recomputed matches onto the user attributes by screen_name, then
# key the result by the truncated tw_id (stored under the index name "id").
merged = sehir_matches_df.merge(tu, on="screen_name")
index = merged.tw_id.apply(truncate)
twitter_users = merged.assign(id=index).set_index("id")
twitter_users.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
473067660,ali sarman,Aziz__Salman,76,473067700.0,Aziz Salman,tr,41.0,281.0,False
749357078,halil buyuksaka,HBuyukbayraktar,74,749357100.0,Halis buyukbayraktar,tr,13.0,69.0,False
101751128,merve akcay,k_munevver,67,1017511000.0,Münevver Kıtay,tr,60.0,299.0,False
595551414,mustafa yaylali,yavasmu,79,595551400.0,Mustafa Yavaş,en,141.0,798.0,False
589915132,cigdem koruk,morukbukowski,60,589915100.0,pismoruk,tr,35.0,291.0,False


In [21]:
# Persist the re-matched user table; the index is written under the label "id".
# NOTE(review): a later cell writes filtered_twu to this very same path, so
# this full export gets overwritten — confirm that is intended.
twitter_users.to_csv("../datasets/twitter_users.csv", index_label="id")

In [24]:
# Keep only matches scoring above 85 and export them.
# NOTE(review): this writes to the same CSV path as the unfiltered export in
# the previous cell and therefore replaces it — confirm that is intended.
high_confidence = twitter_users.match_ratio > 85
filtered_twu = twitter_users[high_confidence]
filtered_twu.to_csv("../datasets/twitter_users.csv", index_label="id")

In [25]:
# Row counts before and after the match_ratio > 85 filter.
len(twitter_users), len(filtered_twu)

(20674, 2090)

In [26]:
# Spot-check five random rows that survived the match_ratio > 85 filter.
filtered_twu.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
505464968,omercan kacar,omercan94,92,505464968.0,Ömercan KAÇAR,tr,128.0,215.0,False
287896846,muhammet emin tak,demirtash_37,86,287896846.0,Muhammet DEMİRTAŞ,tr,630.0,2786.0,False
236749440,ahmet er,Ahmet_Mert79,89,236749440.0,Ahmet Mert,tr,635.0,2565.0,False
477970480,muhammed said baktir,MuhammedBaktir,95,477970480.0,Muhammed Said Baktır,tr,0.0,192.0,False
741129642,sacit cesitcioglu,sacitces,100,741129642.0,Sacit Cesitcioglu,tr,137.0,24.0,False


## Construct the network

In [114]:
# Preview five random rows of user_connections (from_user_id -> to_user_id).
user_connections.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
5168,756236811190624257,106086098,{'2018.05.08': True}
15566,2805326734,3892757176,{'2018.05.08': True}
21088,1557759132,1110823566,{'2018.05.08': True}
9245,1222821175,106086098,{'2018.05.08': True}
26859,174415744,455903388,{'2018.05.08': True}


In [112]:
# Build a directed graph over truncated user ids, keeping an edge only when
# both endpoints appear in the index of filtered_twu.
G = nx.DiGraph()
endpoint_pairs = zip(user_connections["from_user_id"],
                     user_connections["to_user_id"])
for raw_from, raw_to in endpoint_pairs:
    from_, to = truncate(raw_from), truncate(raw_to)
    if from_ in filtered_twu.index and to in filtered_twu.index:
        G.add_edge(from_, to)

In [68]:
# Copy selected user columns onto each graph node as node attributes.
augs = ["name", "screen_name","match_name", "followers_count","friends_count", "lang"]
for node in G.nodes():
    user = twitter_users.loc[node]
    for aug in augs:
        value = user[aug]
        if isinstance(value, str):
            # min_len=0: clean()'s default of 5 would replace short strings
            # with '' -- in particular the two-letter `lang` codes that the
            # homophily computation reads.
            m = clean(value, min_len=0)
        else:
            m = value
        G.nodes[node][aug] = m

In [115]:
# Number of nodes currently in G.
len(G.nodes())

0

In [54]:
# Number of directed edges currently in G.
len(G.edges())

2243

In [116]:
# Store degree statistics on every node. `G.nodes[...]` is used (not the
# `G.node[...]` alias, which NetworkX 2.4 removed) to match the other cells.
for ix, deg in G.degree(G.nodes()):
    G.nodes[ix]['degree'] = deg
    # parity is 1 for an even degree and 0 for an odd one.
    G.nodes[ix]['parity'] = (1 - deg % 2)

for ix, in_deg in G.in_degree(G.nodes()):
    G.nodes[ix]['in_degree'] = in_deg

for ix, out_deg in G.out_degree(G.nodes()):
    G.nodes[ix]['out_degree'] = out_deg

In [117]:
# Per-node centrality scores on G, all computed with NetworkX's default
# parameters. Each result is a mapping keyed by node (it is iterated with
# .items() when the scores are attached to the nodes).
evc = nx.eigenvector_centrality(G)
closeness = nx.closeness_centrality(G)
betweenness = nx.betweenness_centrality(G)
pagerank = nx.pagerank(G)

In [118]:
# Node-attribute name -> per-node score mapping for each centrality measure.
metrics = dict(
    eigenvector_centrality=evc,
    closeness_centrality=closeness,
    betweenness=betweenness,
    pagerank=pagerank,
)

In [125]:
# Write every centrality score onto its node under the metric's name.
for metric_name, scores in metrics.items():
    for node_id in scores:
        G.nodes[node_id][metric_name] = scores[node_id]

In [130]:
# Inspect the attribute dict stored on one node (hard-coded truncated id).
G.nodes[595297280]

{'betweenness': 0.0,
 'closeness_centrality': 0.0,
 'degree': 1,
 'eigenvector_centrality': 8.290679621515326e-35,
 'followers_count': 71.0,
 'friends_count': 310.0,
 'in_degree': 0,
 'match_name': 'ayhan turkoglu',
 'name': 'talha turkoglu',
 'out_degree': 1,
 'pagerank': 0.0008750669358888037,
 'parity': 0,
 'screen_name': 'talha_turkoglu'}

In [120]:
# First (node, attributes) pair, as a sanity check on the attached data.
list(G.nodes(data=True))[0]

(595297280,
 {'betweenness': 0.0,
  'closeness_centrality': 0.0,
  'degree': 1,
  'eigenvector_centrality': 8.290679621515326e-35,
  'followers_count': 71.0,
  'friends_count': 310.0,
  'in_degree': 0,
  'match_name': 'ayhan turkoglu',
  'name': 'talha turkoglu',
  'out_degree': 1,
  'pagerank': 0.0008750669358888037,
  'parity': 0,
  'screen_name': 'talha_turkoglu'})

In [122]:
import json
from networkx.readwrite import json_graph

# Serialise G in node-link form and write it out as indented JSON.
data = nx.node_link_data(G)
graph_path = '../REST/static/networks/twitter_users_graph2.json'
with open(graph_path, 'w') as out_file:
    json.dump(data, out_file, indent=4)

## Calculating Homophily

In [75]:
def homophily(nw, metric="lang"):
    """Compare the cross-group edge fraction of `nw` with its heterogeneity.

    Parameters
    ----------
    nw : graph
        Object exposing ``nodes()``, ``nodes[n]`` (attribute mapping) and
        ``edges()`` yielding ``(from, to)`` pairs, e.g. a NetworkX graph.
    metric : str
        Node attribute that defines the groups. Every node must carry it.

    Returns
    -------
    (float, float)
        ``(cross_edge_fraction, heterogeneity)`` where

        * ``cross_edge_fraction`` is the share of edges whose two endpoints
          have different `metric` values;
        * ``heterogeneity`` is ``1 - sum(p_i ** 2)`` with ``p_i`` the share
          of nodes in group ``i``.

        A quantity that is undefined (no edges, or no nodes at all) is
        returned as NaN instead of raising ZeroDivisionError.
    """
    undefined = float("nan")
    node_count = len(nw.nodes())
    if node_count == 0:
        return undefined, undefined

    # Number of nodes per distinct value of `metric`.
    group_sizes = dict()
    for n in nw.nodes():
        value = nw.nodes[n][metric]
        group_sizes[value] = group_sizes.get(value, 0) + 1

    heterogeneity_fraction_norm = 1 - sum(
        (float(size) / node_count) ** 2 for size in group_sizes.values())

    edge_count = len(nw.edges())
    if edge_count == 0:
        return undefined, heterogeneity_fraction_norm

    cross_edges = sum(
        1 for f, t in nw.edges() if nw.nodes[f][metric] != nw.nodes[t][metric])
    return cross_edges / float(edge_count), heterogeneity_fraction_norm

In [76]:
# Homophily of G on the default "lang" attribute:
# (cross-group edge fraction, heterogeneity baseline).
homophily(G)

(0.00490414623272403, 0.010531550932622258)