In [1]:
import psycopg2

import pandas as pd
import numpy as np

from copy import deepcopy

import ast

import networkx as nx
import time, unicodedata
import itertools

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from joblib import Parallel, delayed

In [2]:
def clean(name, min_len=5, junk_replacement=''):
    try:
        cleaned = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').lower().decode("ascii")
    except TypeError:
        return junk_replacement
    if len(cleaned) < min_len:
        return junk_replacement
    return cleaned

def get_matches_edit_distance(item, choices, limit, scorer=fuzz.token_sort_ratio):
    return process.extract(item, choices, limit=limit, scorer=scorer)
counter = 0
def get_sehir_twitter_matches(twitter_users, sehir_directory, limit=1):
    global fullnames, counter
    twitter_user_by_screen_name = twitter_users.set_index('screen_name')
    start = time.time()
    for screen_name in twitter_users['screen_name']:
        twitter_name = twitter_user_by_screen_name.loc[screen_name]['name']
        match_name = get_matches_edit_distance(twitter_name, fullnames, limit)
        counter += 1
#         if counter %100 == 0:
#             print(counter, "out of ", len(twitter_users))
#             start_ = time.time()
#             print(start_-start, "seconds")
#             start = start_
        yield (screen_name, match_name)
        
def filter_matches_by_threshold(matches_dict, threshold=0):
    filtered_dict = dict()
    for screen_name, matches in matches_dict.items():
        filtered = [(match, score) for match, score in matches if score > threshold]
        
        if filtered:
            filtered_dict[screen_name] = filtered
        
    return filtered_dict

def get_matches_dataframe(twitter_users, sehir_directory, threshold=0, limit=1):
    matches = {screen_name : match_name for screen_name, match_name in 
               get_sehir_twitter_matches(twitter_users, sehir_directory, limit=limit)}
    
    filtered_matches = filter_matches_by_threshold(matches, threshold=threshold)
    screen_names = filtered_matches.keys()
    return pd.DataFrame({'screen_name': list(screen_names),
                         'match_name': [filtered_matches[screen_name] for screen_name in screen_names]})

In [3]:
connection = psycopg2.connect('dbname=link_formation host=localhost user=postgres password=1_sehir_1')

twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)

user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

In [4]:
twitter_users.sample(5)

Unnamed: 0,id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
15280,842967084,sultan güzeldal,sultangzl,tr,Sultan Altinsoy,70,117,309
3469,376935491,Celal AKIL,kaycemin,tr,Turkay Cem Ozbek,68,194,724
7614,1040273364,Davet Öğr. Topluluğu,davettoplulugu,tr,! MLL,60,228,156
15060,907972666342158336,Ekin bulut,eknekneknekn,tr,Melike Eken,61,21,266
3857,3289880226,TurkMigWatch,turkmigwatch,tr,Onur Kamat,55,1575,2042


In [5]:
no_sehir = lambda x:"sehir" in clean(x)

In [81]:
twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)
twitter_users = twitter_users.where(twitter_users.match_name.str.len()>6)\
                             .dropna().set_index("id")
#         .where(twitter_users.match_ratio>85)
# .where(~twitter_users.name.str.contains("(?i)sehir"))\
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
467497600.0,Animder,animder,tr,Almula Camdereli,64.0,255.0,147.0
267138100.0,öznur hasanoğlu,oznurhasanoglu,tr,Oznur Hasanoglu,97.0,264.0,178.0
767649700.0,Ayşe Müge Hatipoğlu,a_mugeline,tr,Cagla Ince,60.0,21.0,69.0
1613436000.0,Merve Tekinir,mervetekinir,tr,Merve Ertekin,80.0,135.0,337.0
253259300.0,ahmet uğurlu,siyadari,tr,Kamila Iskyandarova,68.0,8.0,48.0


In [82]:
twitter_users["cleaned_name"] = twitter_users.name.apply(no_sehir)
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,cleaned_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9.265097e+17,LIMUZIN KIRALAMA,limuzinx,tr,Adel Salimullin,68.0,602.0,2670.0,False
1599142000.0,nes,nslhnsrkn06,tr,Neslihan Keskin,62.0,91.0,469.0,False
75852730.0,VEHBİ MEŞİN,MesinlerinOglu,tr,Melih Terzioglu,69.0,1147.0,1125.0,False
307866600.0,Enes Tursun,Enes_Tursun,tr,Enes Uzun,70.0,17.0,234.0,False
1562567000.0,Emel,emelerguden,tr,Emel Elgun,76.0,2.0,5.0,False


In [83]:
sehir_orgs = twitter_users[twitter_users.cleaned_name==True].drop(labels=["cleaned_name"], axis=1)
sehir_users = twitter_users[twitter_users.cleaned_name==False].drop(labels=["cleaned_name"], axis=1)

In [84]:
sehir_orgs.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
929518800.0,Bahçeşehir Final,fddbahcesehir,tr,Sehir Dance,71.0,159.0,150.0
8.062185e+17,Şehir İnsani Yardım,Sehirinsaniyrdm,tr,Sehir AID,67.0,56.0,1.0
884619600.0,Sehirinternational,SehirIC,en,Sehir Sehir,86.0,51.0,84.0
2328412000.0,ŞEHİR YBM | Yaşam Boyu Öğrenim Merkezi,SehirYBM,tr,Sehir FM,75.0,127.0,269.0
2207631000.0,Şehir Genç Yeşilay,sehiryesilay,tr,Genc Yesilay,67.0,542.0,85.0


In [85]:
sehir_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7.697966e+17,Hûmeyra,humeyra_ald,tr,Humeyra Bildik,72.0,76.0,400.0
2435141000.0,Anas Faisal,AnasObaidi,en,Anas Khalid,67.0,18.0,77.0
2204638000.0,ali vantal,alivantal,tr,Aslihan Al,74.0,2.0,18.0
2647863000.0,Tweet Fenomens,tfenos,tr,Ayten Ozturk,60.0,66.0,1602.0
1379474000.0,Selin Dizbay,Selindizbay,tr,Selin Dinc,67.0,243.0,558.0


In [86]:
truncate = lambda x: int(str(int(x))[:9])

In [92]:
resetted = sehir_users.reset_index()
index = resetted.id.apply(truncate)
resetted.rename(columns={"id":"tw_id"}, inplace=True)
resetted.index = index

twitter_users = resetted
twitter_users.sample(5)

Unnamed: 0_level_0,tw_id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
601920351,601920400.0,Yusuf Ziya,yusufziyaaktas,tr,Yusuf Islam Akar,67.0,209.0,2191.0
359206390,359206400.0,M.Erbulmus,m_erbulmus,tr,Omer Gumus,70.0,386.0,533.0
111048322,111048300.0,astroloji günlükleri,twitastrolojii,tr,! Psikoloji,60.0,261.0,1510.0
940583073,9.405831e+17,ali,ali95095523,tr,! Kaib,45.0,0.0,105.0
482082228,482082200.0,Simon Baker,higherbaker,en,Omer Bakaner,61.0,5571.0,350.0


In [93]:
sehir_directory = pd.read_csv('../datasets/contacts.csv', 
                               encoding = "ISO-8859-1", 
                               usecols=['First Name', 'Last Name', 'Primary Email'])
sehir_directory.replace(np.nan, '', regex=True, inplace=True)

In [94]:
fullnames = [' '.join(first_last_name).lower() 
                 for first_last_name in sehir_directory[['First Name', 'Last Name']].values]

In [15]:
start = time.time()
sehir_matches = Parallel(n_jobs=-1)(delayed(get_matches_dataframe)(
    twitter_users[int(i*(len(twitter_users)/8)):int((i+1)*(len(twitter_users)/8))],
    sehir_directory) for i in range(8))
print("took: ", time.time()-start)





took:  486.2190420627594


In [99]:
sehir_matches_df = pd.concat(sehir_matches)
sehir_matches_df.index = range(len(sehir_matches_df))
print("There are {} matches".format(len(sehir_matches_df)))
sehir_matches_df.sample(5)

There are 20497 matches


Unnamed: 0,match_name,screen_name
7029,"[( ! soc-grads, 50)]",istanbul_foto
20257,"[(atakan cicek, 75)]",atakangider
4411,"[(fatma subasi, 50)]",abdullah29ats
19577,"[(baran yesilkaya, 65)]",YesilayBagcilar
2772,"[(aysegul simsek, 83)]",GulSims


In [100]:
sehir_matches_df['match_ratio'] = sehir_matches_df.match_name.apply(lambda x: x[0][1])
sehir_matches_df.match_name = sehir_matches_df.match_name.apply(lambda x: x[0][0])
sehir_matches_df.sample(5)

Unnamed: 0,match_name,screen_name,match_ratio
1624,valid hamza,HamzaValid,100
11903,islam najeh ali ahmed,ibodirector,62
12644,ayse ulu,yuksel8015,62
11902,global,Gzllba,67
20485,ramazan oduncu,rdurgut,79


In [101]:
tu=twitter_users.drop(labels=["match_name","match_ratio"], axis=1)

In [103]:
twitter_users = sehir_matches_df.merge(tu, on="screen_name")
index = twitter_users.tw_id.apply(truncate)
twitter_users["id"] = index
twitter_users.set_index("id", inplace=True)
twitter_users.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
869886905,sena sen,senackgzz_,67,8.698869e+17,Sena,tr,72.0,319.0
156001546,! tarih,tarihdergi,100,1560015000.0,#tarih,en-gb,145050.0,100.0
319300615,buket ergul,Buketim1903,67,319300600.0,Buket kırömeroğlu,tr,62.0,41.0
797971200,nur aydin,NurullahArdic,67,797971200.0,N. A R D I Ç,en,6312.0,453.0
912596191,semih deniz,apojiez,59,9.125962e+17,decolonizeyomind 🌹,en,42.0,603.0


In [104]:
twitter_users.to_csv("../datasets/twitter_users.csv", index_label="id")

In [105]:
len(twitter_users)

20497

In [106]:
filtered_twu = twitter_users[twitter_users.match_ratio>85]
filtered_twu.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
612776757,arafat gurbuz,gurbuzarafat,87,612776800.0,ArafatGürbüz,tr,46.0,102.0
331717933,ebubekir caglar,CaglarEbubekir,90,3317179000.0,Ebubekir Çağlar,tr,322.0,676.0
463648040,ozle cetinkaya,OzleCetinkaya,92,4636480000.0,Özle Çetinkaya,en,4.0,37.0
318416002,berna yilmaz,busrayilmaz326,87,3184160000.0,Büşra YILMAZ,tr,141.0,698.0
161800697,ahmed dirar gungordu,AD_gungordu,86,1618007000.0,ahmed dırar güngördü,tr,492.0,850.0


In [107]:
len(filtered_twu)

2084

## Construct the network

In [114]:
user_connections.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
5168,756236811190624257,106086098,{'2018.05.08': True}
15566,2805326734,3892757176,{'2018.05.08': True}
21088,1557759132,1110823566,{'2018.05.08': True}
9245,1222821175,106086098,{'2018.05.08': True}
26859,174415744,455903388,{'2018.05.08': True}


In [112]:
G = nx.DiGraph()
for _, row in user_connections.iterrows():
    from_ = truncate(row["from_user_id"])
    to = truncate(row["to_user_id"])
    if from_ in filtered_twu.index and to in filtered_twu.index:
        G.add_edge(from_, to)

In [68]:
augs = ["name", "screen_name","match_name", "followers_count","friends_count", "lang"]
for node in G.nodes():
    user = twitter_users.loc[node]
    for aug in augs:
        if type(user[aug])==str:
            m = clean(user[aug])
        else:
            m = user[aug]
        G.nodes[node][aug] = m

In [111]:
len(G.nodes())

0

In [54]:
len(G.edges())

2243

In [116]:
for ix,deg in G.degree(G.nodes()):
    G.node[ix]['degree'] = deg
    G.node[ix]['parity'] = (1-deg%2)
    
for ix,in_deg in G.in_degree(G.nodes()):
    G.node[ix]['in_degree'] = in_deg
    
for ix,out_deg in G.out_degree(G.nodes()):
    G.node[ix]['out_degree'] = out_deg

In [117]:
evc = nx.eigenvector_centrality(G)
closeness = nx.closeness_centrality(G)
betweenness = nx.betweenness_centrality(G)
pagerank = nx.pagerank(G)

In [118]:
metrics = {"eigenvector_centrality":evc,
           "closeness_centrality":closeness,
          "betweenness":betweenness,
          "pagerank":pagerank}

In [125]:
for metric_name, metric in metrics.items():
    for ix,v in metric.items():
        G.nodes[ix][metric_name] = v

In [130]:
G.nodes[595297280]

{'betweenness': 0.0,
 'closeness_centrality': 0.0,
 'degree': 1,
 'eigenvector_centrality': 8.290679621515326e-35,
 'followers_count': 71.0,
 'friends_count': 310.0,
 'in_degree': 0,
 'match_name': 'ayhan turkoglu',
 'name': 'talha turkoglu',
 'out_degree': 1,
 'pagerank': 0.0008750669358888037,
 'parity': 0,
 'screen_name': 'talha_turkoglu'}

In [120]:
list(G.nodes(data=True))[0]

(595297280,
 {'betweenness': 0.0,
  'closeness_centrality': 0.0,
  'degree': 1,
  'eigenvector_centrality': 8.290679621515326e-35,
  'followers_count': 71.0,
  'friends_count': 310.0,
  'in_degree': 0,
  'match_name': 'ayhan turkoglu',
  'name': 'talha turkoglu',
  'out_degree': 1,
  'pagerank': 0.0008750669358888037,
  'parity': 0,
  'screen_name': 'talha_turkoglu'})

In [122]:
import json
from networkx.readwrite import json_graph
data = nx.node_link_data(G)
with open('../REST/static/networks/twitter_users_graph2.json', 'w') as f:
    json.dump(data, f, indent=4)

## Calculating Homophily

In [75]:
def homophily(nw, metric="lang"):
    langs_probs = dict()
    for n in nw.nodes():
        user = nw.nodes[n]
        langs_probs.setdefault(user[metric], 0)
        langs_probs[user[metric]] += 1
    heterogeneity_fraction_norm = 1 - sum([(float(i)/len(nw.nodes()))**2 for i in langs_probs.values()])
    cross_edges = sum([int(nw.nodes[f][metric] != nw.nodes[t][metric] ) for f,t in nw.edges()])
    return cross_edges/float(len(nw.edges())), heterogeneity_fraction_norm

In [76]:
homophily(G)

(0.00490414623272403, 0.010531550932622258)