In [1]:
import psycopg2

import pandas as pd
import numpy as np

from copy import deepcopy

import ast
import random
import networkx as nx
import time, unicodedata
import itertools

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from joblib import Parallel, delayed

In [2]:
def clean(name, min_len=5, junk_replacement=''):
    try:
        cleaned = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').lower().decode("ascii")
    except TypeError:
        return junk_replacement
    if len(cleaned) < min_len:
        return junk_replacement
    return cleaned

def get_matches_edit_distance(item, choices, limit, scorer=fuzz.WRatio):
    return process.extract(item, choices, limit=limit, scorer=scorer)
counter = 0
def get_sehir_twitter_matches(twitter_users, sehir_directory, limit=1):
    global fullnames, counter
    twitter_user_by_screen_name = twitter_users.set_index('screen_name')
    start = time.time()
    for screen_name in twitter_users['screen_name']:
        twitter_name = twitter_user_by_screen_name.loc[screen_name]['name']
        match_name = get_matches_edit_distance(twitter_name, fullnames, limit)
        counter += 1
#         if counter %100 == 0:
#             print(counter, "out of ", len(twitter_users))
#             start_ = time.time()
#             print(start_-start, "seconds")
#             start = start_
        yield (screen_name, match_name)
        
def filter_matches_by_threshold(matches_dict, threshold=0):
    filtered_dict = dict()
    for screen_name, matches in matches_dict.items():
        filtered = [(match, score) for match, score in matches if score > threshold]
        
        if filtered:
            filtered_dict[screen_name] = filtered
        
    return filtered_dict

def get_matches_dataframe(twitter_users, sehir_directory, threshold=0, limit=1):
    matches = {screen_name : match_name for screen_name, match_name in 
               get_sehir_twitter_matches(twitter_users, sehir_directory, limit=limit)}
    
    filtered_matches = filter_matches_by_threshold(matches, threshold=threshold)
    screen_names = filtered_matches.keys()
    return pd.DataFrame({'screen_name': list(screen_names),
                         'match_name': [filtered_matches[screen_name] for screen_name in screen_names]})

In [3]:
connection = psycopg2.connect('dbname=link_formation host=localhost user=postgres password=1_sehir_1')

twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)

user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

In [4]:
twitter_users.sample(5)

Unnamed: 0,id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
7247,558574854,Gülhan Ertürk Akgül,GulhanErturk,tr,Gulden Arturk,72,160,801
453,982161549941915649,Cavit Oktay Elvan,ElvanCavit,tr,Elvan Cati,80,5,22
20288,1591696585,Seval Tuncer,svltncr,tr,Justin Crawford,64,58,175
12282,109989715,Metin Feyzioğlu,metinfeyzioglu,tr,Melih Terzioglu,69,2286654,8000
11499,444826898,Maltepe Eğitim,MaltepeEgitim,tr,! MLL,60,76,157


In [5]:
no_sehir = lambda x:"sehir" in clean(x)

In [6]:
twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)
twitter_users = twitter_users.where(twitter_users.match_name.str.len()>6)\
                             .dropna().set_index("id")
#         .where(twitter_users.match_ratio>85)
# .where(~twitter_users.name.str.contains("(?i)sehir"))\
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
456484000.0,aşir keçelioğlu,DrAsir19300,tr,Muhammed Rasid Gurler,50.0,547.0,1316.0
1668194000.0,Şehirian,Sehirian,en,Sehriban Bozan,79.0,3.0,5.0
7.702825e+17,Agenzia Traduzione,AGTraduElAmal,fr,Omar Abdelrahman Mohamed,62.0,55.0,570.0
2370130000.0,erol kartaloğlu,KartalogluE,tr,Elif Beyza Karaalioglu,74.0,62.0,867.0
247944600.0,elif gmslr,eeelliiffff,tr,! Isif,68.0,25.0,114.0


In [7]:
twitter_users["is_org"] = twitter_users.name.apply(no_sehir)
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
531442200.0,Gültekin,gltekin18,tr,Selahattin Gultekin,74.0,13.0,118.0,False
2155637000.0,Büşra Karakaş,bsraakarakas,tr,Busra Karakas,88.0,183.0,484.0,False
2332576000.0,volkan yayalar,vyayalar,tr,İnsan Kaynakları,68.0,36.0,290.0,False
9.391785e+17,Can Ökçü,Cank73236728,tr,AB7_7203 (72),50.0,5.0,80.0,False
9.362036e+17,Yasin Sakalli,yasin_sakalli,tr,Yasemin Kaleli,74.0,0.0,74.0,False


In [8]:
sehir_orgs = twitter_users[twitter_users.is_org==True].drop(labels=["is_org"], axis=1)
sehir_users = twitter_users[twitter_users.is_org==False].drop(labels=["is_org"], axis=1)

In [9]:
sehir_orgs.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2784234000.0,Şehir AGT,AGDSehirUniv,en,Istanbul Sehir Universitesi,68.0,134.0,162.0
1421121000.0,Şehir İşletme Bölümü,sehirisletme,tr,! Isletme,90.0,106.0,15.0
2863615000.0,SehirLi_Olmak,sehirli_7,tr,Muhammed Talha Yenisehirlioglu,70.0,0.0,13.0
8.108538e+17,AGT Şehir Hanımlar,agdsehir,tr,Sehir KAD,78.0,11.0,13.0
543398800.0,Şehir Düşünce Merkezi,sehirdusuncemer,tr,Sehir User1,69.0,1056.0,521.0


In [10]:
sehir_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8.094439e+17,Nesibe Kurtulmuş,NesibeKurtulmus,tr,Taha Enes Kurtulmus,71.0,86.0,759.0
2329385000.0,Muhabir TR,muhabirTR,tr,! Muhaberat,78.0,692.0,4905.0
73584820.0,Bülent Ayanoğlu,bulentayanoglu,tr,Bulent Sayan,77.0,103.0,406.0
3306650000.0,Açanal Planlama,AcanalPlanlama,tr,Canan Yilmaz,62.0,155.0,597.0
9.663714e+17,Hasan Basri Kökcü,hasanbasrikokcu,tr,Hasan Basri Teke,71.0,58.0,154.0


In [11]:
truncate = lambda x: int(str(int(x))[:9])

In [12]:
resetted = twitter_users.reset_index()
index = resetted.id.apply(truncate)
resetted.rename(columns={"id":"tw_id"}, inplace=True)
resetted.index = index

twitter_users = resetted
twitter_users.sample(5)

Unnamed: 0_level_0,tw_id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
170219937,1702199000.0,imarhukuku,imarhukuku_,tr,Ömer Faruk Çukur,64.0,116.0,358.0,False
277899672,277899700.0,Selver Türkay,selverturkay,en,Selcen Ozturkcan,64.0,62.0,249.0,False
408418666,4084187000.0,Muhtar,abdullahadem70,en,Abdullah Fadel,79.0,13.0,368.0,False
118398179,1183982000.0,IEEE-YÜ ÖĞRENCİ KOLU,IEEEYeditepe,tr,Emre Yildiztepe,67.0,570.0,187.0,False
233453615,2334536000.0,greywork,greywork_,tr,Irem Orak,56.0,10.0,3.0,False


In [13]:
sehir_directory = pd.read_csv('../datasets/contacts.csv', 
                               encoding = "ISO-8859-1", 
                               usecols=['First Name', 'Last Name', 'Primary Email'])
sehir_directory.replace(np.nan, '', regex=True, inplace=True)

In [14]:
fullnames = [' '.join(first_last_name).lower() 
                 for first_last_name in sehir_directory[['First Name', 'Last Name']].values]

In [15]:
start = time.time()
sehir_matches = Parallel(n_jobs=-1)(delayed(get_matches_dataframe)(
    twitter_users[int(i*(len(twitter_users)/8)):int((i+1)*(len(twitter_users)/8))],
    sehir_directory) for i in range(8))
print("took: ", time.time()-start)





took:  1959.82204246521


In [16]:
sehir_matches_df = pd.concat(sehir_matches)
sehir_matches_df.index = range(len(sehir_matches_df))
print("There are {} matches".format(len(sehir_matches_df)))
sehir_matches_df.sample(5)

There are 20674 matches


Unnamed: 0,match_name,screen_name
11836,"[( ismail kara, 90)]",ismail63206146
12481,"[( global, 67)]",Gzllba
18973,"[(ihsan f. i. albittar albittar, 86)]",ismaillugur
12728,"[( global, 90)]",urbanhist
20494,"[(ieee kulubu, 86)]",gtuieee


In [17]:
sehir_matches_df['match_ratio'] = sehir_matches_df.match_name.apply(lambda x: x[0][1])
sehir_matches_df.match_name = sehir_matches_df.match_name.apply(lambda x: x[0][0])
sehir_matches_df.sample(5)

Unnamed: 0,match_name,screen_name,match_ratio
6500,fatma neslihan tutuncu,FDurgungoz,86
7351,deniz haj abrahim,muhammathamza,86
18454,muge akarsu,bngisuab,90
16646,edanur saluk,eaudeluna,82
9375,eda kurt,Emineedahasta,90


In [18]:
tu=twitter_users.drop(labels=["match_name","match_ratio"], axis=1)

In [19]:
twitter_users = sehir_matches_df.merge(tu, on="screen_name")
index = twitter_users.tw_id.apply(truncate)
twitter_users["id"] = index
twitter_users.set_index("id", inplace=True)
twitter_users.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
345917380,mehmet baran85,hyrlisibeglm,67,345917400.0,EMBaran,tr,1410.0,211.0,False
95451420,rukiye ozturk,yesimgizer,68,95451420.0,yeşo,tr,473.0,1637.0,False
748087265,zulal icoz,ilci_zulal,74,7.480873e+17,Zülal Ilci,en,9.0,61.0,False
388385729,fatma eslem akbiyik,eslemuzunkaya,86,388385700.0,eslem yıldız,tr,8.0,245.0,False
294990622,yasemin atagul,YSMNSLK,90,294990600.0,yasemin,tr,162.0,605.0,False


In [20]:
twitter_users.to_csv("../datasets/twitter_users.csv", index_label="id")

In [15]:
twitter_users = pd.read_csv("../datasets/twitter_users.csv", index_col="id")
twitter_users.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
933924224,murat dogruel,muratcelik8925,90,9.339242e+17,Murat,tr,3.0,71.0,False
367288942,huseyin bayar,huseynbayar,96,367288900.0,Hüseyin Bayar,tr,21.0,32.0,False
284399096,zeynep yilmaz,Zeynep10201,92,2843991000.0,Zeynep Yılmaz,tr,11.0,48.0,False
895191864,murat er,muratdergo,88,895191900.0,Murat Es,en,141.0,988.0,False
119642728,omer bahca,omeres,90,119642700.0,omer,tr,6.0,32.0,False


In [16]:
filtered_twu = twitter_users[twitter_users.match_ratio>86]
filtered_twu.to_csv("../datasets/twitter_users.csv", index_label="id")

In [17]:
len(twitter_users), len(filtered_twu)

(4509, 4509)

In [19]:
filtered_twu[filtered_twu.is_org==True]

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
111082356,sehir sehir,sehirlibrary,95,1110824000.0,Sehir Library,tr,911.0,43.0,True
106086098,åehir ãniversitesi,SehirUniversite,94,106086100.0,ŞEHİR Üniversitesi,tr,12195.0,166.0,True
819269230,sehir sehir,sehir_alumni,95,8.192692e+17,Sehir Alumni,en,275.0,5.0,True
122249140,sehir fm,sehir_fm,88,1222491000.0,Şehir FM,tr,1018.0,85.0,True
841682908,! sehir kariyer fest,sehircareerfest,94,8.416829e+17,Şehir Kariyer Fest,tr,241.0,220.0,True
153699537,! sosyoloji,socsehir,90,1536995000.0,ŞEHİR Sosyoloji,en,2992.0,39.0,True
411742296,! felsefe,SehirFelsefe,90,4117423000.0,ŞEHİR Felsefe Bölümü,tr,177.0,44.0,True
847741204,sehir cycling club,SehirCycling,94,8.477412e+17,Şehir Cycling Club,tr,123.0,92.0,True
174415744,sehir sehir,sehiredebiyat,95,174415700.0,sehir edebiyat,en,2070.0,287.0,True
826846007,sehir atolye,sehiratolye,87,8.26846e+17,Şehir Atölye,tr,155.0,38.0,True


In [20]:
filtered_twu.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
837023808,ahmet taha celik,tahacelik0,97,8.370238e+17,Ahmet Taha Çelik,tr,94.0,156.0,False
224370020,muhammet ali demir,MuhammetCorekci,90,224370000.0,Muhammet,en,118.0,4995.0,False
764867768,edanur develi,EdaDeveli2,87,7.648678e+17,Eda Develi,tr,5.0,153.0,False
714581381,hatice aygen,AygenderyaAygen,90,714581400.0,Aygen,tr,265.0,1145.0,False
288922401,bt-takvim,ahmedkandemir,90,288922400.0,AK,tr,347.0,1323.0,False


In [21]:
filtered_twu[filtered_twu.match_name.str.contains("ammar")]

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
972967833,ammar yasir tikac,AMMAR29781301,90,9.729678e+17,AMMAR,en,2.0,12.0,False
291122559,ammar rasid,AmmarRashed_MB,87,291122600.0,Ammar Rashed,en,385.0,335.0,False
298110524,ammar yasir tikac,ammarnatouf,90,2981105000.0,ammar,ar,19.0,203.0,False
986265062,osamah al-ghammari,Sam_Al_Ghammari,91,986265100.0,Sam Al-Ghammari,en,1071.0,167.0,False
883352404,ammar yasir tikac,AMMAR77891998,90,8.833524e+17,AMMAR,ar,23.0,37.0,False


# Random Connections

In [22]:
from datetime import datetime

In [23]:
def present_in_date(changes_dates, queried_date):
    """
    changes_dates = {d1:True, d2:False, d3:True} connection added or removed
    """
    if changes_dates:
        str2date = lambda strdate: datetime.strptime(strdate, '%Y.%m.%d')  # 2018.05.08
        changes = sorted(changes_dates,key=lambda d: str2date(d))
        queried_date = datetime.strptime(queried_date, '%Y.%m.%d')
        present = False
        for d in changes:
            if queried_date < str2date(d):
                break
            present = changes_dates[d]
        return present
    else:
        return False

In [87]:
ppp = pd.DataFrame.from_dict({0:[1,2,[3]],1:[4,5,[6]]},orient='index')
ppp.columns=list("abc")
ppp

Unnamed: 0,a,b,c
0,1,2,[3]
1,4,5,[6]


In [88]:
for i,k in enumerate(cons_dict):
    print(i,k)
    break

0 (552748176, 981894449)


In [91]:
def connections_dict2df(cons):
    d = {i:[k[0],k[1], cons[k]] for i,k in enumerate(cons)}
    df = pd.DataFrame.from_dict(d, orient='index')
    df.columns=["from_","to","dates"]
    return df

In [66]:
def generate_cons_dict(nodes, edges_per_date=None, portion=2.5, dates=["2018.05.%02d"%x for x in range(1,31)], remove_edge_prob=0.5):
    random.shuffle(dates)
    start = time.time()
    if edges_per_date is None:
        edges_per_date = int(len(filtered_twu)//portion)
        
    random_connections = dict()  # {(from, to):{date1:True, date2:False, ...}}
    connection_indices = {}
    
    for d in dates:
        for i in range(edges_per_date):
            from_ = nodes.sample(1).index[0]
            to = nodes.sample(1).index[0]
            if from_!=to:
                random_connections.setdefault((from_, to), {})
                das = random_connections[(from_, to)]
                if present_in_date(das, d):
                    if np.random.random()>remove_edge_prob:
                        random_connections[(from_, to)][d] = False
                else:
                    random_connections[(from_, to)][d] = True
    
    print("took: ",time.time()-start)
    return random_connections

In [70]:
cons_dict = generate_cons_dict(filtered_twu)

took:  36.006311655044556


In [71]:
len(cons_dict)

53984

In [92]:
start = time.time()
random_connections = connections_dict2df(cons_dict)
print("took:", time.time()-start)
random_connections.to_csv("../datasets/random_connections.csv")
random_connections.head(5)

took: 0.09277224540710449


Unnamed: 0,from_,to,dates
39435,261372654,916911351,{'2018.05.06': True}
15496,246775947,780137530,{'2018.05.17': True}
16907,252265677,379866490,{'2018.05.18': True}
38763,156128304,193458531,{'2018.05.15': True}
38837,312017848,325521030,{'2018.05.07': True}


In [98]:
random_connections.loc[236]

from_                                    896317837
to                                       496771843
dates    {'2018.05.15': False, '2018.05.04': True}
Name: 236, dtype: object

## Construct the network

In [114]:
user_connections.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
5168,756236811190624257,106086098,{'2018.05.08': True}
15566,2805326734,3892757176,{'2018.05.08': True}
21088,1557759132,1110823566,{'2018.05.08': True}
9245,1222821175,106086098,{'2018.05.08': True}
26859,174415744,455903388,{'2018.05.08': True}


In [112]:
G = nx.DiGraph()
for _, row in user_connections.iterrows():
    from_ = truncate(row["from_user_id"])
    to = truncate(row["to_user_id"])
    if from_ in filtered_twu.index and to in filtered_twu.index:
        G.add_edge(from_, to)

In [68]:
augs = ["name", "screen_name","match_name", "followers_count","friends_count", "lang"]
for node in G.nodes():
    user = twitter_users.loc[node]
    for aug in augs:
        if type(user[aug])==str:
            m = clean(user[aug])
        else:
            m = user[aug]
        G.nodes[node][aug] = m

In [115]:
len(G.nodes())

0

In [54]:
len(G.edges())

2243

In [116]:
for ix,deg in G.degree(G.nodes()):
    G.node[ix]['degree'] = deg
    G.node[ix]['parity'] = (1-deg%2)
    
for ix,in_deg in G.in_degree(G.nodes()):
    G.node[ix]['in_degree'] = in_deg
    
for ix,out_deg in G.out_degree(G.nodes()):
    G.node[ix]['out_degree'] = out_deg

In [117]:
evc = nx.eigenvector_centrality(G)
closeness = nx.closeness_centrality(G)
betweenness = nx.betweenness_centrality(G)
pagerank = nx.pagerank(G)

In [118]:
metrics = {"eigenvector_centrality":evc,
           "closeness_centrality":closeness,
          "betweenness":betweenness,
          "pagerank":pagerank}

In [125]:
for metric_name, metric in metrics.items():
    for ix,v in metric.items():
        G.nodes[ix][metric_name] = v

In [130]:
G.nodes[595297280]

{'betweenness': 0.0,
 'closeness_centrality': 0.0,
 'degree': 1,
 'eigenvector_centrality': 8.290679621515326e-35,
 'followers_count': 71.0,
 'friends_count': 310.0,
 'in_degree': 0,
 'match_name': 'ayhan turkoglu',
 'name': 'talha turkoglu',
 'out_degree': 1,
 'pagerank': 0.0008750669358888037,
 'parity': 0,
 'screen_name': 'talha_turkoglu'}

In [120]:
list(G.nodes(data=True))[0]

(595297280,
 {'betweenness': 0.0,
  'closeness_centrality': 0.0,
  'degree': 1,
  'eigenvector_centrality': 8.290679621515326e-35,
  'followers_count': 71.0,
  'friends_count': 310.0,
  'in_degree': 0,
  'match_name': 'ayhan turkoglu',
  'name': 'talha turkoglu',
  'out_degree': 1,
  'pagerank': 0.0008750669358888037,
  'parity': 0,
  'screen_name': 'talha_turkoglu'})

In [122]:
import json
from networkx.readwrite import json_graph
data = nx.node_link_data(G)
with open('../REST/static/networks/twitter_users_graph2.json', 'w') as f:
    json.dump(data, f, indent=4)

## Calculating Homophily

In [75]:
def homophily(nw, metric="lang"):
    langs_probs = dict()
    for n in nw.nodes():
        user = nw.nodes[n]
        langs_probs.setdefault(user[metric], 0)
        langs_probs[user[metric]] += 1
    heterogeneity_fraction_norm = 1 - sum(
        [(float(i)/len(nw.nodes()))**2 for i in langs_probs.values()])
    cross_edges = sum(
        [int(nw.nodes[f][metric] != nw.nodes[t][metric] ) for f,t in nw.edges()])
    return cross_edges/float(len(nw.edges())), heterogeneity_fraction_norm

In [76]:
homophily(G)

(0.00490414623272403, 0.010531550932622258)