In [1]:
import psycopg2

import pandas as pd
import numpy as np

from copy import deepcopy

import ast
import random
import networkx as nx
import time, unicodedata
import itertools

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from joblib import Parallel, delayed

In [2]:
def clean(name, min_len=5, junk_replacement=''):
    try:
        cleaned = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').lower().decode("ascii")
    except TypeError:
        return junk_replacement
    if len(cleaned) < min_len:
        return junk_replacement
    return cleaned

def get_matches_edit_distance(item, choices, limit, scorer=fuzz.WRatio):
    return process.extract(item, choices, limit=limit, scorer=scorer)
counter = 0
def get_sehir_twitter_matches(twitter_users, sehir_directory, limit=1):
    global fullnames, counter
    twitter_user_by_screen_name = twitter_users.set_index('screen_name')
    start = time.time()
    for screen_name in twitter_users['screen_name']:
        twitter_name = twitter_user_by_screen_name.loc[screen_name]['name']
        match_name = get_matches_edit_distance(twitter_name, fullnames, limit)
        counter += 1
#         if counter %100 == 0:
#             print(counter, "out of ", len(twitter_users))
#             start_ = time.time()
#             print(start_-start, "seconds")
#             start = start_
        yield (screen_name, match_name)
        
def filter_matches_by_threshold(matches_dict, threshold=0):
    filtered_dict = dict()
    for screen_name, matches in matches_dict.items():
        filtered = [(match, score) for match, score in matches if score > threshold]
        
        if filtered:
            filtered_dict[screen_name] = filtered
        
    return filtered_dict

def get_matches_dataframe(twitter_users, sehir_directory, threshold=0, limit=1):
    matches = {screen_name : match_name for screen_name, match_name in 
               get_sehir_twitter_matches(twitter_users, sehir_directory, limit=limit)}
    
    filtered_matches = filter_matches_by_threshold(matches, threshold=threshold)
    screen_names = filtered_matches.keys()
    return pd.DataFrame({'screen_name': list(screen_names),
                         'match_name': [filtered_matches[screen_name] for screen_name in screen_names]})

In [3]:
connection = psycopg2.connect('dbname=link_formation host=localhost user=postgres password=1_sehir_1')

twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)

user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

In [4]:
twitter_users.sample(5)

Unnamed: 0,id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
23869,575184191,Gamze Türk,Gaamzetrk,tr,Gamze Basturk,73,28,442
74,4091594428,Şehir Icebreakers,sehiricebreaker,en,Sehir Icebreakers Club,81,112,53
5339,1913211822,İAÜ,iaukahvesi,tr,Ches,68,1125,1918
21907,921108116770623488,Ramazan araz,Tatar_4472,tr,! Tarih,54,83,544
16052,1098804811,chori,ZeynebKaygusuz,tr,BK,90,21,261


In [5]:
no_sehir = lambda x:"sehir" in clean(x)

In [6]:
twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)
twitter_users = twitter_users.where(twitter_users.match_name.str.len()>6)\
                             .dropna().set_index("id")
#         .where(twitter_users.match_ratio>85)
# .where(~twitter_users.name.str.contains("(?i)sehir"))\
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2166401000.0,ysfیوسف صیپاحی,ysufsph,tr,Yusuf Sahan,67.0,22.0,253.0
489637800.0,fasogbon timothy,fash_tee,en,Ahmad Rasheed,60.0,155.0,1200.0
8.466964e+17,Ebru Nisa,ebrunisakoc,tr,Melisa Koc,67.0,1.0,533.0
53357040.0,Onur Uysal,onur_uysal,en,Zeynep Binnur Unsal,63.0,2139.0,1597.0
472149200.0,ahmet öner,ahmetoner01,tr,Ahmet Er,74.0,264.0,845.0


In [7]:
twitter_users["is_org"] = twitter_users.name.apply(no_sehir)
twitter_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
8.332109e+17,Ceren Akbulut,zcerenakbulut,tr,Ceren Akbulut,92.0,54.0,136.0,False
376971400.0,Elif,elifhande_,tr,Elif Hande Hantumanli,81.0,66.0,100.0,False
9.196685e+17,Rewi,rewibu,tr,Jonathan Andrew Bull,75.0,0.0,306.0,False
4868439000.0,muttakii,muttaakii,tr,Mustafa Kirac,64.0,0.0,193.0,False
252966000.0,MustafaCRK,mustafacuruk18,en,Mustafa Durdu,74.0,103.0,209.0,False


In [8]:
sehir_orgs = twitter_users[twitter_users.is_org==True].drop(labels=["is_org"], axis=1)
sehir_users = twitter_users[twitter_users.is_org==False].drop(labels=["is_org"], axis=1)

In [9]:
sehir_orgs.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3065661000.0,Şehir Beyaz Hareket,sehirbeyazhd,tr,Sehir MBA,67.0,7.0,26.0
1599744000.0,ŞEHİR TTO,sehirtto,tr,Sehir Go,75.0,2426.0,448.0
9.706433e+17,Şehir Mentorluk,SehirMentor,tr,Sehir Mun,70.0,23.0,64.0
3350434000.0,Ataşehir Yeşilay,atasehiryesilay,tr,Elif Yesil,63.0,727.0,4980.0
8.111739e+17,ŞEHİR İYBF,SehirEntp,tr,! ENTP,90.0,5.0,14.0


In [10]:
sehir_users.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1886790000.0,ceyhun okutan,cyhn_oktn,tr,Ceyhun Okutan,73.0,104.0,639.0
2219466000.0,yildizonu,yildizonu,en,Olcay Akyildiz,72.0,46.0,234.0
7.323103e+17,Elif Talu,talu4183,tr,AB4_4103,50.0,0.0,24.0
2189985000.0,Hüseyin Koca,HasaneynKoca,tr,Hasan Kocaturk,69.0,155.0,435.0
893733600.0,Günce bakal,guncelli,tr,Mustafa Gunel,69.0,612.0,991.0


In [11]:
truncate = lambda x: int(str(int(x))[:9])

In [12]:
resetted = twitter_users.reset_index()
index = resetted.id.apply(truncate)
resetted.rename(columns={"id":"tw_id"}, inplace=True)
resetted.index = index

twitter_users = resetted
twitter_users.sample(5)

Unnamed: 0_level_0,tw_id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
115967045,1159670000.0,.,ortalamabirkul,tr,Farabi -,60.0,155.0,1003.0,False
889520086,8.895201e+17,Eğitim Neferi,andirgiracueyt,tr,Kadir Kir,60.0,32.0,315.0,False
215925090,2159251000.0,Derya,ToygarDerya,tr,! Arge,68.0,72.0,2226.0,False
846204973,846205000.0,Salih Ecevit,salihecevit,tr,Muhammed Salih Ecevit,82.0,78.0,94.0,False
432872372,432872400.0,emine,anka_carpediem,tr,! Arge,68.0,49.0,121.0,False


In [13]:
sehir_directory = pd.read_csv('../datasets/contacts.csv', 
                               encoding = "ISO-8859-1", 
                               usecols=['First Name', 'Last Name', 'Primary Email'])
sehir_directory.replace(np.nan, '', regex=True, inplace=True)

In [14]:
fullnames = [' '.join(first_last_name).lower() 
                 for first_last_name in sehir_directory[['First Name', 'Last Name']].values]

In [15]:
start = time.time()
sehir_matches = Parallel(n_jobs=-1)(delayed(get_matches_dataframe)(
    twitter_users[int(i*(len(twitter_users)/8)):int((i+1)*(len(twitter_users)/8))],
    sehir_directory) for i in range(8))
print("took: ", time.time()-start)





took:  1959.82204246521


In [16]:
sehir_matches_df = pd.concat(sehir_matches)
sehir_matches_df.index = range(len(sehir_matches_df))
print("There are {} matches".format(len(sehir_matches_df)))
sehir_matches_df.sample(5)

There are 20674 matches


Unnamed: 0,match_name,screen_name
11836,"[( ismail kara, 90)]",ismail63206146
12481,"[( global, 67)]",Gzllba
18973,"[(ihsan f. i. albittar albittar, 86)]",ismaillugur
12728,"[( global, 90)]",urbanhist
20494,"[(ieee kulubu, 86)]",gtuieee


In [17]:
sehir_matches_df['match_ratio'] = sehir_matches_df.match_name.apply(lambda x: x[0][1])
sehir_matches_df.match_name = sehir_matches_df.match_name.apply(lambda x: x[0][0])
sehir_matches_df.sample(5)

Unnamed: 0,match_name,screen_name,match_ratio
6500,fatma neslihan tutuncu,FDurgungoz,86
7351,deniz haj abrahim,muhammathamza,86
18454,muge akarsu,bngisuab,90
16646,edanur saluk,eaudeluna,82
9375,eda kurt,Emineedahasta,90


In [18]:
tu=twitter_users.drop(labels=["match_name","match_ratio"], axis=1)

In [19]:
twitter_users = sehir_matches_df.merge(tu, on="screen_name")
index = twitter_users.tw_id.apply(truncate)
twitter_users["id"] = index
twitter_users.set_index("id", inplace=True)
twitter_users.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
345917380,mehmet baran85,hyrlisibeglm,67,345917400.0,EMBaran,tr,1410.0,211.0,False
95451420,rukiye ozturk,yesimgizer,68,95451420.0,yeşo,tr,473.0,1637.0,False
748087265,zulal icoz,ilci_zulal,74,7.480873e+17,Zülal Ilci,en,9.0,61.0,False
388385729,fatma eslem akbiyik,eslemuzunkaya,86,388385700.0,eslem yıldız,tr,8.0,245.0,False
294990622,yasemin atagul,YSMNSLK,90,294990600.0,yasemin,tr,162.0,605.0,False


In [20]:
twitter_users.to_csv("../datasets/twitter_users.csv", index_label="id")

In [42]:
filtered_twu = twitter_users[twitter_users.match_ratio>86]
filtered_twu.to_csv("../datasets/twitter_users.csv", index_label="id")

In [43]:
len(twitter_users), len(filtered_twu)

(20674, 4509)

In [44]:
filtered_twu.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
309935068,bunyamin cansev,BnyaminCansev,97,309935100.0,bünyamin cansev,tr,4.0,20.0,False
721650616,melisa akdag,melisa_sumeyye,90,7.216506e+17,Melisa,tr,2.0,81.0,False
174788643,ferhat ozgur catak,ferpznhat,90,1747886000.0,Ferhat,tr,366.0,986.0,False
67366087,! felsefe,IUFelsefe,90,67366090.0,İstanbul Felsefe,tr,7405.0,262.0,False
724253103,nur betã¼l yerli,bacimbilegin,90,7.242531e+17,Betül,tr,2.0,211.0,False


In [45]:
filtered_twu[filtered_twu.match_name.str.contains("ammar")]

Unnamed: 0_level_0,match_name,screen_name,match_ratio,tw_id,name,lang,followers_count,friends_count,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
972967833,ammar yasir tikac,AMMAR29781301,90,9.729678e+17,AMMAR,en,2.0,12.0,False
291122559,ammar rasid,AmmarRashed_MB,87,291122600.0,Ammar Rashed,en,385.0,335.0,False
298110524,ammar yasir tikac,ammarnatouf,90,2981105000.0,ammar,ar,19.0,203.0,False
986265062,osamah al-ghammari,Sam_Al_Ghammari,91,986265100.0,Sam Al-Ghammari,en,1071.0,167.0,False
883352404,ammar yasir tikac,AMMAR77891998,90,8.833524e+17,AMMAR,ar,23.0,37.0,False


# Random Connections

In [64]:
from datetime import datetime

In [88]:
def present_in_date(changes_dates, queried_date):
    """
    changes_dates = {d1:True, d2:False, d3:True} connection added or removed
    """
    if changes_dates:
        str2date = lambda strdate: datetime.strptime(strdate, '%Y.%m.%d')  # 2018.05.08
        changes = sorted(changes_dates,key=lambda d: str2date(d))
        queried_date = datetime.strptime(queried_date, '%Y.%m.%d')
        present = False
        for d in changes:
            if queried_date < str2date(d):
                break
            present = changes_dates[d]
        return present
    else:
        return False

In [123]:
start = time.time()
num_edges = len(filtered_twu)
random_connections = pd.DataFrame(columns=["from_","to","dates"])

dates = ["2018.05.%02d"%x for x in range(1,31)]
random.shuffle(dates)
for d in range(len(dates)):
    for i in range(400):
        from_ = filtered_twu.sample(1).index[0]
        to = filtered_twu.sample(1).index[0]
        if from_!=to:
            q = random_connections[(random_connections.from_==from_) & (random_connections.to==to)]
            if len(q):
                for j in q.index:
                    da = random_connections.loc[j].dates
                    if present_in_date(da, dates[d]) and random.random()>0.5:
                        da [dates[d]]=False
                        random_connections.loc[j].dates = da
            else:
                random_connections = random_connections.append({"from_":from_, "to":to,"dates":{dates[d]:True}}, ignore_index=True)
print("took: ",time.time()-start)

took:  63.32909822463989


In [124]:
random_connections.to_csv("../datasets/random_connections.csv")
random_connections.sample(5)

Unnamed: 0,from_,to,dates
5598,246696613,293105127,{'2018.05.06': True}
953,626295637,28935945,{'2018.05.11': True}
7688,323790636,426075771,{'2018.05.22': True}
10602,771447100,749325615,{'2018.05.29': True}
9386,220304993,159540186,{'2018.05.09': True}


In [125]:
for _,i in random_connections.iterrows():
    if len(i.dates)>1:
        print(i)

## Construct the network

In [114]:
user_connections.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
5168,756236811190624257,106086098,{'2018.05.08': True}
15566,2805326734,3892757176,{'2018.05.08': True}
21088,1557759132,1110823566,{'2018.05.08': True}
9245,1222821175,106086098,{'2018.05.08': True}
26859,174415744,455903388,{'2018.05.08': True}


In [112]:
G = nx.DiGraph()
for _, row in user_connections.iterrows():
    from_ = truncate(row["from_user_id"])
    to = truncate(row["to_user_id"])
    if from_ in filtered_twu.index and to in filtered_twu.index:
        G.add_edge(from_, to)

In [68]:
augs = ["name", "screen_name","match_name", "followers_count","friends_count", "lang"]
for node in G.nodes():
    user = twitter_users.loc[node]
    for aug in augs:
        if type(user[aug])==str:
            m = clean(user[aug])
        else:
            m = user[aug]
        G.nodes[node][aug] = m

In [115]:
len(G.nodes())

0

In [54]:
len(G.edges())

2243

In [116]:
for ix,deg in G.degree(G.nodes()):
    G.node[ix]['degree'] = deg
    G.node[ix]['parity'] = (1-deg%2)
    
for ix,in_deg in G.in_degree(G.nodes()):
    G.node[ix]['in_degree'] = in_deg
    
for ix,out_deg in G.out_degree(G.nodes()):
    G.node[ix]['out_degree'] = out_deg

In [117]:
evc = nx.eigenvector_centrality(G)
closeness = nx.closeness_centrality(G)
betweenness = nx.betweenness_centrality(G)
pagerank = nx.pagerank(G)

In [118]:
metrics = {"eigenvector_centrality":evc,
           "closeness_centrality":closeness,
          "betweenness":betweenness,
          "pagerank":pagerank}

In [125]:
for metric_name, metric in metrics.items():
    for ix,v in metric.items():
        G.nodes[ix][metric_name] = v

In [130]:
G.nodes[595297280]

{'betweenness': 0.0,
 'closeness_centrality': 0.0,
 'degree': 1,
 'eigenvector_centrality': 8.290679621515326e-35,
 'followers_count': 71.0,
 'friends_count': 310.0,
 'in_degree': 0,
 'match_name': 'ayhan turkoglu',
 'name': 'talha turkoglu',
 'out_degree': 1,
 'pagerank': 0.0008750669358888037,
 'parity': 0,
 'screen_name': 'talha_turkoglu'}

In [120]:
list(G.nodes(data=True))[0]

(595297280,
 {'betweenness': 0.0,
  'closeness_centrality': 0.0,
  'degree': 1,
  'eigenvector_centrality': 8.290679621515326e-35,
  'followers_count': 71.0,
  'friends_count': 310.0,
  'in_degree': 0,
  'match_name': 'ayhan turkoglu',
  'name': 'talha turkoglu',
  'out_degree': 1,
  'pagerank': 0.0008750669358888037,
  'parity': 0,
  'screen_name': 'talha_turkoglu'})

In [122]:
import json
from networkx.readwrite import json_graph
data = nx.node_link_data(G)
with open('../REST/static/networks/twitter_users_graph2.json', 'w') as f:
    json.dump(data, f, indent=4)

## Calculating Homophily

In [75]:
def homophily(nw, metric="lang"):
    langs_probs = dict()
    for n in nw.nodes():
        user = nw.nodes[n]
        langs_probs.setdefault(user[metric], 0)
        langs_probs[user[metric]] += 1
    heterogeneity_fraction_norm = 1 - sum(
        [(float(i)/len(nw.nodes()))**2 for i in langs_probs.values()])
    cross_edges = sum(
        [int(nw.nodes[f][metric] != nw.nodes[t][metric] ) for f,t in nw.edges()])
    return cross_edges/float(len(nw.edges())), heterogeneity_fraction_norm

In [76]:
homophily(G)

(0.00490414623272403, 0.010531550932622258)