In [1]:
import psycopg2

import pandas as pd
import numpy as np

from copy import deepcopy

import ast
import random
import networkx as nx
import time, unicodedata
import itertools

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from joblib import Parallel, delayed

In [2]:
def clean(name, min_len=5, junk_replacement=''):
    try:
        cleaned = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').lower().decode("ascii")
    except TypeError:
        return junk_replacement
    if len(cleaned) < min_len:
        return junk_replacement
    return cleaned

def get_matches_edit_distance(item, choices, limit, scorer=fuzz.WRatio):
    return process.extract(item, choices, limit=limit, scorer=scorer)
counter = 0
def get_sehir_twitter_matches(twitter_users, sehir_directory, limit=1):
    global fullnames, counter
    twitter_user_by_screen_name = twitter_users.set_index('screen_name')
    start = time.time()
    for screen_name in twitter_users['screen_name']:
        twitter_name = twitter_user_by_screen_name.loc[screen_name]['name']
        match_name = get_matches_edit_distance(twitter_name, fullnames, limit)
        counter += 1
#         if counter %100 == 0:
#             print(counter, "out of ", len(twitter_users))
#             start_ = time.time()
#             print(start_-start, "seconds")
#             start = start_
        yield (screen_name, match_name)
        
def filter_matches_by_threshold(matches_dict, threshold=70):
    filtered_dict = dict()
    for screen_name, matches in matches_dict.items():
        filtered = [(match, score) for match, score in matches if score > threshold]
        
        if filtered:
            filtered_dict[screen_name] = filtered
        
    return filtered_dict

def get_matches_dataframe(twitter_users, sehir_directory, threshold=70, limit=1):
    matches = {screen_name : match_name for screen_name, match_name in 
               get_sehir_twitter_matches(twitter_users, sehir_directory, limit=limit)}
    
    filtered_matches = filter_matches_by_threshold(matches, threshold=threshold)
    screen_names = filtered_matches.keys()
    return pd.DataFrame({'screen_name': list(screen_names),
                         'match_name': [filtered_matches[screen_name] for screen_name in screen_names]})

In [3]:
connection = psycopg2.connect('dbname=link_formation host=localhost user=postgres password=1_sehir_1')

user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

In [4]:
truncate = lambda x: int(str(int(x))[:9])

twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)
twitter_users = twitter_users.where(twitter_users.match_name.str.len()>6)\
                             .dropna()
twitter_users["truncated_id"] = twitter_users.id.apply(truncate)

#         .where(twitter_users.match_ratio>85)
# .where(~twitter_users.name.str.contains("(?i)sehir"))\
twitter_users.sample(5)

Unnamed: 0,id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,truncated_id
2774,9.206008e+17,Meryem Selva,meryem_selva,tr,Meryem Selva Ince,95.0,65.0,341.0,920600836
40547,2589457000.0,Tacettin Serkaya.,tacettin_sertka,tr,Ali Cetinkaya,69.0,4.0,136.0,258945691
5468,189881300.0,Merve Sancaktar,sancaktarmerve,tr,Merve Sancaktar,100.0,147.0,150.0,189881254
11955,62495280.0,asım devran,asimdevran,en,Burhan Devran,71.0,39.0,823.0,62495284
33026,7.667776e+17,Kadir Aşkın,kkadiraskin,tr,Kadir San,89.0,30.0,482.0,766777552


In [5]:
is_org = lambda x:"sehir" in clean(x)
twitter_users["is_org"] = twitter_users.screen_name.apply(is_org)
twitter_users.sample(5)

Unnamed: 0,id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,truncated_id,is_org
7321,93385890.0,Nezih Erdogan,neziherdogan,en,Nezih Erdogan,100.0,484.0,243.0,93385894,False
49239,9.073842e+17,Celal Eren Çelik,yazparov,tr,Eren Yener,86.0,28459.0,7580.0,907384171,False
8534,111009300.0,ALİ MERCAN,mercanalim,en,Ali Ihsan Aydin,86.0,69.0,104.0,111009344,False
25454,8.194892e+17,Derin,nedemekderin,tr,Feyyaz Derinoglu,90.0,37432.0,31476.0,819489187,False
44961,24862760.0,BPS Research Digest,ResearchDigest,en,! Psy Search,72.0,80253.0,2086.0,24862758,False


In [6]:
sehir_orgs = twitter_users[twitter_users.is_org==True].set_index("id")
sehir_users = twitter_users[twitter_users.is_org==False]

In [7]:
len(sehir_orgs)

231

In [8]:
sehir_orgs.sample(5)

Unnamed: 0_level_0,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,truncated_id,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
450639500.0,Şehir Politika,sehirpolitika,en,Şehir Üniversitesi (İnsan ve Toplum Bilimleri...,86.0,877.0,817.0,450639507,True
2734041000.0,Şehir SBE,SehirSBE,tr,Şehir Üniversitesi (İnsan ve Toplum Bilimleri...,86.0,502.0,16.0,273404088,True
625463000.0,Şehir Sosyal Medya,sehirsosyal,tr,Şehir Üniversitesi (İnsan ve Toplum Bilimleri...,86.0,2240.0,1546.0,625463005,True
158316700.0,Semih BİLGE,eskisehir196526,tr,Efe Semih Demirtas,86.0,1465.0,1345.0,158316749,True
3334986000.0,Lord Varys,sehirveitiraf,tr,Selda Ünvar Yılmaz,63.0,7.0,78.0,333498586,True


In [9]:
sehir_users.head(5)

Unnamed: 0,id,name,screen_name,lang,match_name,match_ratio,followers_count,friends_count,truncated_id,is_org
1,567090000.0,Övünç Meriç,ovuncmeric,tr,Kardelen Meric,77.0,569.0,1170.0,567090020,False
4,726207600.0,klasik,klasikyayinlari,tr,Yavuz Kasikci,75.0,6596.0,142.0,726207614,False
5,497942800.0,Küre Yayınları,kureyayinlari,tr,Merve Yakinlar,72.0,9158.0,166.0,497942798,False
6,2674867000.0,Fatıma Tuba Yaylacı,fatimatubapetek,en,Fatima Tuba Yaylaci,94.0,844.0,246.0,267486658,False
7,1439589000.0,Mahmut Koca,mkoca66,en,Mahmut Koca,100.0,1379.0,44.0,143958858,False


In [10]:
sehir_directory = pd.read_csv('../datasets/contacts.csv', 
                               encoding = "ISO-8859-1", 
                               usecols=['First Name', 'Last Name', 'Primary Email'])
sehir_directory.replace(np.nan, '', regex=True, inplace=True)

In [11]:
fullnames = [' '.join(first_last_name).lower() 
                 for first_last_name in sehir_directory[['First Name', 'Last Name']].values]

In [None]:
start = time.time()
sehir_matches = Parallel(n_jobs=-1)(delayed(get_matches_dataframe)(
    sehir_users[int(i*(len(sehir_users)/8)):int((i+1)*(len(sehir_users)/8))],
    sehir_directory) for i in range(8))
print("took: ", time.time()-start)

In [None]:
sehir_matches_df = pd.concat(sehir_matches)
sehir_matches_df.index = range(len(sehir_matches_df))
print("There are {} matches".format(len(sehir_matches_df)))
sehir_matches_df.sample(5)

In [480]:
sehir_matches_df['match_ratio'] = sehir_matches_df.match_name.apply(lambda x: x[0][1])
sehir_matches_df.match_name = sehir_matches_df.match_name.apply(lambda x: x[0][0])
sehir_matches_df.sample(5)

Unnamed: 0,match_name,screen_name,match_ratio
28057,fatma derya mentes,DeryaaSarii,90
10093,nur betã¼l yerli,bacimbilegin,90
20027,emine bayraktar,emine_blt_1,86
19182,etem hakan ergec,hakan_tunaa,86
3016,sami anis abuhamdeh,samiyigit_,86


In [481]:
tu=twitter_users.drop(labels=["match_name","match_ratio"], axis=1)

In [482]:
twitter_users = sehir_matches_df.merge(tu, on="screen_name")
twitter_users = twitter_users.set_index("id")
twitter_users.head(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,name,lang,followers_count,friends_count,truncated_id,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9.970584e+17,azize fatma cakir,fgurbuz35,86,fatma gürbüz,tr,68.0,122.0,997058428,False
9.369498e+17,okan mergen,miraokan42,86,miraç okan ekmekci,tr,4.0,19.0,936949783,False
2306379000.0,ä°stanbul åehir ãniversitesi ä°åletme enst...,CHPIstGenclik,86,CHP İstanbul Gençlik,tr,5735.0,3223.0,230637902,False
333417900.0,hanife kubra demirci,RabiaDeemirci,86,Rabia Demirci,tr,96.0,314.0,333417875,False
4742273000.0,gizem serpil boylu,kronik__rehber,86,serpil.sedef,tr,49.0,113.0,474227276,False


In [483]:
sehir_orgs.to_csv("../datasets/orgs.csv",index_label="id")
sehir_orgs.to_csv("../REST/static/orgs.csv",index_label="id")

In [484]:
filtered_twu = twitter_users[twitter_users.match_ratio>95]
filtered_twu = filtered_twu.append(twitter_users.loc[291122559])  # me: Ammar Rashed :)

In [485]:
len(twitter_users), len(filtered_twu)

(41768, 1087)

In [486]:
filtered_twu = pd.read_csv("../datasets/filtered_twitter_users.csv", index_col="id")
filtered_twu.sample(5)

Unnamed: 0_level_0,match_name,screen_name,match_ratio,name,lang,followers_count,friends_count,truncated_id,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7.822273e+17,anisa tahiri,anisatahiri,100,Anisa Tahiri,en,28.0,184.0,782227273,False
488335700.0,durali topal,DuraliTopal,100,Durali topal,tr,86.0,72.0,488335702,False
7.514779e+17,yunus emre albayrak,yemrealbyrk,100,Yunus Emre Albayrak,tr,73.0,73.0,751477926,False
1545633000.0,yasin saglam,YasinSaglam7,100,Yasin Saglam,en,54.0,161.0,154563318,False
140518800.0,koral ozgunay,koralozgunay,100,Koral Ozgunay,en,32.0,100.0,140518821,False


In [487]:
filtered_twu.loc[291122559]

match_name            ammar rasid
screen_name        AmmarRashed_MB
match_ratio                    87
name                 Ammar Rashed
lang                           en
followers_count               392
friends_count                 337
truncated_id            291122559
is_org                      False
Name: 291122559.0, dtype: object

In [488]:
filtered_twu[filtered_twu.screen_name.str.contains("kral")]

Unnamed: 0_level_0,match_name,screen_name,match_ratio,name,lang,followers_count,friends_count,truncated_id,is_org
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
14668733.0,ahmet bulut,kral,100,Ahmet Bulut,en,242.0,62.0,14668733,False


# Filter connections

In [489]:
twu_with_orgs = pd.concat([filtered_twu, sehir_orgs])
twu_with_orgs.sample(5)

Unnamed: 0_level_0,followers_count,friends_count,is_org,lang,match_name,match_ratio,name,screen_name,truncated_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
8.61932e+17,15.0,90.0,False,tr,ferhat basboga,100.0,Ferhat basboga,basbogaferhat,861931997
586809100.0,301.0,197.0,False,tr,sude beyaz,100.0,Sude Beyaz,SudeBeyaz_,586809124
37036730.0,525.0,374.0,False,en,omer celebi,100.0,Omer Celebi,omerclb,37036731
3207361000.0,10.0,16.0,False,tr,gulcan ayral,96.0,Gülcan Ayral,GulcanAyral,320736080
47148390.0,372.0,655.0,False,en,kasim kopuz,100.0,kasim kopuz,kkopuz,47148388


In [512]:
assert len(twu_with_orgs) == len(filtered_twu) + len(sehir_orgs)
len(twu_with_orgs)

1318

In [493]:
# user_connections.formation = user_connections.formation.apply(lambda x:{"2018.05.24":True})
user_connections.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
82793,709418275445280768,995806488,{'2018.05.24': True}
53209,1092981798,818364804,{'2018.05.24': True}
15941,103291872,3892757176,{'2018.05.24': True}
39179,2906238288,4249483061,{'2018.05.24': True}
120,106086098,107141880,{'2018.05.24': True}


In [657]:
ids = set(twu_with_orgs.index)
def in_sehir(row, from_col="from_user_id", to_col="to_user_id"):
    return row[from_col] in ids and row[to_col] in ids

In [658]:
user_connections["in_sehir"] = user_connections.apply(lambda row: in_sehir(row), axis=1)
sehir_connections = user_connections[user_connections.in_sehir].drop("in_sehir", axis=1)
sehir_connections.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
12776,796309949768957952,2847233849,{'2018.05.24': True}
7284,237290209,106086098,{'2018.05.24': True}
35869,450639507,3064906390,{'2018.05.24': True}
36077,450639507,1079409680,{'2018.05.24': True}
62919,2328411684,162310009,{'2018.05.24': True}


In [659]:
len(user_connections),len(sehir_connections)

(93296, 5237)

## Adding older connections

In [518]:
from datetime import datetime
def get_dates(cons):
    all_dates = set()
    str2date = lambda strdate: datetime.strptime(strdate, '%Y.%m.%d')  # 2018.05.08

    for dates in cons.formation.apply(lambda x: list(x)):
        for date in dates:
            all_dates.add(str2date(date))
    return [d.strftime('%Y.%m.%d') for d in sorted(all_dates)]

def present_in_date(changes_dates, queried_date):
    """
    changes_dates = {d1:True, d2:False, d3:True} connection added or removed
    """
    if changes_dates:
        str2date = lambda strdate: datetime.strptime(strdate, '%Y.%m.%d')  # 2018.05.08
        changes = sorted(changes_dates,key=lambda d: str2date(d))
        queried_date = datetime.strptime(queried_date, '%Y.%m.%d')
        present = False
        for d in changes:
            if queried_date < str2date(d):
                break
            present = changes_dates[d]
        return present
    else:
        return False

In [500]:
con2 = psycopg2.connect('dbname=old host=localhost user=postgres password=1_sehir_1')

old_cons = pd.read_sql("SELECT * FROM twitter_connection", con2).drop('id', axis=1)

In [502]:
old_cons.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
12947,736829699918340096,1978734690,{'2018.05.08': True}
33100,552041002,737056442,{'2018.05.08': True}
5583,722494878177996800,106086098,{'2018.05.08': True}
27336,275030892,174415744,{'2018.05.08': True}
29969,302851095,847741204204634112,{'2018.05.08': True}


In [503]:
old_cons["in_sehir"] = old_cons.apply(lambda row: in_sehir(row), axis=1)
old_sehir = old_cons[old_cons.in_sehir].drop("in_sehir", axis=1)
old_sehir.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
156,106086098,405717491,{'2018.05.08': True}
8332,2644456425,106086098,{'2018.05.08': True}
36150,450639507,159967656,{'2018.05.08': True}
13199,239427912,736829699918340096,{'2018.05.08': True}
20502,1036789573,1666891914,{'2018.05.08': True}


In [504]:
len(old_cons),len(old_sehir)

(39848, 4652)

In [511]:
concat_cons = pd.concat([sehir_connections, old_sehir])
assert len(concat_cons) == len(old_sehir)+len(sehir_connections)
len(concat_cons)

10366

In [583]:
def optimize_dates(dates_):
    dates = {list(d.keys())[0]:d[list(d.keys())[0]] for d in dates_}
    str2date = lambda strdate: datetime.strptime(strdate, '%Y.%m.%d')  # 2018.05.08
    sorted_dates = sorted(dates, key=lambda d:str2date(d))
    optimized_dates = {sorted_dates[0]: True}
    for d in range(1, len(sorted_dates)):
        if dates[sorted_dates[d-1]] != dates[sorted_dates[d]]:
            optimized_dates[sorted_dates[d]] = dates[sorted_dates[d]]
    return str(optimized_dates)

In [586]:
grouped_cons = concat_cons.groupby(["from_user_id","to_user_id"])["formation"]\
                    .apply(lambda x:optimize_dates(x)).reset_index()
len(grouped_cons)

5728

In [588]:
len(old_sehir), len(sehir_connections)

(4652, 5714)

In [589]:
str2dict = lambda d : ast.literal_eval(d)
grouped_cons.formation = grouped_cons.formation.apply(lambda d: str2dict(d))
grouped_cons.sample(5)

Unnamed: 0,from_user_id,to_user_id,formation
103,54890014,3892757176,{'2018.05.08': True}
3005,2203396100,47148388,{'2018.05.24': True}
1344,450639507,174415744,{'2018.05.08': True}
4231,3434564488,334139315,{'2018.05.24': True}
801,266826023,106086098,{'2018.05.08': True}


In [675]:
twu_with_orgs.to_csv("../datasets/filtered_twitter_users.csv", index_label="id")
twu_with_orgs.to_csv("../REST/static/filtered_twitter_users.csv", index_label="id")

grouped_cons.to_csv("../datasets/filtered_twitter_connections.csv", index_label="id")
grouped_cons.to_csv("../REST/static/filtered_twitter_connections.csv", index_label="id")

## Construct the network

In [604]:
dates = get_dates(grouped_cons)
dates

['2018.05.08', '2018.05.24']

In [608]:
grouped_cons["first_date"] = grouped_cons.formation.apply(
    lambda ds: present_in_date(ds, dates[0]))
grouped_cons.sample(5, random_state=42)

Unnamed: 0,from_user_id,to_user_id,formation,first_date
4445,4354816696,113954902,{'2018.05.24': True},False
4118,3299537073,106086098,{'2018.05.08': True},True
3893,2940722500,955847747093057537,{'2018.05.08': True},True
4210,3407552944,334139315,{'2018.05.24': True},False
5603,945405496466575360,847741204204634112,{'2018.05.08': True},True


In [664]:
G = nx.DiGraph()
# for _, row in user_connections.iterrows():
for _, row in grouped_cons[grouped_cons.first_date==True].iterrows():    
    from_ = truncate(row["from_user_id"])
    to = truncate(row["to_user_id"])
    if from_ in twu_with_orgs.truncated_id and to in twu_with_orgs.truncated_id:
        G.add_edge(from_, to)

In [665]:
augs = ["name", "screen_name","match_name", "followers_count","friends_count", "lang"]
for node in G.nodes():
    user = twu_with_orgs.loc[node]
    for aug in augs:
        if aug=="lang":
            m = user[aug]
        elif type(user[aug])==str:
            m = clean(user[aug])
        else:
            m = user[aug]
        G.nodes[node][aug] = m

In [666]:
len(G.nodes())

404

In [667]:
len(G.edges())

771

In [668]:
for ix,deg in G.degree(G.nodes()):
    G.node[ix]['degree'] = deg
    G.node[ix]['parity'] = (1-deg%2)
    
for ix,in_deg in G.in_degree(G.nodes()):
    G.node[ix]['in_degree'] = in_deg
    
for ix,out_deg in G.out_degree(G.nodes()):
    G.node[ix]['out_degree'] = out_deg

In [669]:
evc = nx.eigenvector_centrality(G)
closeness = nx.closeness_centrality(G)
betweenness = nx.betweenness_centrality(G)
pagerank = nx.pagerank(G)
nxg = G.to_undirected()
clustering = nx.clustering(nxg)

In [670]:
metrics = {"eigenvector_centrality":evc,
           "closeness_centrality":closeness,
          "betweenness":betweenness,
          "pagerank":pagerank,
          "clustering_coefficient":clustering}

In [671]:
for metric_name, metric in metrics.items():
    for ix,v in metric.items():
        G.nodes[ix][metric_name] = v

In [672]:
list(G.nodes(data=True))[0]

(396662786,
 {'betweenness': 0.0,
  'closeness_centrality': 0.0,
  'degree': 1,
  'eigenvector_centrality': 1.6665676702304377e-22,
  'followers_count': 33.0,
  'friends_count': 284.0,
  'in_degree': 0,
  'lang': 'tr',
  'match_name': ' sehir mba',
  'name': 'nemasehir',
  'out_degree': 1,
  'pagerank': 0.0004939318456630349,
  'parity': 0,
  'screen_name': 'nemasehir'})

In [673]:
import json
from networkx.readwrite import json_graph
data = nx.node_link_data(G)
with open('../REST/static/networks/twitter_users_graph2.json', 'w') as f:
    json.dump(data, f, indent=4)

## Calculating Homophily

In [625]:
def homophily(nw, metric="lang"):
    langs_probs = dict()
    for n in nw.nodes():
        user = nw.nodes[n]
        langs_probs.setdefault(user[metric], 0)
        langs_probs[user[metric]] += 1
    heterogeneity_fraction_norm = 1 - sum(
        [(float(i)/len(nw.nodes()))**2 for i in langs_probs.values()])
    cross_edges = sum(
        [int(nw.nodes[f][metric] != nw.nodes[t][metric] ) for f,t in nw.edges()])
    cross_metric_ratio = cross_edges/float(len(nw.edges()))
    print("cross-metric edges ratio: ", cross_metric_ratio)
    print("Heterogeneity Fraction Norm", heterogeneity_fraction_norm)
    return cross_metric_ratio < heterogeneity_fraction_norm

In [626]:
homophily(G)

cross-metric edges ratio:  0.48249027237354086
Heterogeneity Fraction Norm 0.454747083619253


False

# Transitivity

In [3]:
nx.transitivity(G)

0.016792666543739957