In [25]:
import psycopg2

import pandas as pd
import numpy as np

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from copy import deepcopy

import re, unicodedata

# from graph_tool.all import *

### parse sehir contacts directory (full names, emails)

In [2]:
sehir_directory = pd.read_csv('datasets/contacts.csv', 
                               encoding = "ISO-8859-1", 
                               usecols=['First Name', 'Last Name', 'Primary Email']).dropna()

In [3]:
sehir_directory.head()

Unnamed: 0,First Name,Last Name,Primary Email
102,Burak,Fidan,burakfidan@std.sehir.edu.tr
103,Muhammed,Aydin,cihadaydin@std.sehir.edu.tr
104,Muhammet Masuk,Aktas,muhammetaktas@std.sehir.edu.tr
105,Sezer,Aktas,sezeraktas@std.sehir.edu.tr
106,Ayse Nur,Salk,aysesalk@std.sehir.edu.tr


In [23]:
"{} sehir contacts".format(len(sehir_directory))

'9088 sehir contacts'

In [4]:
def get_matches_edit_distance(item, choices, limit, scorer=fuzz.token_sort_ratio):
    return process.extract(item, choices, limit=limit, scorer=scorer)

def get_sehir_twitter_matches(twitter_users, sehir_directory, limit=2):
    fullnames = [' '.join(first_last_name).lower() 
                 for first_last_name in sehir_directory[['First Name', 'Last Name']].values]
    
    twitter_user_by_screen_name = twitter_users.set_index('twitter_screen_name')
    
    for twitter_screen_name in twitter_users['twitter_screen_name']:
        twitter_name = twitter_user_by_screen_name.loc[twitter_screen_name]['cleaned_twitter_name']
        sehir_matches = get_matches_edit_distance(twitter_name, fullnames, limit)
        
        yield (twitter_screen_name, sehir_matches)
        
def filter_matches_by_threshold(matches_dict, threshold=70):
    filtered_dict = dict()
    for twitter_screen_name, matches in matches_dict.items():
        filtered = [(match, score) for match, score in matches if score > threshold]
        
        if filtered:
            filtered_dict[twitter_screen_name] = filtered
        
    return filtered_dict

def get_matches_dataframe(twitter_users, sehir_directory, threshold, limit):
    matches = {screen_name : sehir_matches for screen_name, sehir_matches in 
               get_sehir_twitter_matches(twitter_users, sehir_directory, limit=limit)}
    
    filtered_matches = filter_matches_by_threshold(matches, threshold=threshold)
    screen_names = filtered_matches.keys()
    return pd.DataFrame({'twitter_screen_name': list(screen_names),
                         'sehir_matches': [filtered_matches[screen_name] for screen_name in screen_names]})

def clean(name):
    return unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').lower().decode("ascii")

### connect to sql database

In [5]:
connection = psycopg2.connect('dbname=twitter_accounts host=localhost user=postgres password=1_sehir_1')

twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)\
.rename(columns={'id': 'GUID', 
                 'name': 'twitter_name',
                 'description': 'profile_description',
                 'screen_name': 'twitter_screen_name'})

user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

In [6]:
len(twitter_users)

79407

In [7]:
twitter_users["cleaned_twitter_name"] = twitter_users["twitter_name"].apply(lambda x:clean(x) if len(clean(x))>4 else np.NaN)

In [8]:
twitter_users = twitter_users.dropna()

In [9]:
len(twitter_users)

72090

In [10]:
twitter_users.head(2)

Unnamed: 0,GUID,twitter_name,twitter_screen_name,profile_description,favourites_count,statuses_count,lang,cleaned_twitter_name
0,106086098,ŞEHİR Üniversitesi,SehirUniversite,İstanbul Şehir Üniversitesi'nin Resmi Twitter ...,384,3342,tr,sehir universitesi
1,835028362032742400,Girişimcilik Kulübü,SehirGirisim,İstanbul Şehir Üniversitesi Girişimcilik Kulüb...,80,76,tr,girisimcilik kulubu


### user_connections contains follower/followee relationships by GUID

In [11]:
user_connections.head(2)

Unnamed: 0,from_user_id,to_user_id
0,106086098,835028362032742400
1,106086098,3627550275


### matching subset of sehir twitter accounts _ALL_

In [12]:
sehir_matches_df = get_matches_dataframe(twitter_users,
                                         sehir_directory, threshold=90, limit=2)



In [13]:
print("There are {} matches".format(len(sehir_matches_df)))
sehir_matches_df.head(5)

There are 3000 matches


Unnamed: 0,sehir_matches,twitter_screen_name
0,"[(elif ozturk, 100)]",bidikshowtime
1,"[(mehmet ozdemir, 100)]",Mehmet317213
2,"[(ahmet emir, 95), (ahmet demir98, 92)]",ahmetde43241739
3,"[(mehmed celik, 92), (mehmed celik, 92)]",CelikMfc23
4,"[(meliksah aydin, 92)]",MelisaA85500624


In [14]:
merged = sehir_matches_df.merge(twitter_users, left_on='twitter_screen_name', right_on='twitter_screen_name')

In [15]:
merged[['GUID', 'twitter_screen_name', 'cleaned_twitter_name', 'sehir_matches']].tail(5)

Unnamed: 0,GUID,twitter_screen_name,cleaned_twitter_name,sehir_matches
2995,935107166381510657,EnesAkt68970583,enes akturk,"[(enis akturk, 91)]"
2996,378511292,Support91000048,adil sava,"[(adil sava, 100)]"
2997,938006900951658496,Hakanyigit46,hakan yigit,"[(hakan yigit, 100)]"
2998,935177579195195392,Mustafa30586247,mustafa gul,"[(mustafa gunel, 92), (mustafa gurel, 92)]"
2999,898917016324722689,HasanCskn61,hasan coskun,"[(hasan coskun, 100)]"


In [16]:
merged.to_csv('datasets/sehir_matches.csv', index_label="GUID")

In [17]:
merged = pd.read_csv('datasets/sehir_matches.csv', index_col="GUID")

In [18]:
merged.head(5)

Unnamed: 0_level_0,sehir_matches,twitter_screen_name,GUID.1,twitter_name,profile_description,favourites_count,statuses_count,lang,cleaned_twitter_name
GUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,"[('elif ozturk', 100)]",bidikshowtime,2924769794,Elif Öztürk,"Çanakkale Fen Lisesi'nde okumuş, Boğaziçi Üniv...",1,2,en,elif ozturk
1,"[('mehmet ozdemir', 100)]",Mehmet317213,936648794699116546,Mehmet Özdemir,,0,0,tr,mehmet ozdemir
2,"[('ahmet emir', 95), ('ahmet demir98', 92)]",ahmetde43241739,935555760322433025,ahmet demir,,0,9,tr,ahmet demir
3,"[('mehmed celik', 92), ('mehmed celik', 92)]",CelikMfc23,3311807019,Mehmet Celik,,226,9,de,mehmet celik
4,"[('meliksah aydin', 92)]",MelisaA85500624,934602584349986816,Melisa Aydin,,0,0,tr,melisa aydin


In [30]:
'{} twitter accounts have been matched successfully'.format(len(merged))

'3000 twitter accounts have been matched successfully'

In [28]:
# sehir_matches_by_guid = merged.set_index('GUID')

In [None]:
number_of_matches, _ = merged.shape

network = Graph(directed=True)
network.add_vertex(number_of_matches)

vprop = network.new_vertex_property("string")

for vertex, twitter_screen_name in zip(network.vertices(), merged['twitter_screen_name'].values):
    vprop[vertex] = twitter_screen_name
    
network.vertex_properties["twitter_screen_name"] = vprop

for i, row in user_connections.iterrows():
    from_ = row['from_user_id']
    to = row['to_user_id']
    
    try:
        network.add_edge(find_vertex(network, vprop, sehir_matches_by_guid.loc[from_]['twitter_name']), 
                         find_vertex(network, vprop, sehir_matches_by_guid.loc[to]['twitter_name']))
    except Exception as e:
        pass

## Network has no edges only because a every small subset of the data was parsed

In [None]:
graph_draw(network, vertex_text=network.vertex_properties["twitter_screen_name"],
           vertex_font_size=40,
           output_size=(10000, 10000), output="sehir_subset.png")