In [1]:
import psycopg2

import pandas as pd
import numpy as np

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from copy import deepcopy

import re, unicodedata

# from joblib import Parallel,
delayed
# from graph_tool.all import *

### parse sehir contacts directory (full names, emails)

In [2]:
sehir_directory = pd.read_csv('datasets/contacts.csv', 
                               encoding = "ISO-8859-1", 
                               usecols=['First Name', 'Last Name', 'Primary Email']).dropna()

In [3]:
sehir_directory.head()

Unnamed: 0,First Name,Last Name,Primary Email
102,Burak,Fidan,burakfidan@std.sehir.edu.tr
103,Muhammed,Aydin,cihadaydin@std.sehir.edu.tr
104,Muhammet Masuk,Aktas,muhammetaktas@std.sehir.edu.tr
105,Sezer,Aktas,sezeraktas@std.sehir.edu.tr
106,Ayse Nur,Salk,aysesalk@std.sehir.edu.tr


In [4]:
"{} sehir contacts".format(len(sehir_directory))

'9088 sehir contacts'

In [5]:
fullnames = [' '.join(first_last_name).lower() 
                 for first_last_name in sehir_directory[['First Name', 'Last Name']].values]

In [28]:
def get_matches_edit_distance(item, choices, limit, scorer=fuzz.token_sort_ratio):
    return process.extract(item, choices, limit=limit, scorer=scorer)
counter = 0
def get_sehir_twitter_matches(twitter_users, sehir_directory, limit=2):
    global fullnames,twitter_users_count
    twitter_user_by_screen_name = twitter_users.set_index('twitter_screen_name')
    
    for twitter_screen_name in twitter_users['twitter_screen_name']:
        twitter_name = twitter_user_by_screen_name.loc[twitter_screen_name]['cleaned_twitter_name']
        sehir_matches = get_matches_edit_distance(twitter_name, fullnames, limit)
        counter += 1
        if counter %100 == 0:
            print(counter, "out of ", twitter_users_count)
        yield (twitter_screen_name, sehir_matches)
        
def filter_matches_by_threshold(matches_dict, threshold=70):
    filtered_dict = dict()
    for twitter_screen_name, matches in matches_dict.items():
        filtered = [(match, score) for match, score in matches if score > threshold]
        
        if filtered:
            filtered_dict[twitter_screen_name] = filtered
        
    return filtered_dict

def get_matches_dataframe(twitter_users, sehir_directory, threshold, limit):
    matches = {screen_name : sehir_matches for screen_name, sehir_matches in 
               get_sehir_twitter_matches(twitter_users, sehir_directory, limit=limit)}
    
    filtered_matches = filter_matches_by_threshold(matches, threshold=threshold)
    screen_names = filtered_matches.keys()
    return pd.DataFrame({'twitter_screen_name': list(screen_names),
                         'sehir_matches': [filtered_matches[screen_name] for screen_name in screen_names]})

def clean(name):
    return unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').lower().decode("ascii")

### connect to sql database

In [20]:
connection = psycopg2.connect('dbname=twitter_accounts_new host=localhost user=postgres password=1_sehir_1')

twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)\
.rename(columns={'id': 'GUID', 
                 'name': 'twitter_name',
                 'description': 'profile_description',
                 'screen_name': 'twitter_screen_name'})

user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

In [21]:
twitter_users["cleaned_twitter_name"] = twitter_users["twitter_name"].apply(lambda x:clean(x) if len(clean(x))>4 else np.NaN)
twitter_users.dropna(inplace=True)
twitter_users.head(5)

Unnamed: 0,GUID,twitter_name,twitter_screen_name,profile_description,followers_count,friends_count,favourites_count,statuses_count,lang,cleaned_twitter_name
0,2924769794,Elif Öztürk,bidikshowtime,"Çanakkale Fen Lisesi'nde okumuş, Boğaziçi Üniv...",46,146,1,2,en,elif ozturk
2,78362287,Ozan Kublay,ozankublay,,307,892,1093,69,en,ozan kublay
3,927559556,Dilay Kirişçi,annebanafalbak,,119,146,34,307,tr,dilay kirisci
4,567346178,Aslı Atay,asliatayy,,156,523,69,1687,tr,asl atay
5,94153120,Özgecan,theewayfarer,Student athlete and research assistant,136,472,1097,1513,en,ozgecan


In [27]:
twitter_users_count = len(twitter_users)
twitter_users_count

34142

In [23]:
fullnames = [' '.join(first_last_name).lower() 
                 for first_last_name in sehir_directory[['First Name', 'Last Name']].values]

### user_connections contains follower/followee relationships by GUID

In [24]:
user_connections.head(2)

Unnamed: 0,from_user_id,to_user_id
0,2924769794,566810673
1,2924769794,78362287


### matching subset of sehir twitter accounts _ALL_

In [25]:
sehir_matches_df = get_matches_dataframe(twitter_users,
                                         sehir_directory, threshold=80, limit=2)



In [34]:
print("There are {} matches".format(len(sehir_matches_df)))
sehir_matches_df.head(5)

There are 4063 matches


Unnamed: 0,sehir_matches,twitter_screen_name
0,"[(mustafa demir, 100), (mustafa gokdemir, 90)]",mustafedemir
1,"[(melisa yalin, 83)]",selambenmelissa
2,"[(mustafa durdu, 81), (mustafa duman, 81)]",dundar_mstf
3,"[(ahmet karanfil, 81)]",AhmetMaranki
4,"[(derya genc, 82)]",bayan_sukela


In [35]:
merged = sehir_matches_df.merge(twitter_users, left_on='twitter_screen_name', right_on='twitter_screen_name')

In [36]:
merged[['GUID', 'twitter_screen_name', 'cleaned_twitter_name', 'sehir_matches']].tail(5)

Unnamed: 0,GUID,twitter_screen_name,cleaned_twitter_name,sehir_matches
4058,3855028132,busra1i,busra zengin,"[(busra ozzengin, 92), (busra sezgin, 83)]"
4059,1029280640,kbuluts,suleyman karabulut,"[(uluhan karabulut, 82), (suleyman karatas, 82)]"
4060,477198346,merveaktas92,merve aktas,"[(merve altas, 91), (merve akkus, 82)]"
4061,1479485390,yagmurdan_once,sevim erdogan,"[(sevde erdogan, 85)]"
4062,1180515096,demirb0,furkan demir,"[(furkan demir, 100), (furkan emer, 87)]"


In [38]:
merged.to_csv('datasets/sehir_matches.csv', index_label="GUID")

In [39]:
merged = pd.read_csv('datasets/sehir_matches.csv', index_col="GUID")

In [40]:
merged.head(5)

Unnamed: 0_level_0,sehir_matches,twitter_screen_name,GUID.1,twitter_name,profile_description,followers_count,friends_count,favourites_count,statuses_count,lang,cleaned_twitter_name
GUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,"[('mustafa demir', 100), ('mustafa gokdemir', ...",mustafedemir,294557248,Mustafa DEMİR,BEDEN EĞİTİMİ ÖĞRETMENİ \nAli Akkanat Anadolu ...,45,53,349,261,tr,mustafa demir
1,"[('melisa yalin', 83)]",selambenmelissa,3410655087,Melissa Yanıcı,gizlilikten dolayı fotoğraf yayınlamıyorum,115870,514,117,495,tr,melissa yanc
2,"[('mustafa durdu', 81), ('mustafa duman', 81)]",dundar_mstf,2996348915,Mustafa Dündar,,869,306,3669,302,tr,mustafa dundar
3,"[('ahmet karanfil', 81)]",AhmetMaranki,89203798,Ahmet MARANKİ,Prof. Dr. Ahmet Maranki'nin çalışmalarının yay...,4762,28,0,0,tr,ahmet maranki
4,"[('derya genc', 82)]",bayan_sukela,239841117,Derya E.,"Büyük adam olmana lüzum yok, sadece adam ol ye...",567,70,1,2615,tr,derya e.


In [42]:
# merged.set_index('GUID.1').loc[291122559]

In [43]:
'{} twitter accounts have been matched successfully'.format(len(merged))

'4063 twitter accounts have been matched successfully'

In [None]:
# sehir_matches_by_guid = merged.set_index('GUID')

In [None]:
number_of_matches, _ = merged.shape

network = Graph(directed=True)
network.add_vertex(number_of_matches)

vprop = network.new_vertex_property("string")

for vertex, twitter_screen_name in zip(network.vertices(), merged['twitter_screen_name'].values):
    vprop[vertex] = twitter_screen_name
    
network.vertex_properties["twitter_screen_name"] = vprop

for i, row in user_connections.iterrows():
    from_ = row['from_user_id']
    to = row['to_user_id']
    
    try:
        network.add_edge(find_vertex(network, vprop, sehir_matches_by_guid.loc[from_]['twitter_name']), 
                         find_vertex(network, vprop, sehir_matches_by_guid.loc[to]['twitter_name']))
    except Exception as e:
        pass

In [None]:
graph_draw(network, vertex_text=network.vertex_properties["twitter_screen_name"],
           vertex_font_size=40,
           output_size=(10000, 10000), output="sehir_subset.png")