In [29]:
from copy import deepcopy
from itertools import islice

import numpy as np
import pandas as pd
import psycopg2

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

### Parse the Şehir contacts directory (full names, emails)

In [2]:
# Read only the name and e-mail columns of the contacts directory.
contacts = pd.read_csv(
    'datasets/contacts.csv',
    encoding="ISO-8859-1",
    usecols=['First Name', 'Last Name', 'Primary Email'],
)

# Discard every row that has a missing value in any of the loaded columns.
contacts_no_nans = contacts.dropna()

# Build "First Last" strings to use as the candidate names for fuzzy matching.
fullnames = [
    ' '.join(name_parts)
    for name_parts in zip(contacts_no_nans['First Name'], contacts_no_nans['Last Name'])
]

In [72]:
def get_matches_edit_distance(item, choices, limit):
    """Return the best fuzzy matches for `item` among `choices`.

    The query is scored against every choice with fuzzywuzzy's
    `token_sort_ratio` scorer, and at most `limit` results are requested
    from `process.extract`.

    Parameters
    ----------
    item : str
        The string to look up.
    choices : collection of str
        Candidate strings to compare against.
    limit : int
        Maximum number of matches to request.

    Returns
    -------
    The value returned by `process.extract`; for a list of choices this is
    a list of `(choice, score)` tuples.
    """
    best_matches = process.extract(
        item,
        choices,
        scorer=fuzz.token_sort_ratio,
        limit=limit,
    )
    return best_matches

def match_twitter_handles_to_names(twitter_users, sehir_student_names, twitter_handle_column='screen_name', limit=2, name_column='name'):
    """Lazily pair every Twitter handle with its closest candidate names.

    Parameters
    ----------
    twitter_users : pandas.DataFrame
        Table of Twitter accounts; must contain `twitter_handle_column`
        and `name_column`.
    sehir_student_names : collection of str
        Candidate full names, passed on to `get_matches_edit_distance`.
    twitter_handle_column : str, default 'screen_name'
        Column holding the account handle that is yielded as the key.
    limit : int, default 2
        Maximum number of candidate matches requested per account.
    name_column : str, default 'name'
        Column holding the display name that is fuzzy-matched.

    Yields
    ------
    tuple
        `(handle, matches)` for each row of `twitter_users`, in row order,
        where `matches` is whatever `get_matches_edit_distance` returns.

    Raises
    ------
    KeyError
        On first iteration, if either column is missing from the frame.
    """
    handles = twitter_users[twitter_handle_column]
    display_names = twitter_users[name_column]

    # Walk both columns in lockstep rather than looking each row up by handle:
    # a label lookup on a handle-indexed frame returns a Series (not a single
    # name) whenever a handle occurs more than once, and costs a lookup per row.
    for screen_name, name in zip(handles, display_names):
        yield screen_name, get_matches_edit_distance(name, sehir_student_names, limit)
        
def filter_matches_by_threshold(matches_dict, threshold=70):
    """Keep only the matches whose score is strictly above `threshold`.

    Parameters
    ----------
    matches_dict : dict
        Maps a key to a sequence of matches; the score is read from
        position 1 of each match.
    threshold : int, default 70
        Exclusive lower bound on the score.

    Returns
    -------
    dict
        A new dict with the same key order, containing only keys that
        still have at least one match after filtering.
    """
    kept = {}
    for key, candidates in matches_dict.items():
        above = [candidate for candidate in candidates if candidate[1] > threshold]
        if not above:
            # Nothing survived for this key, so leave it out entirely.
            continue
        kept[key] = above
    return kept

### Connect to the PostgreSQL database

In [68]:
# Open a connection to the `twitter_accounts` database as the `postgres` user.
connection = psycopg2.connect("dbname=twitter_accounts user=postgres")

# Load the whole user table into a DataFrame.
twitter_users = pd.read_sql("SELECT * FROM twitter_user", connection)

# Load the connection table and discard its `id` column.
user_connections = (
    pd.read_sql("SELECT * FROM twitter_connection", connection)
    .drop('id', axis=1)
)

In [69]:
# Preview the first two rows of the user table.
twitter_users.head(n=2)

Unnamed: 0,id,name,screen_name,description,followers_count,friends_count,favourites_count,statuses_count,lang
0,106086098,ŞEHİR Üniversitesi,SehirUniversite,İstanbul Şehir Üniversitesi'nin Resmi Twitter ...,11367,165,387,3346,tr
1,813478272910630913,SehirCHES,SehirCHES,Official Twitter for @SehirUniversite 's Cente...,102,201,27,44,tr


### `user_connections` contains follower/followee relationships, keyed by Twitter user ID

In [70]:
# Preview the first two rows of the connection table.
user_connections.head(n=2)

Unnamed: 0,from_user_id,to_user_id
0,106086098,813478272910630913
1,106086098,835028362032742400


In [86]:
# Match only the first `match_first` users yielded by the generator.
matches_dict, match_first = dict(), 100
gen = match_twitter_handles_to_names(twitter_users, fullnames, limit=2)

# islice stops cleanly when fewer than `match_first` users are available;
# calling next() a fixed number of times would raise StopIteration instead.
for screen_name, match in islice(gen, match_first):
    matches_dict[screen_name] = match

# Keep only candidates scoring strictly above 80.
filtered = filter_matches_by_threshold(matches_dict, 80)

In [87]:
# Show the handles that kept at least one candidate name after thresholding.
filtered

{'Imanov_Resadet': [('Resadet Imanov', 100)],
 'MustafaOzelden': [('Mustafa Ozel', 96), ('Mustafa Pekel', 83)],
 'SehirCycling': [('Sehir Cycling Club', 94)],
 'SehirGirisim': [('Girisimcilik Kulubu', 89)],
 '_apollo_7': [('Abdullah Kaplan', 83)],
 'aatcil': [('Abdurrahman Patur', 85), ('Abdurrahman Altundal', 83)],
 'ak1gokhan': [('Gokhan Ak', 94)],
 'haticeaynurr': [('Hatice Aygen', 83), ('Hatice Aygor', 83)],
 'malibuyukkara': [('Mehmet Ali Kahraman', 81)],
 'malpergumus': [('Mustafa Alper Eroglu', 81)],
 'meneserkan': [('Enes Erkan', 100), ('Serkan Senel', 82)],
 'mfatihaysan': [('Fatih Mehmet Arslan', 92), ('Fatih Mehmet Agan', 91)],
 'nurpuytu': [('Fatmanur Puytu', 100)],
 'sehiricebreaker': [('Icebreakers PR', 84), ('Sehir Icebreakers Club', 82)],
 'ycbenli': [('Cengiz Benli', 100)]}