In [1]:
from SehirParser import *

import psycopg2

import ast

### parse sehir contacts directory (full names, emails)

In [2]:
sehir_directory = pd.read_csv('datasets/contacts.csv', 
                               encoding = "ISO-8859-1", 
                               usecols=['First Name', 'Last Name', 'Primary Email'])
sehir_directory.replace(np.nan, '', regex=True, inplace=True)

In [3]:
sehir_directory.tail()

Unnamed: 0,First Name,Last Name,Primary Email
10552,Sukru,Olkun,sukruolkun@std.sehir.edu.tr
10553,Zeynep Begum,Tanis,zeyneptanis@std.sehir.edu.tr
10554,Anastasiya,Osipova,anastasiyaosipova@std.sehir.edu.tr
10555,Omer,Koca,omerkoca@std.sehir.edu.tr
10556,Muhammed Murat,Yilmaz,muratyilmaz@std.sehir.edu.tr


In [4]:
"{} sehir contacts".format(len(sehir_directory))

'10557 sehir contacts'

In [5]:
# twitter = SehirParser('datasets/contacts.csv')
# twitter.connect_db("twitter_accounts_new","localhost", "postgres", "1_sehir_1")

In [6]:
# tw_df, sehir_matches_tw_df = twitter.get_sehir_matches_df()

In [7]:
fullnames = [' '.join(first_last_name).lower() 
                 for first_last_name in sehir_directory[['First Name', 'Last Name']].values]

In [8]:
import time

In [9]:
def get_matches_edit_distance(item, choices, limit, scorer=fuzz.token_sort_ratio):
    return process.extract(item, choices, limit=limit, scorer=scorer)
counter = 0
def get_sehir_twitter_matches(twitter_users, sehir_directory, limit=2):
    global fullnames, counter
    twitter_user_by_screen_name = twitter_users.set_index('twitter_screen_name')
    start = time.time()
    for twitter_screen_name in twitter_users['twitter_screen_name']:
        twitter_name = twitter_user_by_screen_name.loc[twitter_screen_name]['cleaned_twitter_name']
        sehir_matches = get_matches_edit_distance(twitter_name, fullnames, limit)
        counter += 1
        if counter %100 == 0:
            print(counter, "out of ", len(twitter_users))
            start_ = time.time()
            print(start_-start, "seconds")
            start = start_
        yield (twitter_screen_name, sehir_matches)
        
def filter_matches_by_threshold(matches_dict, threshold=85):
    filtered_dict = dict()
    for twitter_screen_name, matches in matches_dict.items():
        filtered = [(match, score) for match, score in matches if score > threshold]
        
        if filtered:
            filtered_dict[twitter_screen_name] = filtered
        
    return filtered_dict

def get_matches_dataframe(twitter_users, sehir_directory, threshold=85, limit=2):
    matches = {screen_name : sehir_matches for screen_name, sehir_matches in 
               get_sehir_twitter_matches(twitter_users, sehir_directory, limit=limit)}
    
    filtered_matches = filter_matches_by_threshold(matches, threshold=threshold)
    screen_names = filtered_matches.keys()
    return pd.DataFrame({'twitter_screen_name': list(screen_names),
                         'sehir_matches': [filtered_matches[screen_name] for screen_name in screen_names]})

def clean(name):
    return unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').lower().decode("ascii")

# connect to sql database

In [10]:
connection = psycopg2.connect('dbname=twitter_accounts_new host=localhost user=postgres password=1_sehir_1')

tw = pd.read_sql("SELECT * FROM twitter_user", connection)\
.rename(columns={'id': 'GUID', 
                 'name': 'twitter_name',
                 'description': 'profile_description',
                 'screen_name': 'twitter_screen_name'})

user_connections = pd.read_sql("SELECT * FROM twitter_connection", connection).drop('id', axis=1)

In [11]:
tw["cleaned_twitter_name"] = tw["twitter_name"].apply(lambda x:clean(x) if len(clean(x))>4 else np.NaN)
tw.dropna(inplace=True)
tw.head(5)

Unnamed: 0,GUID,twitter_name,twitter_screen_name,profile_description,followers_count,friends_count,favourites_count,statuses_count,lang,cleaned_twitter_name
0,2924769794,Elif Öztürk,bidikshowtime,"Çanakkale Fen Lisesi'nde okumuş, Boğaziçi Üniv...",46,146,1,2,en,elif ozturk
2,78362287,Ozan Kublay,ozankublay,,307,892,1093,69,en,ozan kublay
3,927559556,Dilay Kirişçi,annebanafalbak,,119,146,34,307,tr,dilay kirisci
4,567346178,Aslı Atay,asliatayy,,156,523,69,1687,tr,asl atay
5,94153120,Özgecan,theewayfarer,Student athlete and research assistant,136,472,1097,1513,en,ozgecan


In [12]:
tw.to_csv("datasets/tw_users.csv", index_label="GUID")

In [13]:
twitter_users_count = len(tw)
twitter_users_count

34142

In [14]:
user_connections.head(2)

Unnamed: 0,from_user_id,to_user_id
0,2924769794,566810673
1,2924769794,78362287


In [15]:
sehir_matches = Parallel(n_jobs=-1)(delayed(get_matches_dataframe)(
    tw[int(i*(len(tw)/8)):int((i+1)*(len(tw)/8))],
    sehir_directory) for i in range(8))



100 out of  4267
17.265946865081787 seconds
100 out of  4268
17.301717519760132 seconds
100 out of  4268
17.687835693359375 seconds
100 out of  4268
17.788217067718506 seconds
100 out of  4268
17.886919736862183 seconds
100 out of  4268
17.969120264053345 seconds
100 out of  4267
18.094061613082886 seconds
100 out of  4268
18.453871488571167 seconds
200 out of  4268
18.209047555923462 seconds
200 out of  4267
18.291517972946167 seconds
200 out of  4268
17.86551523208618 seconds
200 out of  4268
17.60398292541504 seconds
200 out of  4267
17.83299684524536 seconds
200 out of  4268
18.08817982673645 seconds
200 out of  4268
18.18011999130249 seconds
200 out of  4268
18.42969274520874 seconds




300 out of  4268
17.435218572616577 seconds
300 out of  4268
17.3332200050354 seconds
300 out of  4268
17.957885026931763 seconds
300 out of  4268
17.987644910812378 seconds
300 out of  4267
300 out of  4268
18.393767833709717 seconds
18.551138877868652 seconds
300 out of  4267
19.024789333343506 seconds
300 out of  4268
17.950047731399536 seconds
400 out of  4268
18.571648836135864 seconds
400 out of  4268
18.314082860946655 seconds
400 out of  4268
18.861629009246826 seconds
400 out of  4267
18.255645513534546 seconds
400 out of  4268
18.88218927383423 seconds
400 out of  4268
18.617494583129883 seconds
400 out of  4267
19.321255445480347 seconds
400 out of  4268
19.210911512374878 seconds




500 out of  4268
18.07310390472412 seconds
500 out of  4268
18.092943906784058 seconds
500 out of  4268
18.51248574256897 seconds
500 out of  4267
18.345088720321655 seconds
500 out of  4268
18.469186782836914 seconds
500 out of  4267
17.765822649002075 seconds
500 out of  4268
18.511550903320312 seconds
500 out of  4268
18.43035387992859 seconds
600 out of  4268
17.953754663467407 seconds
600 out of  4268
17.98629641532898 seconds
600 out of  4268
18.32557487487793 seconds
600 out of  4267
18.392552137374878 seconds
600 out of  4268
18.428070068359375 seconds
600 out of  4268
18.307281017303467 seconds
600 out of  4267
18.39795422554016 seconds
600 out of  4268
18.273663759231567 seconds




700 out of  4268
18.142348527908325 seconds
700 out of  4268
18.54949188232422 seconds
700 out of  4268
18.268422603607178 seconds
700 out of  4267
17.789704084396362 seconds
700 out of  4268
17.82510018348694 seconds
700 out of  4268
18.078860998153687 seconds
700 out of  4267
18.71204400062561 seconds
700 out of  4268
18.28415846824646 seconds
800 out of  4268
17.568140506744385 seconds
800 out of  4268
17.882240056991577 seconds
800 out of  4268
18.398870944976807 seconds
800 out of  4268
17.835928916931152 seconds
800 out of  4267
18.48406171798706 seconds
800 out of  4268
18.141968488693237 seconds
800 out of  4267
18.17204737663269 seconds
800 out of  4268
18.413434743881226 seconds
900 out of  4268
17.792025327682495 seconds
900 out of  4268
17.68198585510254 seconds
900 out of  4268
18.223508596420288 seconds
900 out of  4268
17.739572286605835 seconds
900 out of  4268
18.036080360412598 seconds
900 out of  4267
18.03506898880005 seconds
900 out of  4267
17.484588384628296 seco



1300 out of  4268
18.253259897232056 seconds
1400 out of  4268
18.004423141479492 seconds
1400 out of  4268
18.07024383544922 seconds
1400 out of  4267
17.913684129714966 seconds
1400 out of  4268
18.352844953536987 seconds
1400 out of  4268
18.050633430480957 seconds
1400 out of  4268
18.344842195510864 seconds
1400 out of  4267
19.112195253372192 seconds
1400 out of  4268
19.15216636657715 seconds




1500 out of  4268
18.842026948928833 seconds
1500 out of  4268
18.936829090118408 seconds
1500 out of  4267
18.87243342399597 seconds
1500 out of  4268
18.804137706756592 seconds
1500 out of  4268
20.13193392753601 seconds
1500 out of  4268
19.241639375686646 seconds
1500 out of  4267
19.30998158454895 seconds
1500 out of  4268
19.587082862854004 seconds
1600 out of  4268
18.203056573867798 seconds
1600 out of  4268
17.650569438934326 seconds
1600 out of  4267
18.081778287887573 seconds
1600 out of  4268
18.322479248046875 seconds
1600 out of  4268
17.63896369934082 seconds
1600 out of  4268
19.11797070503235 seconds
1600 out of  4267
18.26419734954834 seconds
1600 out of  4268
18.571415662765503 seconds
1700 out of  4268
17.566684007644653 seconds
1700 out of  4268
18.207682132720947 seconds
1700 out of  4267
17.573900938034058 seconds
1700 out of  4268
17.86687421798706 seconds
1700 out of  4268
18.06523895263672 seconds
1700 out of  4268
17.586085319519043 seconds
1700 out of  4267




2300 out of  4268
18.32698655128479 seconds
2300 out of  4268
18.382631540298462 seconds
2300 out of  4267
17.386085987091064 seconds
2300 out of  4268
18.288203239440918 seconds
2300 out of  4268
17.78687047958374 seconds
2300 out of  4267
17.27074956893921 seconds
2300 out of  4268
18.456771850585938 seconds
2400 out of  4268
18.136616468429565 seconds
2400 out of  4268
18.088878870010376 seconds
2400 out of  4268
17.853448152542114 seconds
2400 out of  4267
18.44068956375122 seconds
2400 out of  4268
18.330170392990112 seconds
2400 out of  4268
18.208729028701782 seconds
2400 out of  4267
19.19256591796875 seconds
2400 out of  4268
19.012084245681763 seconds
2500 out of  4268
18.150882244110107 seconds
2500 out of  4268
17.836405277252197 seconds
2500 out of  4268
17.7088725566864 seconds
2500 out of  4267
17.32562017440796 seconds
2500 out of  4268
18.418993711471558 seconds
2500 out of  4268
17.490052700042725 seconds
2500 out of  4267
18.49817132949829 seconds
2500 out of  4268
1



3200 out of  4268
18.44262409210205 seconds
3300 out of  4268
19.241634368896484 seconds
3300 out of  4268
18.66929316520691 seconds
3300 out of  4268
18.112233638763428 seconds
3300 out of  4267
18.912370204925537 seconds
3300 out of  4268
18.55874800682068 seconds
3300 out of  4268
19.056516408920288 seconds
3300 out of  4267
18.884244918823242 seconds
3300 out of  4268
17.838406801223755 seconds
3400 out of  4268
18.189141273498535 seconds
3400 out of  4268
17.605539321899414 seconds
3400 out of  4268
17.650384426116943 seconds
3400 out of  4267
18.150221347808838 seconds
3400 out of  4268
17.613330841064453 seconds
3400 out of  4268
18.0676691532135 seconds
3400 out of  4267
17.581520080566406 seconds




3500 out of  4268
17.388402700424194 seconds
3400 out of  4268
18.472131729125977 seconds
3500 out of  4268
17.46627140045166 seconds
3500 out of  4268
18.20729422569275 seconds
3500 out of  4267
17.694626808166504 seconds
3500 out of  4268
17.766173601150513 seconds
3500 out of  4268
18.1105535030365 seconds
3500 out of  4267
17.462971687316895 seconds




3600 out of  4268
18.428831815719604 seconds
3500 out of  4268
18.082009077072144 seconds
3600 out of  4268
17.94762134552002 seconds
3600 out of  4268
18.87918448448181 seconds
3600 out of  4267
18.25723099708557 seconds
3600 out of  4268
18.06779670715332 seconds
3600 out of  4268
19.13007879257202 seconds
3600 out of  4267
19.247209787368774 seconds
3700 out of  4268
18.237897396087646 seconds
3600 out of  4268
19.00666904449463 seconds
3700 out of  4268
18.14180612564087 seconds
3700 out of  4268
17.779071807861328 seconds
3700 out of  4267
17.879739999771118 seconds
3700 out of  4268
18.183664560317993 seconds
3700 out of  4268
17.675490140914917 seconds
3700 out of  4267
17.962334632873535 seconds
3800 out of  4268
17.331755876541138 seconds
3700 out of  4268
18.59624218940735 seconds
3800 out of  4268
17.437517881393433 seconds
3800 out of  4268
17.481491327285767 seconds
3800 out of  4267
17.794371604919434 seconds
3800 out of  4268
17.646355390548706 seconds
3800 out of  4268




4000 out of  4268
17.43497061729431 seconds
3900 out of  4268
18.20366907119751 seconds
4000 out of  4268
17.7413432598114 seconds
4000 out of  4267
17.28535270690918 seconds
4000 out of  4268
17.603271007537842 seconds
4000 out of  4268
17.664872884750366 seconds
4000 out of  4267
18.417967319488525 seconds
4100 out of  4268
17.327133417129517 seconds
4100 out of  4268
17.465606212615967 seconds
4000 out of  4268
17.79119849205017 seconds
4100 out of  4268
18.178386449813843 seconds
4100 out of  4267
18.186551332473755 seconds
4100 out of  4268
17.45860266685486 seconds
4100 out of  4268
18.291747331619263 seconds
4100 out of  4267
18.45374870300293 seconds
4200 out of  4268
17.556281566619873 seconds
4200 out of  4268
18.342085599899292 seconds
4200 out of  4268
18.686500549316406 seconds
4100 out of  4268
22.04403042793274 seconds
4200 out of  4267
18.529147148132324 seconds
4200 out of  4268
18.982117891311646 seconds
4200 out of  4268
18.521490335464478 seconds
4200 out of  4267
1

In [None]:
# cols = {"id":"GUID", "name":"cleaned_twitter_name", "screen_name":"twitter_screen_name"}
# tw_sp = SehirParser('datasets/contacts.csv', "datasets/tw_users.csv", cols)

In [18]:
sehir_matches_df = pd.concat(sehir_matches)
sehir_matches_df.index = range(len(sehir_matches_df))
print("There are {} matches".format(len(sehir_matches_df)))
sehir_matches_df.head(5)

There are 2276 matches


Unnamed: 0,sehir_matches,twitter_screen_name
0,"[(cem kurtulus, 92)]",cemkurtulmus
1,"[(caner ozdemir, 92)]",AMASYABELBASKAN
2,"[(ebru gunes, 95)]",EbruGundes
3,"[(sumeyye zengin, 100)]",summeye
4,"[(ahmet kaya, 100), (ahmet akkaya, 91)]",ahmet_kayaa44


In [20]:
merged = sehir_matches_df.merge(tw, left_on='twitter_screen_name', right_on='twitter_screen_name')

In [21]:
merged[['GUID', 'twitter_screen_name', 'cleaned_twitter_name', 'sehir_matches']].tail(5)

Unnamed: 0,GUID,twitter_screen_name,cleaned_twitter_name,sehir_matches
2271,2313328404,zincirsiziz,kadir sahin,"[(kadir san, 90)]"
2272,2177009786,furkyasar,furkan yasar,"[(furkan yasar, 100)]"
2273,3179074006,ahmed80885,ahmed osman,"[(ahmed san, 90)]"
2274,932666414636453889,Ahmetaltanegeli,ahmet altan,"[(ahmet aslan, 91), ( ahmet tan, 90)]"
2275,324378478,KahramanGulcin,gulcin kahraman,"[(gulcin kayhan, 86)]"


In [22]:
merged.to_csv('datasets/sehir_matches.csv', index_label="GUID")

In [None]:
# merged.set_index('GUID.1').loc[291122559]

In [None]:
'{} twitter accounts have been matched successfully'.format(len(merged))

In [None]:
# sehir_matches_by_guid = merged.set_index('GUID')

In [None]:
number_of_matches, _ = merged.shape

network = Graph(directed=True)
network.add_vertex(number_of_matches)

vprop = network.new_vertex_property("string")

for vertex, twitter_screen_name in zip(network.vertices(), merged['twitter_screen_name'].values):
    vprop[vertex] = twitter_screen_name
    
network.vertex_properties["twitter_screen_name"] = vprop

for i, row in user_connections.iterrows():
    from_ = row['from_user_id']
    to = row['to_user_id']
    
    try:
        network.add_edge(find_vertex(network, vprop, sehir_matches_by_guid.loc[from_]['twitter_name']), 
                         find_vertex(network, vprop, sehir_matches_by_guid.loc[to]['twitter_name']))
    except Exception as e:
        pass

In [None]:
graph_draw(network, vertex_text=network.vertex_properties["twitter_screen_name"],
           vertex_font_size=40,
           output_size=(10000, 10000), output="sehir_subset.png")