In [1]:
from fuzzywuzzy import process
import pandas as pd, editdistance, numpy as np

In [17]:
def get_subname(fullname, index=0, lower=True):
    """ Pick one whitespace-separated part out of a full name.

        fullname: the complete name; surrounding whitespace is ignored
        index:    position of the wanted part (default: the first one)
        lower:    when True (default) the part is returned lower-cased

        Returns an empty string when the name has no part at `index`.

        e.g.
        get_subname("Ammar Adel Rashed", index=1)  ->  'adel'
    """
    parts = fullname.strip().split()
    try:
        picked = parts[index]
    except IndexError:
        # No part at this position: the answer is empty either way,
        # regardless of the `lower` flag.
        return ""
    if lower:
        return picked.lower()
    return picked

In [6]:
def get_names_dist(fullname, twitter_name):
    """ Return editdistance.eval applied to the word lists of two names.

        Each input is stripped, lower-cased and split on whitespace. The
        two resulting lists of words (not the raw character strings) are
        passed to editdistance.eval, and its result is returned as an int.
    """
    def _words(raw):
        # Same normalisation for both sides of the comparison.
        return raw.strip().lower().split()

    distance = editdistance.eval(_words(fullname), _words(twitter_name))
    return int(distance)

### Preparing mock data

In [18]:
# Read the sheet and keep only its first two columns, which are renamed
# to "fullname" and "email".
raw = pd.read_csv("ns.csv")
cols = raw.columns[:2]
name_col, email_col = cols[0], cols[1]
ren = {name_col: "fullname", email_col: "email"}
df = raw[cols].rename(columns=ren)
df.head(5)

Unnamed: 0,fullname,email
0,Aaditya Vijay,aadityavijay@std.sehir.edu.tr
1,Abbas Furkan Cangir,abbascangir@std.sehir.edu.tr
2,Abbas Kutay,abbaskutay@std.sehir.edu.tr
3,Abbas Samadi,abbassamadi@std.sehir.edu.tr
4,Abbass El Abbass,abbasselabbass@std.sehir.edu.tr


### Clearing out invalid rows (e.g. non-string names)

In [25]:
# Count the whitespace-separated parts of every name. Values that are not
# strings get the sentinel -1 so they can be filtered out below.
# The column is written onto `df` itself, so `df` keeps exposing
# "name_length" to any cell that reads it afterwards.
df["name_length"] = df["fullname"].apply(
    lambda x: len(x.strip().split()) if isinstance(x, str) else -1
)

# Keep only the rows whose name could be counted. A boolean mask is used
# instead of dropping by index label, so a repeated index label cannot take
# valid rows out together with an invalid one. Blank strings count as 0
# parts and are therefore kept. `.copy()` makes the result an independent
# frame.
filtered_df = df[df["name_length"] >= 0].copy()
filtered_df.head(5)

Unnamed: 0,fullname,email,name_length
0,Aaditya Vijay,aadityavijay@std.sehir.edu.tr,2
1,Abbas Furkan Cangir,abbascangir@std.sehir.edu.tr,3
2,Abbas Kutay,abbaskutay@std.sehir.edu.tr,2
3,Abbas Samadi,abbassamadi@std.sehir.edu.tr,2
4,Abbass El Abbass,abbasselabbass@std.sehir.edu.tr,3


### Trying out editdistance

In [44]:
# Work on a copy so the filtered frame stays untouched.
scored_df = filtered_df.copy()

# Score every row with get_names_dist(fullname, email). The row values are
# read by column label: integer keys on a string-labelled Series rely on a
# positional fallback that pandas has deprecated.
# NOTE(review): get_names_dist compares whitespace-separated words, and the
# e-mail values shown here contain no spaces, so in the displayed rows the
# score equals the number of words in the name - confirm this is the
# comparison that is wanted.
scored_df["names_dist"] = scored_df[["fullname", "email"]].apply(
    lambda row: get_names_dist(row["fullname"], row["email"]), axis=1
)
scored_df.head(5)

Unnamed: 0,fullname,email,name_length,names_dist
0,Aaditya Vijay,aadityavijay@std.sehir.edu.tr,2,2
1,Abbas Furkan Cangir,abbascangir@std.sehir.edu.tr,3,3
2,Abbas Kutay,abbaskutay@std.sehir.edu.tr,2,2
3,Abbas Samadi,abbassamadi@std.sehir.edu.tr,2,2
4,Abbass El Abbass,abbasselabbass@std.sehir.edu.tr,3,3


In [None]:
# Preview the frame instead of rendering every row.
df.head(5)