In [1]:
import pandas as pd
import numpy as np
import os
os.chdir('/home/ankushraut/Downloads/assignment')
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('Deduplication Problem - Sample Dataset.csv')

In [3]:
data.head()

Unnamed: 0,ln,dob,gn,fn,is_duplicate
0,SMITH JR,01/03/68,F,WILLIAM,0
1,ROTHMEYER JR,01/03/68,F,WILLIAM,0
2,ASBY JR,01/03/68,F,WILLIAM,0
3,SALTER JR,01/03/68,F,WILLIAM,0
4,SALTER JR,01/03/68,F,WILLIAM,1


In [4]:
data.dob = pd.to_datetime(data.dob)

In [5]:
data.head()

Unnamed: 0,ln,dob,gn,fn,is_duplicate
0,SMITH JR,1968-01-03,F,WILLIAM,0
1,ROTHMEYER JR,1968-01-03,F,WILLIAM,0
2,ASBY JR,1968-01-03,F,WILLIAM,0
3,SALTER JR,1968-01-03,F,WILLIAM,0
4,SALTER JR,1968-01-03,F,WILLIAM,1


In [6]:
data.dob.head(10)

0   1968-01-03
1   1968-01-03
2   1968-01-03
3   1968-01-03
4   1968-01-03
5   2062-02-21
6   2062-02-21
7   2062-02-21
8   2062-02-21
9   2062-02-21
Name: dob, dtype: datetime64[ns]

In [7]:
data['name'] = data.fn + ' ' + data.ln

In [8]:
data.head()

Unnamed: 0,ln,dob,gn,fn,is_duplicate,name
0,SMITH JR,1968-01-03,F,WILLIAM,0,WILLIAM SMITH JR
1,ROTHMEYER JR,1968-01-03,F,WILLIAM,0,WILLIAM ROTHMEYER JR
2,ASBY JR,1968-01-03,F,WILLIAM,0,WILLIAM ASBY JR
3,SALTER JR,1968-01-03,F,WILLIAM,0,WILLIAM SALTER JR
4,SALTER JR,1968-01-03,F,WILLIAM,1,WILLIAM SALTER JR


In [9]:
unique_dob = data.dob.unique()
unique_sex = data.gn.unique()

In [10]:
import distance

In [11]:
def deduplication_model(data, scoring_range = 10, step = 2):
    data['indices'] = list(range(len(data)))
    accuracy = []
    index = []
    final_step = 0
    for value in range(scoring_range):
        for i in unique_dob:
            for j in unique_sex:
                sample = data[(data.dob == i)][(data.gn == j)].reset_index(drop = True)
                sample1 = sample[(sample.is_duplicate == 1)].reset_index(drop = True)
                comparison = sample[(sample.is_duplicate == 0)].reset_index(drop = True)
                for a in range(len(sample1)):
                    scores = [distance.levenshtein(sample1.name[a], comparison.name[x]) for x in range(len(comparison))]
                    try:
                        score = np.min(scores)
                    except ValueError:  #raised if `y` is empty.
                        pass
                    if score<=value:
                        index.append(sample1.indices[a])
        prediction = []
        for k in range(len(data)):
            if data.indices[k] in index:
                prediction.append(1)
            else:
                prediction.append(0)

        data['prediction'] = prediction
        print('Accuracy after ',value, 'iterations : ', accuracy_score(data.is_duplicate, data.prediction))
        accuracy.append(accuracy_score(data.is_duplicate, data.prediction))
        if len(accuracy)>1 and accuracy[-1] <= accuracy[-2]:
            final_step+=1
        if final_step>=step:    
            value = value-1
            break
    
    index = []
    for i in unique_dob:
        for j in unique_sex:
            sample = data[(data.dob == i)][(data.gn == j)].reset_index(drop = True)
            sample1 = sample[(sample.is_duplicate == 1)].reset_index(drop = True)
            comparison = sample[(sample.is_duplicate == 0)].reset_index(drop = True)
            for a in range(len(sample1)):
                scores = [distance.levenshtein(sample1.name[a], comparison.name[x]) for x in range(len(comparison))]
                try:
                    score = np.min(scores)
                except ValueError:  #raised if `y` is empty.
                    pass
                if score<=value:
                    index.append(sample1.indices[a])
    prediction = []
    for k in range(len(data)):
        if data.indices[k] in index:
            prediction.append(1)
        else:
            prediction.append(0)

    data['prediction'] = prediction
    return prediction, value

In [12]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size = 0.1, stratify = data.is_duplicate, random_state = 0)
train = train.reset_index(drop = True)
test = test.reset_index(drop = True)
performance, levenshtein_value_optimum = deduplication_model(train, scoring_range = 10, step = 2)

  if __name__ == '__main__':


Accuracy after  0 iterations :  0.858695652174
Accuracy after  1 iterations :  0.891304347826
Accuracy after  2 iterations :  0.923913043478
Accuracy after  3 iterations :  0.989130434783
Accuracy after  4 iterations :  1.0
Accuracy after  5 iterations :  1.0
Accuracy after  6 iterations :  1.0




In [13]:
def deduplication_prediction(data, optimum_value):
    data['indices'] = list(range(len(data)))
    index = []
    for i in unique_dob:
            for j in unique_sex:
                sample = data[(data.dob == i)][(data.gn == j)].reset_index(drop = True)
                sample1 = sample[(sample.is_duplicate == 1)].reset_index(drop = True)
                comparison = sample[(sample.is_duplicate == 0)].reset_index(drop = True)
                for a in range(len(sample1)):
                    scores = [distance.levenshtein(sample1.name[a], comparison.name[x]) for x in range(len(comparison))]
                    score = 0
                    try:
                        score = np.min(scores)
                    except ValueError:  #raised if `y` is empty.
                        pass
                    if score<=optimum_value:
                        index.append(sample1.indices[a])
            prediction = []
            for k in range(len(data)):
                if data.indices[k] in index:
                    prediction.append(1)
                else:
                    prediction.append(0)
    return prediction

In [14]:
predictions = deduplication_prediction(test, levenshtein_value_optimum)

  


In [15]:
print('Accuracy on test set:',accuracy_score(test.is_duplicate, predictions))

Accuracy on test set: 1.0


In [16]:
train['prediction'] = performance
test['prediction'] = predictions
dataset = pd.concat([train, test], axis = 0)
dataset = dataset[(dataset.prediction != 1)].reset_index(drop = True).drop(labels = ['name', 'is_duplicate', 'prediction', 'indices'], axis = 1)

In [17]:
dataset.to_csv('Deduplicated_dataset.csv', index = False)