In [2]:
import re
import pandas as pd
import numpy as np
from collections import Counter
import string
from string import digits
import math
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from scipy import spatial
from numpy import dot
import matplotlib
import matplotlib.pyplot as plt
from numpy.linalg import norm
from sklearn.metrics import confusion_matrix
import jellyfish
import seaborn as sns
from collections import Counter
from metaphone import doublemetaphone
import enchant
import glob
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload 
%autoreload 2

In [3]:
# Reading the OFAC List
OFAC_list = pd.read_csv('https://www.treasury.gov/ofac/downloads/sdn.csv', header = None) # OFAC always uses this precise link
OFAC_list

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,36,AEROCARIBBEAN AIRLINES,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
1,173,"ANGLO-CARIBBEAN CO., LTD.",-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
2,306,BANCO NACIONAL DE CUBA,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,a.k.a. 'BNC'.
3,424,BOUTIQUE LA MAISON,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
4,475,CASA DE CUBA,-0-,CUBA,-0-,-0-,-0-,-0-,-0-,-0-,-0-,-0-
...,...,...,...,...,...,...,...,...,...,...,...,...
10151,37184,"KARDIAN, Ari",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 16 Feb 1990; POB Tasikmalaya, West Java, I..."
10152,37185,JOINT STOCK COMPANY NTV BROADCASTING COMPANY,-0-,RUSSIA-EO14024,-0-,-0-,-0-,-0-,-0-,-0-,-0-,Organization Established Date 1993; Tax ID No....
10153,37196,TELEVISION STATION RUSSIA-1,-0-,RUSSIA-EO14024,-0-,-0-,-0-,-0-,-0-,-0-,-0-,Organization Established Date 13 May 1991; Tar...
10154,37197,JOINT STOCK COMPANY CHANNEL ONE RUSSIA,-0-,RUSSIA-EO14024,-0-,-0-,-0-,-0-,-0-,-0-,-0-,Tax ID No. 7717039300 (Russia); Registration N...


In [4]:
# Filtering Individuals out of the OFAC List
df_individuals = OFAC_list[OFAC_list[2] == 'individual'].reset_index()
df_individuals.shape[0]

5244

In [5]:
# Printing the Individual Rows
df_individuals

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9,10,11
0,53,2674,"ABBAS, Abu",individual,SDGT,Director of PALESTINE LIBERATION FRONT - ABU A...,-0-,-0-,-0-,-0-,-0-,-0-,DOB 10 Dec 1948; Director of PALESTINE LIBERAT...
1,54,2675,"AL RAHMAN, Shaykh Umar Abd",individual,SDGT,Chief Ideological Figure of ISLAMIC GAMA'AT,-0-,-0-,-0-,-0-,-0-,-0-,DOB 03 May 1938; POB Egypt; Chief Ideological ...
2,55,2676,"AL ZAWAHIRI, Dr. Ayman",individual,SDGT,Operational and Military Leader of JIHAD GROUP,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 19 Jun 1951; POB Giza, Egypt; Passport 108..."
3,56,2677,"AL-ZOMOR, Abboud Abdul Latif Hassan",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio..."
4,57,2678,"AWDA, Abd Al Aziz",individual,SDGT,Chief Ideological Figure of PALESTINIAN ISLAMI...,-0-,-0-,-0-,-0-,-0-,-0-,DOB 1946; Chief Ideological Figure of PALESTIN...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5239,10147,37180,"ADHIGUNA, Muhammad Dandi",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 30 Jul 1996; POB Gresik, East Java, Indone..."
5240,10148,37181,"RAMADHANI, Dini",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 10 Mar 1993; nationality Indonesia; Gender...
5241,10149,37182,"SUSANTI, Dwi Dahlia",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 28 Jul 1976; nationality Indonesia; Gender...
5242,10150,37183,"HERYADI, Rudi",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 21 Sep 1973; POB Cirebon, West Java, Indon..."


In [6]:
# Changing column names to make them easier to work with: 
df_individuals = df_individuals.rename(columns = {1 : 'OFAC_Name', 2: 'Entity Type'})

In [7]:
# Checking the previous implementation
df_individuals

Unnamed: 0,index,0,OFAC_Name,Entity Type,3,4,5,6,7,8,9,10,11
0,53,2674,"ABBAS, Abu",individual,SDGT,Director of PALESTINE LIBERATION FRONT - ABU A...,-0-,-0-,-0-,-0-,-0-,-0-,DOB 10 Dec 1948; Director of PALESTINE LIBERAT...
1,54,2675,"AL RAHMAN, Shaykh Umar Abd",individual,SDGT,Chief Ideological Figure of ISLAMIC GAMA'AT,-0-,-0-,-0-,-0-,-0-,-0-,DOB 03 May 1938; POB Egypt; Chief Ideological ...
2,55,2676,"AL ZAWAHIRI, Dr. Ayman",individual,SDGT,Operational and Military Leader of JIHAD GROUP,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 19 Jun 1951; POB Giza, Egypt; Passport 108..."
3,56,2677,"AL-ZOMOR, Abboud Abdul Latif Hassan",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio..."
4,57,2678,"AWDA, Abd Al Aziz",individual,SDGT,Chief Ideological Figure of PALESTINIAN ISLAMI...,-0-,-0-,-0-,-0-,-0-,-0-,DOB 1946; Chief Ideological Figure of PALESTIN...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5239,10147,37180,"ADHIGUNA, Muhammad Dandi",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 30 Jul 1996; POB Gresik, East Java, Indone..."
5240,10148,37181,"RAMADHANI, Dini",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 10 Mar 1993; nationality Indonesia; Gender...
5241,10149,37182,"SUSANTI, Dwi Dahlia",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 28 Jul 1976; nationality Indonesia; Gender...
5242,10150,37183,"HERYADI, Rudi",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 21 Sep 1973; POB Cirebon, West Java, Indon..."


In [8]:
# Function to switch the Name Order from Last, First Middle --> First Middle Last
def first_name_before_last_name(Orignal_name):
    name_=''
    temp=''
    for i in Orignal_name.lower():
        if i == ',':
            temp=name_
            name_=''
        else:
            name_+=i
    return str.strip(name_+' '+temp)

In [9]:
# Puts Last name: Barge after First name: Neel and coverts the name to lowercase
first_name_before_last_name('Barge, Neel')

'neel barge'

In [10]:
# Keeps the name as is if it is already in the First Middle Last order
first_name_before_last_name('Neel Barge')

'neel barge'

In [11]:
# Adds a column 'OFAC_Name_First_Last' with OFAC Names in the order First Middle Last Name
df_individuals['OFAC_Name_First_Last']=''
for i in range(len(df_individuals['OFAC_Name'])):
    df_individuals['OFAC_Name_First_Last'].iloc[i] = first_name_before_last_name(df_individuals['OFAC_Name'][i])

In [12]:
df_individuals

Unnamed: 0,index,0,OFAC_Name,Entity Type,3,4,5,6,7,8,9,10,11,OFAC_Name_First_Last
0,53,2674,"ABBAS, Abu",individual,SDGT,Director of PALESTINE LIBERATION FRONT - ABU A...,-0-,-0-,-0-,-0-,-0-,-0-,DOB 10 Dec 1948; Director of PALESTINE LIBERAT...,abu abbas
1,54,2675,"AL RAHMAN, Shaykh Umar Abd",individual,SDGT,Chief Ideological Figure of ISLAMIC GAMA'AT,-0-,-0-,-0-,-0-,-0-,-0-,DOB 03 May 1938; POB Egypt; Chief Ideological ...,shaykh umar abd al rahman
2,55,2676,"AL ZAWAHIRI, Dr. Ayman",individual,SDGT,Operational and Military Leader of JIHAD GROUP,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 19 Jun 1951; POB Giza, Egypt; Passport 108...",dr. ayman al zawahiri
3,56,2677,"AL-ZOMOR, Abboud Abdul Latif Hassan",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio...",abboud abdul latif hassan al-zomor
4,57,2678,"AWDA, Abd Al Aziz",individual,SDGT,Chief Ideological Figure of PALESTINIAN ISLAMI...,-0-,-0-,-0-,-0-,-0-,-0-,DOB 1946; Chief Ideological Figure of PALESTIN...,abd al aziz awda
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5239,10147,37180,"ADHIGUNA, Muhammad Dandi",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 30 Jul 1996; POB Gresik, East Java, Indone...",muhammad dandi adhiguna
5240,10148,37181,"RAMADHANI, Dini",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 10 Mar 1993; nationality Indonesia; Gender...,dini ramadhani
5241,10149,37182,"SUSANTI, Dwi Dahlia",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 28 Jul 1976; nationality Indonesia; Gender...,dwi dahlia susanti
5242,10150,37183,"HERYADI, Rudi",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 21 Sep 1973; POB Cirebon, West Java, Indon...",rudi heryadi


In [13]:
# Removes digits from the Name Variations and makes a separate column for them
def remove_numbers(dataframe, column_name, digits_column):
    dataframe[f'{digits_column}'] = ''
    for i in range(len(dataframe)):
        for c in dataframe[f'{column_name}'][i]: 
            # Moving digits to a different column called Name_variation_digits:
            if c.isdigit():
                dataframe[f'{digits_column}'][i] += c
        # Removing digits from the original Name_variation column
        dataframe[f'{column_name}'][i] = dataframe[f'{column_name}'][i].translate(str.maketrans('', '', string.digits))

In [14]:
# Converts the string to lowercase, removes punctuations
def column_cleaning(dataframe, column_name, clean_column_name):
    dataframe[f'{clean_column_name}'] = ''
    for i in range(len(dataframe)):
        dataframe[f'{clean_column_name}'][i] = str(dataframe[f'{column_name}'][i]) \
            .lower() \
            .translate(str.maketrans('', '', string.punctuation)) \
            .replace("‘", '') \
            .replace('’', '')

In [15]:
# Adds a new column to the OFAC List with punctuations removed 
column_cleaning(df_individuals, 'OFAC_Name_First_Last', 'OFAC_Name_Clean')

In [16]:
# 'OFAC_Name_Clean' column added with punctuations removed
df_individuals

Unnamed: 0,index,0,OFAC_Name,Entity Type,3,4,5,6,7,8,9,10,11,OFAC_Name_First_Last,OFAC_Name_Clean
0,53,2674,"ABBAS, Abu",individual,SDGT,Director of PALESTINE LIBERATION FRONT - ABU A...,-0-,-0-,-0-,-0-,-0-,-0-,DOB 10 Dec 1948; Director of PALESTINE LIBERAT...,abu abbas,abu abbas
1,54,2675,"AL RAHMAN, Shaykh Umar Abd",individual,SDGT,Chief Ideological Figure of ISLAMIC GAMA'AT,-0-,-0-,-0-,-0-,-0-,-0-,DOB 03 May 1938; POB Egypt; Chief Ideological ...,shaykh umar abd al rahman,shaykh umar abd al rahman
2,55,2676,"AL ZAWAHIRI, Dr. Ayman",individual,SDGT,Operational and Military Leader of JIHAD GROUP,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 19 Jun 1951; POB Giza, Egypt; Passport 108...",dr. ayman al zawahiri,dr ayman al zawahiri
3,56,2677,"AL-ZOMOR, Abboud Abdul Latif Hassan",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 19 Apr 1947; POB Nahia, Giza, Egypt; natio...",abboud abdul latif hassan al-zomor,abboud abdul latif hassan alzomor
4,57,2678,"AWDA, Abd Al Aziz",individual,SDGT,Chief Ideological Figure of PALESTINIAN ISLAMI...,-0-,-0-,-0-,-0-,-0-,-0-,DOB 1946; Chief Ideological Figure of PALESTIN...,abd al aziz awda,abd al aziz awda
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5239,10147,37180,"ADHIGUNA, Muhammad Dandi",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 30 Jul 1996; POB Gresik, East Java, Indone...",muhammad dandi adhiguna,muhammad dandi adhiguna
5240,10148,37181,"RAMADHANI, Dini",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 10 Mar 1993; nationality Indonesia; Gender...,dini ramadhani,dini ramadhani
5241,10149,37182,"SUSANTI, Dwi Dahlia",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,DOB 28 Jul 1976; nationality Indonesia; Gender...,dwi dahlia susanti,dwi dahlia susanti
5242,10150,37183,"HERYADI, Rudi",individual,SDGT,-0-,-0-,-0-,-0-,-0-,-0-,-0-,"DOB 21 Sep 1973; POB Cirebon, West Java, Indon...",rudi heryadi,rudi heryadi


In [17]:
# Storing all the OFAC Names in the OFAC_List
# Storing the OFAC_Name_Clean in the OFAC_Name_Clean_List
OFAC_List=[]
OFAC_Name_Clean_List = []
for ofac_name, ofac_name_clean in zip(df_individuals['OFAC_Name'], df_individuals['OFAC_Name_Clean']):
    OFAC_List.append(ofac_name.lower())
    OFAC_Name_Clean_List.append(first_name_before_last_name(ofac_name))
print(len(OFAC_List))
print(len(OFAC_Name_Clean_List))
print(OFAC_List)
print(OFAC_Name_Clean_List)


5244
5244
['abbas, abu', 'al rahman, shaykh umar abd', 'al zawahiri, dr. ayman', 'al-zomor, abboud abdul latif hassan', 'awda, abd al aziz', 'fadlallah, shaykh muhammad husayn', 'hawatma, nayif', 'islambouli, mohammad shawqi', 'jabril, ahmad', 'naji, talal muhammad rashid', 'nasrallah, hasan', 'tufayli, subhi', 'yassin, sheik ahmed ismail', 'abu marzook, mousa mohammed', 'rodriguez orejuela, gilberto jose', 'rodriguez orejuela, miguel angel', 'abdallah, ramadan', 'nasser david, julio cesar', 'nasser arana, carlos alberto', 'nasser arana, claudia patricia', 'nasser arana, jorge', 'bin ladin, usama bin muhammad bin awad', 'al-masri, abu hafs', "musa, rifa'i ahmad taha", 'omar, mohammed', 'urdinola grajales, ivan', 'urdinola grajales, julio fabio', 'henao montoya, lorena', 'amezcua contreras, jose de jesus', 'amezcua contreras, luis ignacio', 'arellano felix, ramon eduardo', 'caro quintero, rafael', 'carrillo fuentes, vicente', 'chang, chi fu', 'heath, noel timothy', 'ogungbuyi, oluwole a

In [18]:
# Reading the provided Test Data
Test_Data = pd.read_excel('ABCNY Performance Metrics.xlsx', sheet_name=[1])

In [19]:
# Filtering out the Individual Names
Individuals = Test_Data[1][Test_Data[1]['Entity Type'] == 'Individual'].reset_index()

In [20]:
# Displaying the DataFrame
pd.set_option('display.max_rows', None)
Individuals.head()

Unnamed: 0,index,Rec. No,Theme,Category,Subcategory,Entity Type,List Name,Original List Name,Name Variation,ACCUITY ID,# of TPs,Volume
0,115,116,Combinations,Combinations,All given name initials with space between eac...,Individual,ANDREY KONSTANTINOVICH LUGOVOY,"LUGOVOY, ANDREY KONSTANTINOVICH",A. K. L U G O V O Y,AS00850517,1,7
1,116,117,Combinations,Combinations,1 special character removed and 2 words combined,Individual,MUHAMMAD ABD-AL-QADIR MUTNI ASSAF AL-RAWI,"AL-RAWI, MUHAMMAD ABD-AL-QADIR MUTNI ASSAF",MUHAMMAD ABDAL-QADIR MUTNIASSAF AL-RAWI,AS06223109,1,84
2,117,118,Combinations,Combinations,1 special character added and 1 letter replace...,Individual,ABD EL KADER MAHMOUD MOHAMED EL SAYED,"EL SAYED, ABD EL KADER MAHMOUD MOHAMED",ABD-EL KADER MAHM0UD MOHAMED EL SAYED,AS00022081,1,37
3,118,119,Combinations,Combinations,2 typos same word non adjacent and 1 letter re...,Individual,ABUBAKAR MOHAMMED SHEKAU,"SHEKAU, ABUBAKAR MOHAMMED",ABOBEKAR M#HAMMED SHEKAU,AS00728855,1,12
4,119,120,Combinations,Combinations,2 truncations and variation of 2 letters,Individual,AIMAN MUHAMMED RABI AL-ZAWAHIRI,"AL-ZAWAHIRI, AIMAN MUHAMMED RABI",AIMA MUHAMMAD RAB AL-JAWAHIRI,AS00020027,0,14


In [21]:
for i in range(len(Individuals['Original List Name'])):
    Individuals['Original List Name'].iloc[i] = Individuals['Original List Name'][i].lower()
    Individuals['List Name'].iloc[i] = str(Individuals['List Name'][i]).lower()

In [22]:
Individuals.head()

Unnamed: 0,index,Rec. No,Theme,Category,Subcategory,Entity Type,List Name,Original List Name,Name Variation,ACCUITY ID,# of TPs,Volume
0,115,116,Combinations,Combinations,All given name initials with space between eac...,Individual,andrey konstantinovich lugovoy,"lugovoy, andrey konstantinovich",A. K. L U G O V O Y,AS00850517,1,7
1,116,117,Combinations,Combinations,1 special character removed and 2 words combined,Individual,muhammad abd-al-qadir mutni assaf al-rawi,"al-rawi, muhammad abd-al-qadir mutni assaf",MUHAMMAD ABDAL-QADIR MUTNIASSAF AL-RAWI,AS06223109,1,84
2,117,118,Combinations,Combinations,1 special character added and 1 letter replace...,Individual,abd el kader mahmoud mohamed el sayed,"el sayed, abd el kader mahmoud mohamed",ABD-EL KADER MAHM0UD MOHAMED EL SAYED,AS00022081,1,37
3,118,119,Combinations,Combinations,2 typos same word non adjacent and 1 letter re...,Individual,abubakar mohammed shekau,"shekau, abubakar mohammed",ABOBEKAR M#HAMMED SHEKAU,AS00728855,1,12
4,119,120,Combinations,Combinations,2 truncations and variation of 2 letters,Individual,aiman muhammed rabi al-zawahiri,"al-zawahiri, aiman muhammed rabi",AIMA MUHAMMAD RAB AL-JAWAHIRI,AS00020027,0,14


In [23]:
elements=[] # Stores a tuple with Record Number, Original List Name and List Name for Individuals that are part of the OFAC List
nonelements=[] # Stores a tuple with Record Number, Original List Name and List Name for Individuals that are NOT part of the OFAC List
for recordno,oln,ln in zip(Individuals['Rec. No'],Individuals['Original List Name'],Individuals['List Name']):
    if oln in OFAC_List or ln in OFAC_Name_Clean_List or oln in OFAC_Name_Clean_List or ln in OFAC_List:
        elements.append((recordno,oln,ln,max(OFAC_List.count(oln),OFAC_List.count(ln),OFAC_Name_Clean_List.count(oln),OFAC_Name_Clean_List.count(ln))))
    else:
        nonelements.append((recordno,oln,ln,max(OFAC_List.count(oln),OFAC_List.count(ln),OFAC_Name_Clean_List.count(oln),OFAC_Name_Clean_List.count(ln))))
print(len(elements)) # Individual names part of the OFAC List - 641
print(len(nonelements))
print(nonelements) # Individual names not present in the OFAC List - 106

641
106
[(118, 'el sayed, abd el kader mahmoud mohamed', 'abd el kader mahmoud mohamed el sayed', 0), (119, 'shekau, abubakar mohammed', 'abubakar mohammed shekau', 0), (120, 'al-zawahiri, aiman muhammed rabi', 'aiman muhammed rabi al-zawahiri', 0), (123, 'al-maaroufi, tarek ben habib ben al-toumi', 'tarek ben habib ben al-toumi al-maaroufi', 0), (133, 'delos reyes jr., feliciano semborio', 'feliciano semborio delos reyes jr.', 0), (139, 'hameem m. abdu al-waly abdu al-azeez', 'hameem m. abdu al-waly abdu al-azeez', 0), (150, 'juarez de la roca, roberto eliud', 'juarez de la roca, roberto eliud', 0), (151, 'juarez de la roca, roberto eliud', 'juarez de la roca, roberto eliud', 0), (167, 'xiao liu', 'xiao liu', 0), (249, 'wali mohammad, abdul jalil haqqani', 'abdul jalil haqqani wali mohammad', 0), (251, 'bader, mohammed maher yousef', 'mohammed maher yousef bader', 0), (253, 'el sayed, abd el kader mahmoud mohamed', 'abd el kader mahmoud mohamed el sayed', 0), (255, 'al ajmi, hajjaj bi

In [24]:
# OFAC List Name -> Test Data Name

# shekau, abubakar -> SHEKAU, ABUBAKAR MOHAMMED
# es sayed, abdelkader mahmoud -> EL SAYED, ABD EL KADER MAHMOUD MOHAMED
# maaroufi, tarek ben habib ben al-toumishaqiri, xhezair --> AL-MAAROUFI, TAREK BEN HABIB BEN AL-TOUMI
# timchenko, gennady nikolayevich --> GENNADY TIMCHENKO
# Not present -> CUE ZAO
# rotenberg, roman borisovich -> ROTENBERG, ROMAN

if 'ROTENBERG, ROMAN'.lower() in df_individuals['OFAC_Name']:
    print(1)
else:
    print(0)
# As observed above 106 'Test Data names not found in the OFAC List' 
# are some variations with missing Middle Names, Missing letter or  Not At all present

0


In [25]:
print(len(Individuals[Individuals['# of TPs'] == 0]))
print(len(Individuals[Individuals['# of TPs'] == 1]))
print(len(Individuals[Individuals['# of TPs'] >= 2]))
print(85+648+14) # Total
# Count of Individual Names with 0 number of True Positives in the Test data = 85
# Count of Individual Names with 1 True Positive in the Test data = 648
# Count of Individual Names with more tahn 1 True Positive in the Test data = 14

85
648
14
747


In [26]:
# Adding two columns to Test_Data - # of TPs(Obtained), Volume(Obtained)
# Here, '# of TPs(Obtained)' indicates if the Original List Name or List Name actually exists in the OFAC List
#  1 - 'It Exists'
#  0 - 'It does NOT Exist'
# Volume(Obtained) - indicates the volume of hits for a particular Name Variation with Double Metaphone with Levenshtein
Individuals['# of TPs(Obtained)']=0
for i in elements:
    for k in range(len(Individuals['Rec. No'])):
        if i[0] == Individuals['Rec. No'][k]:
            Individuals['# of TPs(Obtained)'].iloc[k]=i[3]

In [27]:
print(len(Individuals[Individuals['# of TPs(Obtained)']>1]))
print(len(Individuals[Individuals['# of TPs(Obtained)'] == 1]))
print(len(Individuals[Individuals['# of TPs(Obtained)'] == 0]))

0
641
106


In [28]:
# Converting all the Name Variations to string type
for i in range(len(Individuals['Name Variation'])):
    Individuals['Name Variation'].iloc[i] = str(Individuals['Name Variation'][i])

In [29]:
# Removed all the Digits from the Name Variation
remove_numbers(Individuals, 'Name Variation', 'Digits Removed')

In [30]:
Individuals.tail(5)

Unnamed: 0,index,Rec. No,Theme,Category,Subcategory,Entity Type,List Name,Original List Name,Name Variation,ACCUITY ID,# of TPs,Volume,# of TPs(Obtained),Digits Removed
742,2581,2582,IDs,Digital Currency Address,With space and period,Individual,"potekhin, danil","potekhin, danil",.xFcCcEebfbeADEbe B,AS06306302,2,12,1,0736741522075538233798891
743,2582,2583,IDs,Digital Currency Address,With space and period,Individual,"andreyev, anton nikolaeyvich","andreyev, anton nikolaeyvich",X. ACCCDCEFEBFBDFCFC,AS06305898,1,7,1,085765056884496506291353
744,2583,2584,Names where name parts are Modified,Combined Name Split,Combined Name Split,Individual,"santos, ahmad (ahmed)","santos, ahmad (ahmed)","SANTOS, Ahmad",AS00229428,1,16,1,
745,2584,2585,Names where name parts are Modified,Combined Name Split,Combined Name Split,Individual,"santos, ahmad (ahmed)","santos, ahmad (ahmed)","SANTOS, Ahmad Ahmed",AS00229428,1,42,1,
746,2585,2586,Names where name parts are Modified,Combined Name Split,Combined Name Split,Individual,"santos, ahmad (ahmed)","santos, ahmad (ahmed)","Santos, Ahmed",AS00229428,1,18,1,


In [31]:
# For comparing the Name Variation, we would change name order to First Middle Last and change to lower case
Individuals['Name Variation First Last']=''
for i in range(len(Individuals['Name Variation'])):
    Individuals['Name Variation First Last'].iloc[i] = first_name_before_last_name(Individuals['Name Variation'][i])

In [32]:
# Removing the punctuations in the Name
column_cleaning(Individuals, 'Name Variation First Last', 'Name Variation Clean')

In [33]:
Individuals.head(10)

Unnamed: 0,index,Rec. No,Theme,Category,Subcategory,Entity Type,List Name,Original List Name,Name Variation,ACCUITY ID,# of TPs,Volume,# of TPs(Obtained),Digits Removed,Name Variation First Last,Name Variation Clean
0,115,116,Combinations,Combinations,All given name initials with space between eac...,Individual,andrey konstantinovich lugovoy,"lugovoy, andrey konstantinovich",A. K. L U G O V O Y,AS00850517,1,7,1,,a. k. l u g o v o y,a k l u g o v o y
1,116,117,Combinations,Combinations,1 special character removed and 2 words combined,Individual,muhammad abd-al-qadir mutni assaf al-rawi,"al-rawi, muhammad abd-al-qadir mutni assaf",MUHAMMAD ABDAL-QADIR MUTNIASSAF AL-RAWI,AS06223109,1,84,1,,muhammad abdal-qadir mutniassaf al-rawi,muhammad abdalqadir mutniassaf alrawi
2,117,118,Combinations,Combinations,1 special character added and 1 letter replace...,Individual,abd el kader mahmoud mohamed el sayed,"el sayed, abd el kader mahmoud mohamed",ABD-EL KADER MAHMUD MOHAMED EL SAYED,AS00022081,1,37,0,0.0,abd-el kader mahmud mohamed el sayed,abdel kader mahmud mohamed el sayed
3,118,119,Combinations,Combinations,2 typos same word non adjacent and 1 letter re...,Individual,abubakar mohammed shekau,"shekau, abubakar mohammed",ABOBEKAR M#HAMMED SHEKAU,AS00728855,1,12,0,,abobekar m#hammed shekau,abobekar mhammed shekau
4,119,120,Combinations,Combinations,2 truncations and variation of 2 letters,Individual,aiman muhammed rabi al-zawahiri,"al-zawahiri, aiman muhammed rabi",AIMA MUHAMMAD RAB AL-JAWAHIRI,AS00020027,0,14,0,,aima muhammad rab al-jawahiri,aima muhammad rab aljawahiri
5,120,121,Combinations,Combinations,"Truncation at the beginning, 1 compression",Individual,jairo humberto lopera barbosa,"lopera barbosa, jairo humberto",AIRO HUMBERTOLOPERA BARBOSA,AS00178815,1,15,1,,airo humbertolopera barbosa,airo humbertolopera barbosa
6,121,122,Combinations,Combinations,"Removal of special characters, compression, an...",Individual,muhammad hadi 'abd-al-rahman fayhan sharban al...,"al-_x0003_anzi, muhammad hadi _x0003_abd-al-ra...",AL MUHAMMADHADIABDALRAHMANFAYHAN SHARBAN ALANZI,AS00095739,1,8,1,,al muhammadhadiabdalrahmanfayhan sharban alanzi,al muhammadhadiabdalrahmanfayhan sharban alanzi
7,122,123,Combinations,Combinations,Compression and Name Part added,Individual,tarek ben habib ben al-toumi al-maaroufi,"al-maaroufi, tarek ben habib ben al-toumi",AL TAREKBEN HABIB BEN AL-TOUMI AL-MAAROUFI,AS00033155,1,11,0,,al tarekben habib ben al-toumi al-maaroufi,al tarekben habib ben altoumi almaaroufi
8,123,124,Combinations,Combinations,First / middle name swapped with 1 character r...,Individual,aleksey aleksandrovich kostrubitsky,"kostrubitsky, aleksey aleksandrovich",ALEKSANDROVICH ALEKSEY KOSTRUBITKY,AS00258040,1,8,1,,aleksandrovich aleksey kostrubitky,aleksandrovich aleksey kostrubitky
9,124,125,Combinations,Combinations,Typo and Name Part added,Individual,aleksey aleksandrovich dikiy,"dikiy, aleksey aleksandrovich",ALEKSEY VLADAMIR ALEKSANDROVICH DIKEY,AS00258045,1,7,1,,aleksey vladamir aleksandrovich dikey,aleksey vladamir aleksandrovich dikey


In [34]:
def doublemetaphone_with_leven_word_by_word(OFAC_list_Individuals, Test_data_sample, doublemataphone_Cutoff_List, Levenshtein_Cutoff):
#     dict={}
    Individuals['VOHs(DoubleMetaphone)'] = 0
    Individuals['TP predicted by doublemetaphone'] = 0
    for cut_off in doublemataphone_Cutoff_List:
        print('cut_off:',cut_off)
        Scored_dataframe=pd.DataFrame(columns=['Name_Variation_compared','OFAC_Name','Subcategory','# of TPs','Volume','# of TPs(Obtained)']) 
        for i in range(len(Test_data_sample)):
            print(' i:',i)
            count = 0
            TP = 0
            for j in range(len(OFAC_list_Individuals)):

                Ofac_name = OFAC_list_Individuals['OFAC_Name'][j]
                Ofac_name_clean = OFAC_list_Individuals['OFAC_Name_Clean'][j]
                Name_Variation = Test_data_sample['Name Variation Clean'][i]
                Orignal_list_name = Test_data_sample['Original List Name'][i]
                List_name = Test_data_sample['List Name'][i]
                Transformation = Test_data_sample['Subcategory'][i]
                No_of_TPs = Test_data_sample['# of TPs'][i]
                Volume = Test_data_sample['Volume'][i]
                No_of_TPs_obtained = Test_data_sample['# of TPs(Obtained)'][i]
                
                
            
                if doublemetaphone(Ofac_name_clean) == doublemetaphone(Name_Variation):
                    count+=1
                    if Ofac_name == Orignal_list_name or Ofac_name == List_name or Ofac_name_clean == Orignal_list_name or Ofac_name_clean == List_name:
                        TP+= 1
#                     if Orignal_Name_OFAC == Orignal_Name_Test:
#                         doublemetaphone_score = 1
                    Scored_dataframe.loc[len(Scored_dataframe.index)]=[Ofac_name, Name_Variation, Transformation, No_of_TPs, Volume, No_of_TPs_obtained]
                else:
                    original_list = Ofac_name_clean.split()
                    test_list = Name_Variation.split()
                    n_matches = 0
                    if ''.join(original_list).replace(' ','') == ''.join(test_list).replace(' ',''):
                        count+=1
                        if Ofac_name == Orignal_list_name or Ofac_name == List_name or Ofac_name_clean == Orignal_list_name or Ofac_name_clean == List_name:
                            TP+= 1
                        Scored_dataframe.loc[len(Scored_dataframe.index)]=[Ofac_name, Name_Variation, Transformation, No_of_TPs, Volume, No_of_TPs_obtained]
                    else:
                        for n in range(len(original_list)):
                            for m in range(len(test_list)):
                                if doublemetaphone(original_list[n])  == doublemetaphone(test_list[m]):
                                    n_matches += 1 
                        if n_matches / len(original_list) >= cut_off:
                            count+=1
                            if Ofac_name == Orignal_list_name or Ofac_name == List_name or Ofac_name_clean == Orignal_list_name or Ofac_name_clean == List_name:
                                TP+= 1
                            Scored_dataframe.loc[len(Scored_dataframe.index)]=[Ofac_name, Name_Variation, Transformation, No_of_TPs, Volume, No_of_TPs_obtained]
                        else:
                             if enchant.utils.levenshtein(Ofac_name_clean,Name_Variation) <= Levenshtein_Cutoff:
                                count+=1
                                if Ofac_name == Orignal_list_name or Ofac_name == List_name or Ofac_name_clean == Orignal_list_name or Ofac_name_clean == List_name:
                                    TP+= 1
                                Scored_dataframe.loc[len(Scored_dataframe.index)]=[Ofac_name, Name_Variation, Transformation, No_of_TPs, Volume, No_of_TPs_obtained]
            Individuals['VOHs(DoubleMetaphone)'].iloc[i] = count  
            Individuals['TP predicted by doublemetaphone'].iloc[i] = TP
#         dict[cut_off] = Scored_dataframe
    return Scored_dataframe

In [35]:
df = doublemetaphone_with_leven_word_by_word(df_individuals, Individuals, [0.5], 3)

cut_off: 0.5
 i: 0
 i: 1
 i: 2
 i: 3
 i: 4
 i: 5
 i: 6
 i: 7
 i: 8
 i: 9
 i: 10
 i: 11
 i: 12
 i: 13
 i: 14
 i: 15
 i: 16
 i: 17
 i: 18
 i: 19
 i: 20
 i: 21
 i: 22
 i: 23
 i: 24
 i: 25
 i: 26
 i: 27
 i: 28
 i: 29
 i: 30
 i: 31
 i: 32
 i: 33
 i: 34
 i: 35
 i: 36
 i: 37
 i: 38
 i: 39
 i: 40
 i: 41
 i: 42
 i: 43
 i: 44
 i: 45
 i: 46
 i: 47
 i: 48
 i: 49
 i: 50
 i: 51
 i: 52
 i: 53
 i: 54
 i: 55
 i: 56
 i: 57
 i: 58
 i: 59
 i: 60
 i: 61
 i: 62
 i: 63
 i: 64
 i: 65
 i: 66
 i: 67
 i: 68
 i: 69
 i: 70
 i: 71
 i: 72
 i: 73
 i: 74
 i: 75
 i: 76
 i: 77
 i: 78
 i: 79
 i: 80
 i: 81
 i: 82
 i: 83
 i: 84
 i: 85
 i: 86
 i: 87
 i: 88
 i: 89
 i: 90
 i: 91
 i: 92
 i: 93
 i: 94
 i: 95
 i: 96
 i: 97
 i: 98
 i: 99
 i: 100
 i: 101
 i: 102
 i: 103
 i: 104
 i: 105
 i: 106
 i: 107
 i: 108
 i: 109
 i: 110
 i: 111
 i: 112
 i: 113
 i: 114
 i: 115
 i: 116
 i: 117
 i: 118
 i: 119
 i: 120
 i: 121
 i: 122
 i: 123
 i: 124
 i: 125
 i: 126
 i: 127
 i: 128
 i: 129
 i: 130
 i: 131
 i: 132
 i: 133
 i: 134
 i: 135
 i: 136
 

In [36]:
print(Individuals['VOHs(DoubleMetaphone)'].sum()) # VOHs: 24284
print(Individuals['Volume'].sum())
print(Individuals.head())
Individuals['TP predicted by doublemetaphone'].sum() # TPs catured by DoubleMetaphone: 417
TP_percent = Individuals['TP predicted by doublemetaphone'].sum()/Individuals['# of TPs(Obtained)'].sum()
TP_percent # TP%(Double Metaphone): 65.05%

24284
11807
   index  Rec. No         Theme      Category  \
0    115      116  Combinations  Combinations   
1    116      117  Combinations  Combinations   
2    117      118  Combinations  Combinations   
3    118      119  Combinations  Combinations   
4    119      120  Combinations  Combinations   

                                         Subcategory Entity Type  \
0  All given name initials with space between eac...  Individual   
1   1 special character removed and 2 words combined  Individual   
2  1 special character added and 1 letter replace...  Individual   
3  2 typos same word non adjacent and 1 letter re...  Individual   
4           2 truncations and variation of 2 letters  Individual   

                                   List Name  \
0             andrey konstantinovich lugovoy   
1  muhammad abd-al-qadir mutni assaf al-rawi   
2      abd el kader mahmoud mohamed el sayed   
3                   abubakar mohammed shekau   
4            aiman muhammed rabi al-zawahiri

0.6505460218408736

# Soundex(cutoff:0.5) with Levenshtein Distance(6)

In [37]:
def soundex_with_leven_word_by_word(OFAC_list_Individuals, Test_data_sample, doublemataphone_Cutoff_List, Levenshtein_Cutoff):

    Individuals['VOHs(Soundex)'] = 0
    Individuals['TP predicted by soundex'] = 0
    
    for cut_off in doublemataphone_Cutoff_List:
        print('cut_off:',cut_off)
        Scored_dataframe=pd.DataFrame(columns=['Name_Variation_compared','OFAC_Name','Subcategory','# of TPs','Volume','# of TPs(Obtained)']) 
        for i in range(len(Test_data_sample)):
            print(' i:',i)
            count = 0
            TP = 0
            for j in range(len(OFAC_list_Individuals)):

                Ofac_name = OFAC_list_Individuals['OFAC_Name'][j]
                Ofac_name_clean = OFAC_list_Individuals['OFAC_Name_Clean'][j]
                Name_Variation = Test_data_sample['Name Variation Clean'][i]
                Orignal_list_name = Test_data_sample['Original List Name'][i]
                List_name = Test_data_sample['List Name'][i]
                Transformation = Test_data_sample['Subcategory'][i]
                No_of_TPs = Test_data_sample['# of TPs'][i]
                Volume = Test_data_sample['Volume'][i]
                No_of_TPs_obtained = Test_data_sample['# of TPs(Obtained)'][i]
                
                
            
                if (jellyfish.soundex(Ofac_name_clean) == jellyfish.soundex(Name_Variation)):
                    count+=1
                    if Ofac_name == Orignal_list_name or Ofac_name == List_name or Ofac_name_clean == Orignal_list_name or Ofac_name_clean == List_name:
                        TP+= 1
#                     if Orignal_Name_OFAC == Orignal_Name_Test:
#                         doublemetaphone_score = 1
                    Scored_dataframe.loc[len(Scored_dataframe.index)]=[Ofac_name, Name_Variation, Transformation, No_of_TPs, Volume, No_of_TPs_obtained]
                else:
                    original_list = Ofac_name_clean.split()
                    test_list = Name_Variation.split()
                    n_matches = 0
                    if ''.join(original_list).replace(' ','') == ''.join(test_list).replace(' ',''):
                        count+=1
                        if Ofac_name == Orignal_list_name or Ofac_name == List_name or Ofac_name_clean == Orignal_list_name or Ofac_name_clean == List_name:
                            TP+= 1
                        Scored_dataframe.loc[len(Scored_dataframe.index)]=[Ofac_name, Name_Variation, Transformation, No_of_TPs, Volume, No_of_TPs_obtained]
                    else:
                        for n in range(len(original_list)):
                            for m in range(len(test_list)):
                                if jellyfish.soundex(original_list[n])  == jellyfish.soundex(test_list[m]):
                                    n_matches += 1 
                        if n_matches / len(original_list) >= cut_off:
                            count+=1
                            if Ofac_name == Orignal_list_name or Ofac_name == List_name or Ofac_name_clean == Orignal_list_name or Ofac_name_clean == List_name:
                                TP+= 1
                            Scored_dataframe.loc[len(Scored_dataframe.index)]=[Ofac_name, Name_Variation, Transformation, No_of_TPs, Volume, No_of_TPs_obtained]
                        else:
                             if enchant.utils.levenshtein(Ofac_name_clean,Name_Variation) <= Levenshtein_Cutoff:
                                count+=1
                                if Ofac_name == Orignal_list_name or Ofac_name == List_name or Ofac_name_clean == Orignal_list_name or Ofac_name_clean == List_name:
                                    TP+= 1
                                Scored_dataframe.loc[len(Scored_dataframe.index)]=[Ofac_name, Name_Variation, Transformation, No_of_TPs, Volume, No_of_TPs_obtained]
            Individuals['VOHs(Soundex)'].iloc[i] = count  
            Individuals['TP predicted by soundex'].iloc[i] = TP

    return Scored_dataframe

In [38]:
df2 = soundex_with_leven_word_by_word(df_individuals, Individuals, [0.5], 6)

cut_off: 0.5
 i: 0
 i: 1
 i: 2
 i: 3
 i: 4
 i: 5
 i: 6
 i: 7
 i: 8
 i: 9
 i: 10
 i: 11
 i: 12
 i: 13
 i: 14
 i: 15
 i: 16
 i: 17
 i: 18
 i: 19
 i: 20
 i: 21
 i: 22
 i: 23
 i: 24
 i: 25
 i: 26
 i: 27
 i: 28
 i: 29
 i: 30
 i: 31
 i: 32
 i: 33
 i: 34
 i: 35
 i: 36
 i: 37
 i: 38
 i: 39
 i: 40
 i: 41
 i: 42
 i: 43
 i: 44
 i: 45
 i: 46
 i: 47
 i: 48
 i: 49
 i: 50
 i: 51
 i: 52
 i: 53
 i: 54
 i: 55
 i: 56
 i: 57
 i: 58
 i: 59
 i: 60
 i: 61
 i: 62
 i: 63
 i: 64
 i: 65
 i: 66
 i: 67
 i: 68
 i: 69
 i: 70
 i: 71
 i: 72
 i: 73
 i: 74
 i: 75
 i: 76
 i: 77
 i: 78
 i: 79
 i: 80
 i: 81
 i: 82
 i: 83
 i: 84
 i: 85
 i: 86
 i: 87
 i: 88
 i: 89
 i: 90
 i: 91
 i: 92
 i: 93
 i: 94
 i: 95
 i: 96
 i: 97
 i: 98
 i: 99
 i: 100
 i: 101
 i: 102
 i: 103
 i: 104
 i: 105
 i: 106
 i: 107
 i: 108
 i: 109
 i: 110
 i: 111
 i: 112
 i: 113
 i: 114
 i: 115
 i: 116
 i: 117
 i: 118
 i: 119
 i: 120
 i: 121
 i: 122
 i: 123
 i: 124
 i: 125
 i: 126
 i: 127
 i: 128
 i: 129
 i: 130
 i: 131
 i: 132
 i: 133
 i: 134
 i: 135
 i: 136
 

In [39]:
print(Individuals['VOHs(Soundex)'].sum()) # VOHs: 42968
print(Individuals['Volume'].sum()) # VOHs by ABCNY: 11807
print(Individuals.head())
print(Individuals['TP predicted by soundex'].sum()) # TPs captured by Soundex: 423
TP_percent = Individuals['TP predicted by soundex'].sum()/Individuals['# of TPs(Obtained)'].sum() 
print(TP_percent) # TP%(Soundex) = 65.99%

42968
11807
   index  Rec. No         Theme      Category  \
0    115      116  Combinations  Combinations   
1    116      117  Combinations  Combinations   
2    117      118  Combinations  Combinations   
3    118      119  Combinations  Combinations   
4    119      120  Combinations  Combinations   

                                         Subcategory Entity Type  \
0  All given name initials with space between eac...  Individual   
1   1 special character removed and 2 words combined  Individual   
2  1 special character added and 1 letter replace...  Individual   
3  2 typos same word non adjacent and 1 letter re...  Individual   
4           2 truncations and variation of 2 letters  Individual   

                                   List Name  \
0             andrey konstantinovich lugovoy   
1  muhammad abd-al-qadir mutni assaf al-rawi   
2      abd el kader mahmoud mohamed el sayed   
3                   abubakar mohammed shekau   
4            aiman muhammed rabi al-zawahiri

# Cosine Similarity(cutofff:0.5) with Levenshtein Distance(cutoff1:15, cutoff2:2)

In [40]:
true_pos_cos = []
voh_cos = []

In [41]:
# Cosine Similarity
def cosine_similarity(Test_data_sample, OFAC_list_Individuals, Cutoff, Lev_cutoff_1, Lev_cutoff_2):
    
    Individuals['VOHs(Cosine)'] = 0
    Individuals['TP predicted by Cosine'] = 0
    
    df_scores = pd.DataFrame(columns = {'row_score','OFAC_Name','Original List Name','List Name','Name Variation', 'TP predicted by Cosine','Transformation'})
    scores = []
    og_name = []
    test_name = []
    actual = []
    original_list_name_test = []
    list_name_test = []
    transformations = []
    
    list_var = Test_data_sample['Name Variation Clean'].to_list() # Name Variation Clean List
    list_name = OFAC_list_Individuals['OFAC_Name_Clean'].to_list() # OFAC Name Clean List

    # Converting strings to vectors to calculate cosine similarity score
    vectorizer = CountVectorizer()

    vectorizer.fit(list_name)
    
    vector1 = vectorizer.transform(list_var).toarray()
    vector2 = vectorizer.transform(list_name).toarray()

    for i in range(len(vector1)):## Test Variation
        count=0
        TP=0
        for j in range(len(vector2)):## OFAC List 
            
            row_score = dot(vector1[i], vector2[j])/(norm(vector1[i]) * norm(vector2[j]))
            ofac_name = OFAC_list_Individuals['OFAC_Name_Clean'][j]
            test_name_row = Test_data_sample['Name Variation Clean'][i]
            orignal_list_name = Test_data_sample['Original List Name'][i]
            list_name = Test_data_sample['List Name'][i] 
            transform = Test_data_sample['Subcategory'][i]
            lev_dist = jellyfish.levenshtein_distance(str(Test_data_sample['Name Variation Clean'][i]),str(OFAC_list_Individuals['OFAC_Name_Clean'][j]))
            row_actual = 0
            
            
            if row_score >= Cutoff:
                if lev_dist <= Lev_cutoff_1:
                    count+=1
                    if ofac_name == orignal_list_name or ofac_name == list_name:
                        row_actual = 1
                        TP+=1
                    
                    scores.append(row_score) # Cosine Score
                    og_name.append(ofac_name) # OFAC Name
                    test_name.append(test_name_row) # Name Variation Clean
                    actual.append(row_actual) # TP captured by Cosine and Levenshtein: 1 or 0
                    original_list_name_test.append(orignal_list_name) # Original List Name
                    list_name_test.append(list_name) # List Name 
                    transformations.append(transform) # Subcategory
                    
            else:
                if lev_dist <= Lev_cutoff_2:
                    count+=1
                    if ofac_name == orignal_list_name or ofac_name == list_name:
                        row_actual = 1
                        TP+=1
                                 
                    scores.append(row_score) # Cosine Score
                    og_name.append(ofac_name) # OFAC Name
                    test_name.append(test_name_row) # Name Variation Clean
                    actual.append(row_actual) # TP captured by Levenshtein: 1 or 0
                    original_list_name_test.append(orignal_list_name) # Original List Name
                    list_name_test.append(list_name) # List Name 
                    transformations.append(transform) # Subcategory
                
        Individuals['VOHs(Cosine)'].iloc[i] = count
        Individuals['TP predicted by Cosine'].iloc[i] = TP
        
    df_scores['row_score'] = scores
    df_scores['OFAC_Name'] = og_name
    df_scores['Original List Name'] = original_list_name_test
    df_scores['List Name'] = list_name_test
    df_scores['Name Variation'] = test_name
    df_scores['TP predicted by Cosine'] = actual
    df_scores['Transformation'] = transformations
    
    true_positive = sum(df_scores['TP predicted by Cosine'])
    true_pos_cos.append(true_positive)
    voh_cos.append(df_scores.shape[0])

#     df_scores.to_csv('df'+ str(Cutoff) + '.csv')
    
    return df_scores

In [42]:
df3 = cosine_similarity(Individuals,df_individuals,0.5,15,2)

In [43]:
print(true_pos_cos) # True Positives captured by cosine similarity with levenshtein: 384
print(voh_cos) # Total Volume of Hits: 1807

[384]
[1807]


In [44]:
TP_percent = true_pos_cos[0]/Individuals['# of TPs(Obtained)'].sum()*100
TP_percent # TP%(Cosine) = 59.90%

59.90639625585024

In [45]:
Individuals['# of TPs'].value_counts()

1    648
0     85
2     14
Name: # of TPs, dtype: int64

In [60]:
# Getting Count of Actual TPs per Theme
Test_data_theme = Individuals[Individuals['# of TPs(Obtained)'] != 0]['Theme'].value_counts().reset_index()
Test_data_theme

Unnamed: 0,index,Theme
0,Names where Name Parts are Modified,211
1,Special Characters and Spaces,100
2,Name Additions,68
3,Name Part Variations,56
4,Modified Order,54
5,Combinations,45
6,Name Deletions,38
7,Different Name Parts,28
8,IDs,27
9,Positive Control,6


In [61]:
new_df=Individuals[Individuals['# of TPs(Obtained)'] != 0]
# Considering only 641 Individuals as Actual True Positives

In [91]:
new_df.head()

Unnamed: 0,index,Rec. No,Theme,Category,Subcategory,Entity Type,List Name,Original List Name,Name Variation,ACCUITY ID,...,# of TPs(Obtained),Digits Removed,Name Variation First Last,Name Variation Clean,VOHs(DoubleMetaphone),TP predicted by doublemetaphone,VOHs(Soundex),TP predicted by soundex,VOHs(Cosine),TP predicted by Cosine
0,115,116,Combinations,Combinations,All given name initials with space between eac...,Individual,andrey konstantinovich lugovoy,"lugovoy, andrey konstantinovich",A. K. L U G O V O Y,AS00850517,...,1,,a. k. l u g o v o y,a k l u g o v o y,103,0,68,0,0,0
1,116,117,Combinations,Combinations,1 special character removed and 2 words combined,Individual,muhammad abd-al-qadir mutni assaf al-rawi,"al-rawi, muhammad abd-al-qadir mutni assaf",MUHAMMAD ABDAL-QADIR MUTNIASSAF AL-RAWI,AS06223109,...,1,,muhammad abdal-qadir mutniassaf al-rawi,muhammad abdalqadir mutniassaf alrawi,101,0,235,0,1,0
5,120,121,Combinations,Combinations,"Truncation at the beginning, 1 compression",Individual,jairo humberto lopera barbosa,"lopera barbosa, jairo humberto",AIRO HUMBERTOLOPERA BARBOSA,AS00178815,...,1,,airo humbertolopera barbosa,airo humbertolopera barbosa,5,1,4,1,2,1
6,121,122,Combinations,Combinations,"Removal of special characters, compression, an...",Individual,muhammad hadi 'abd-al-rahman fayhan sharban al...,"al-_x0003_anzi, muhammad hadi _x0003_abd-al-ra...",AL MUHAMMADHADIABDALRAHMANFAYHAN SHARBAN ALANZI,AS00095739,...,1,,al muhammadhadiabdalrahmanfayhan sharban alanzi,al muhammadhadiabdalrahmanfayhan sharban alanzi,58,0,81,0,0,0
8,123,124,Combinations,Combinations,First / middle name swapped with 1 character r...,Individual,aleksey aleksandrovich kostrubitsky,"kostrubitsky, aleksey aleksandrovich",ALEKSANDROVICH ALEKSEY KOSTRUBITKY,AS00258040,...,1,,aleksandrovich aleksey kostrubitky,aleksandrovich aleksey kostrubitky,12,1,149,1,0,0


In [69]:
# Match_doublemetaphone stores the Count of Matches(TP predicted by doubleMetaphone) per Theme for Double Metaphone algorithm
# Similarly, Missed_doublemetaphone stores the Count of Name Variation missed by Double Metaphone algorithm
Match_doublemetaphone = new_df[new_df['TP predicted by doublemetaphone'] != 0].Theme.value_counts().to_frame().reset_index()
Match_soundex = new_df[new_df['TP predicted by soundex'] != 0].Theme.value_counts().to_frame().reset_index()
Match_cosine = new_df[new_df['TP predicted by Cosine'] != 0].Theme.value_counts().to_frame().reset_index()
Missed_doublemetaphone = new_df[new_df['TP predicted by doublemetaphone'] == 0].Theme.value_counts().to_frame().reset_index()
Missed_soundex = new_df[new_df['TP predicted by soundex'] == 0].Theme.value_counts().to_frame().reset_index()
Missed_cosine = new_df[new_df['TP predicted by Cosine'] == 0].Theme.value_counts().to_frame().reset_index()

In [99]:
# Theme_VOHs_doubleMetaphone stores the count of total Volume of Hits generated under each Theme by 
# Double Metaphone algorithm for all 741 Name Variations
Theme_VOHs_doubleMetaphone = Individuals.groupby(by=['Theme'])['VOHs(DoubleMetaphone)'].sum().to_frame().reset_index()
Theme_VOHs_soundex = Individuals.groupby(by=['Theme'])['VOHs(Soundex)'].sum().to_frame().reset_index()
Theme_VOHs_cosine = Individuals.groupby(by=['Theme'])['VOHs(Cosine)'].sum().to_frame().reset_index()

In [103]:
Theme_VOHs_cosine

Unnamed: 0,Theme,VOHs(Cosine)
0,Combinations,165
1,Different Name Parts,96
2,Foreign Languages,0
3,IDs,0
4,Modified Order,130
5,Name Additions,108
6,Name Deletions,51
7,Name Part Variations,279
8,Names where Name Parts are Modified,679
9,Names where name parts are Modified,3


In [110]:
# Merges the Volume of Hits per theme for all the Algorithms in a single dataframe
MergedVOHs = pd.merge(pd.merge(Theme_VOHs_doubleMetaphone, Theme_VOHs_soundex, on='Theme', how='left'),Theme_VOHs_cosine, on='Theme', how='left')

In [108]:
# Merges the TP count per theme for all the Algorithms in a single dataframe 
MergedTP = pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(pd.merge(Test_data_theme, Match_doublemetaphone, on='index', how='left'),Missed_doublemetaphone, on='index',how='left'),Match_soundex,on='index', how='left'),Missed_soundex, on='index',how='left'), Match_cosine, on='index',how='left'), Missed_cosine, on='index',how='left')

In [112]:
# Naming the columns for MergeTP
MergedTP.columns=['Theme','# of TP(Obtained)','Matched(DoubleMetaphone)','Missed(DoubleMetaphone)','Matched(Soundex)','Missed(Soundex)','Matched(Cosine)','Missed(Cosine)']

In [122]:
# Finding TP% under each Theme for all Algorithms
for i in range(len(MergedTP)):
    MergedTP.loc[i, 'TP%(DoubleMetaphone)'] = round(MergedTP['Matched(DoubleMetaphone)'][i]/MergedTP['# of TP(Obtained)'][i]*100,2)
    MergedTP.loc[i, 'TP%(Soundex)'] = round(MergedTP['Matched(Soundex)'][i]/MergedTP['# of TP(Obtained)'][i]*100,2)
    MergedTP.loc[i, 'TP%(Cosine)'] = round(MergedTP['Matched(Cosine)'][i]/MergedTP['# of TP(Obtained)'][i]*100, 2)

In [124]:
# Merging the Volume of Hits and TP% dataframes
Metrics = pd.merge(MergedVOHs, MergedTP, on='Theme',how='left')

In [155]:
Metrics = Metrics.reindex(columns=['Theme','# of TP(Obtained)','Matched(DoubleMetaphone)','Missed(DoubleMetaphone)','TP%(DoubleMetaphone)','VOHs(DoubleMetaphone)','Matched(Soundex)','Missed(Soundex)','TP%(Soundex)','VOHs(Soundex)','Matched(Cosine)','Missed(Cosine)','TP%(Cosine)','VOHs(Cosine)'])

In [156]:
Metrics

Unnamed: 0,Theme,# of TP(Obtained),Matched(DoubleMetaphone),Missed(DoubleMetaphone),TP%(DoubleMetaphone),VOHs(DoubleMetaphone),Matched(Soundex),Missed(Soundex),TP%(Soundex),VOHs(Soundex),Matched(Cosine),Missed(Cosine),TP%(Cosine),VOHs(Cosine)
0,Combinations,45,21.0,24,46.67,2520,23.0,22,51.11,4153,18.0,27,40.0,165
1,Different Name Parts,28,11.0,17,39.29,2983,11.0,17,39.29,4384,11.0,17,39.29,96
2,Foreign Languages,2,,2,,0,,2,,12,,2,,0
3,IDs,27,,27,,54,,27,,149,,27,,0
4,Modified Order,54,44.0,10,81.48,2245,44.0,10,81.48,3858,37.0,17,68.52,130
5,Name Additions,68,55.0,13,80.88,3831,55.0,13,80.88,5962,48.0,20,70.59,108
6,Name Deletions,38,23.0,15,60.53,2363,23.0,15,60.53,4471,22.0,16,57.89,51
7,Name Part Variations,56,45.0,11,80.36,2023,45.0,11,80.36,3247,48.0,8,85.71,279
8,Names where Name Parts are Modified,211,162.0,49,76.78,4944,162.0,49,76.78,10502,156.0,55,73.93,679
9,Names where name parts are Modified,3,,3,,273,,3,,302,,3,,3


In [157]:
Metrics.to_excel('Metrics.xlsx')

In [153]:
Individuals.to_excel('Individuals.xlsx')