In [1]:
import numpy as np
import pandas as pd
import nltk
from fuzzywuzzy import fuzz
from nltk.metrics import jaccard_distance



<h4>Functions</h4>

In [2]:
#function that uses fuzzy matching against two lists
#takes two arguments: df1.columnName.tolist() as l1 and df2.columnName.tolist() as l2
#This function does two things. First, it print the current word in list 1 and then all the words its being matched to, along with the fuzzy ratio score
#Second, it stores each score in an array and returns the array 

def matchWithFuzzy(l1, l2):
    score = []
    for i in l1:
        print("\nWord in list 1 {} matches:".format(i))
        for j in l2:
            score.append(fuzz.ratio(i, j))
            print("{} with a score of {}".format(j,fuzz.ratio(i,j)))
    
    return score

In [3]:
#Function that creates a matrix of fuzzy scores
#l1 and l2 are the lists
#nrows will be the number of rows you will need in the matrix
#ncols will be the number of columns you will need


#First try to incorporate pandas and numpy; the final output should be a pandas dataframe that is a matrix. If the code is still too slow, then break up the dna dataset
#into three 20000x60000 have three seperate jupyters running, using this function for each third of the data(dna), and get 
#put write.csv into function(will have three csvs, which will need to combined)

#try to get a frequency distribution of the matches


def matrix_score(l1, l2, nrows, ncols):
    #creates a matrix of zeros with the dimensions specified from the parameters
    #So if you had a list that had 5 elements in it, your matrix would be a 5x5 matrix; nrows would be 5 and ncols would be 5
    matrix = np.zeros(shape = (nrows, ncols))
    
    #initialize row and col values to 0
    row = 0
    col = 0
    
    for i in l1:
        if row % (len(l1)//10) == 0:       
            print("progress: %.2f" % (100*row/len(l1)) + "%")
        for j in l2:
            score = fuzz.ratio(i,j)
            matrix[row][col] = score
            col+=1
        col = 0
        row+=1
    
    return matrix 

In [4]:
#Isabels function
def matchWithFuzzyNames(l1, l2, original):
    match = pd.DataFrame(columns = ['original company names', 'clean company name', 'company matches', 'fuzz ratio'])
    for i in l1:
        matches = []
        score = []
        score_partial = []
        for j in l2:
            if (fuzz.ratio(i,j)) == 100:
                matches.append(j)
                score.append(fuzz.ratio(i,j))
                score_partial.append((fuzz.partial_ratio(i,j)))
        match = match.append({ 'clean company name': i,'company matches': matches, 'fuzz ratio': score, 'fuzz partial ratio': score_partial},ignore_index=True)
        
    match['original company names'] = original
        

 


    return match

In [5]:
#This function takes the matrix and list of cleaned company names as arguments and returns a dataframe representing the martrix passed
#ls should be the cleaned company names of the second company you passed into matrix_score
#so if you were checking the matrix score of matrix_score(x.clean.tolist(), y.clean.tolist(), 10, 5), then you would use convertMatrixToDf(matrix, y.clean.tolist())
def convertMatrixToDf(matrix, ls):
    #In order for the code to work, the matrix passed must first be converted from float to int
    matrix = matrix.astype(int)
    #Then the matrix needs to be converted to str
    matrix = matrix.astype(str)
    cleaned_matrix = [[int(item.split()[0]) for item in row] for row in matrix]
    matrix_scores = pd.DataFrame(cleaned_matrix, columns = ls)
    
    #returns the matrix as a dataframe, with type int
    return matrix_scores.astype(int)

In [6]:
#This function works just like pd.read_csv(), but where pd.read_csv() adds numbers to duplicated column names, this function doesn't
#path is the file path of the csv you want to open
def readCsv(path):
    df = pd.read_csv(path, index_col = [0])
    df.columns = df.columns.str.split(".").str[0]
    return df

<h3>NDC Matrix</h3>

In [7]:
ndc = pd.read_csv("../data/working/ndc_clean.csv", index_col = [0])

In [8]:
ndc.head()

Unnamed: 0_level_0,original_company,cleaned_name
row,Unnamed: 1_level_1,Unnamed: 2_level_1
0,SPIRONOLACTONE 2%,spironolactone
1,-L'Oreal USA Products Inc,-loreal
2,.Cardinal Health,cardinal
3,.Church & Dwight Canada Corp,church&dwight canada
4,{Preferred Pharmaeutials INC.,preferred pharmaeutials


In [9]:
ndc.dropna(inplace = True)
#ndc.reset_index(inplace = True)

In [30]:
new_original_company = ndc['original_company']

In [31]:
new_original_company.tail(10)

6707                                Zoono USA
6708                           ZOONO USA, LLC
6709                          ZRG DETOX, INC.
6710    Zunyi Jici Bio-Health Products Co Ltd
6711         Zydus Pharmaceuticals (USA) Inc.
6712            Zydus Pharmaceuticals USA Inc
6713           Zydus Pharmaceuticals USA Inc.
6714               Zydus Technologies Limited
6715                                   ZYGONE
6716               Zyla Life Sciences US Inc.
Name: original_company, dtype: object

In [58]:
#del ndc['row']

In [10]:
ndc.tail()

Unnamed: 0_level_0,original_company,cleaned_name
row,Unnamed: 1_level_1,Unnamed: 2_level_1
7018,Zydus Pharmaceuticals USA Inc,zydus
7019,Zydus Pharmaceuticals USA Inc.,zydus
7020,Zydus Technologies Limited,zydus
7021,ZYGONE,zygone
7022,Zyla Life Sciences US Inc.,zyla life sciences


In [11]:
ndc.columns

Index(['original_company', 'cleaned_name'], dtype='object')

In [11]:
len(ndc)

6717

In [12]:
matrix = matrix_score(ndc.cleaned_name.tolist(), ndc.cleaned_name.tolist(), 6717, 6717)

progress: 0.00%
progress: 9.99%
progress: 19.98%
progress: 29.97%
progress: 39.96%
progress: 49.95%
progress: 59.94%
progress: 69.93%
progress: 79.92%


KeyboardInterrupt: 

In [13]:
matrix

array([[100.,  19.,  27., ...,  11.,  30.,  31.],
       [ 29., 100.,  40., ...,   0.,  31.,  16.],
       [ 18.,  40., 100., ...,  15.,  14.,  23.],
       ...,
       [ 11.,   0.,  15., ..., 100.,  36.,  26.],
       [ 30.,  31.,  14., ...,  36., 100.,  33.],
       [ 19.,  24.,   8., ...,  26.,  25., 100.]])

In [14]:
matrixDf = convertMatrixToDf(matrix, ndc['cleaned_name'].tolist())

In [15]:
matrixDf.head()

Unnamed: 0,spironolactone,-loreal,cardinal,church&dwight canada,preferred pharmaeutials,†wal-mart stores†,veterans,medco,st class,st medx,...,zoono,zoono.1,zrg detox,zunyi jici bio-health,zydus,zydus.1,zydus.2,zydus.3,zygone,zyla life sciences
0,100,19,27,18,5,26,9,11,27,29,...,32,32,26,23,11,11,11,11,30,31
1,29,100,40,15,27,33,27,17,27,14,...,17,17,12,29,0,0,0,0,31,16
2,18,40,100,29,32,16,25,15,25,13,...,15,15,24,28,15,15,15,15,14,23
3,18,15,29,100,19,16,21,8,29,22,...,8,8,21,15,8,8,8,8,15,11
4,16,27,32,28,100,15,32,14,19,13,...,0,0,19,14,21,21,21,21,7,24


In [16]:
matrixDf.to_csv("../data/working/matrix_ndc.csv")

In [32]:
#Running Isabels code
ndc_matching = matchWithFuzzyNames(ndc['cleaned_name'].tolist(), ndc['cleaned_name'].tolist(), new_original_company)

In [33]:
#cleaned      #all the matches       #row number
#zee          [zee,zee]              4,90


#then match ndc clean companies against fda 
ndc_matching.tail(50)

Unnamed: 0,original company names,clean company name,company matches,fuzz ratio,fuzz partial ratio
6667,Zapne LLC,zapne,[zapne],[100],[100]
6668,Zapotol Corp.,zapotol,[zapotol],[100],[100]
6669,ZBM Natural Medicine Ltd.,zbm natural medicine,[zbm natural medicine],[100],[100]
6670,Zee Medical,zee,"[zee, zee]","[100, 100]","[100, 100]"
6671,Zee Medical Inc,zee,"[zee, zee]","[100, 100]","[100, 100]"
6672,ZELENS Limited,zelens,[zelens],[100],[100]
6673,Zen Enterprises LLC,zen enterprises,[zen enterprises],[100],[100]
6674,"Zeniton Co.,Ltd.",zeniton,[zeniton],[100],[100]
6675,"Zenkosmetikos Co., Ltd.",zenkosmetikos,[zenkosmetikos],[100],[100]
6676,Zenpia,zenpia,[zenpia],[100],[100]


<h3>FDA Matrix</h3>

In [36]:
fda = pd.read_csv("../data/working/fda_clean.csv", index_col = [0])

In [37]:
fda.columns

Index(['FDA Companies ', 'Company Clean'], dtype='object')

In [40]:
fda.dropna(inplace = True)
fda.reset_index(inplace = True)
new_fda_original = fda['FDA Companies ']

In [41]:
fda.shape

(973, 4)

In [49]:
fda_matrix = matrix_score(fda['Company Clean'].tolist(), fda['Company Clean'].tolist(), 973, 973)

progress: 0.00%
progress: 9.95%
progress: 19.90%
progress: 29.85%
progress: 39.79%
progress: 49.74%
progress: 59.69%
progress: 69.64%
progress: 79.59%
progress: 89.54%
progress: 99.49%


In [None]:
fda_matrix

In [51]:
fda_matrix_df = convertMatrixToDf(fda_matrix, fda['Company Clean'].tolist())

In [52]:
fda_matrix_df.head()

Unnamed: 0,d imaging drug,m,m drug delivery,aaipharma,abbott,abbvie,abbvie endocrine,abbvie endocrine.1,abbvie.1,abhai,...,yabao,yaopharma,yung shin,zambon,zevacor,zo skin,zydus,zydus.1,zydus.2,zydus worldwide
0,100,13,41,26,10,10,20,20,10,11,...,11,17,26,20,19,29,21,21,21,28
1,13,100,12,20,0,0,0,0,0,0,...,0,20,0,29,0,0,0,0,0,0
2,41,12,100,8,0,10,26,26,10,10,...,10,8,33,10,9,18,20,20,20,27
3,26,20,17,100,13,27,24,24,27,43,...,29,78,11,27,25,12,0,0,0,8
4,10,0,0,13,100,50,36,36,50,36,...,55,27,0,50,31,15,0,0,0,10


In [43]:
x = matchWithFuzzyNames(fda['Company Clean'].tolist(), fda['Company Clean'].tolist(), new_fda_original)

In [44]:
x.tail(50)

Unnamed: 0,original company names,clean company name,company matches,fuzz ratio,fuzz partial ratio
923,VIFOR FRESENIUS,vifor fresenius,[vifor fresenius],[100],[100]
924,VIIV HLTHCARE,viiv,[viiv],[100],[100]
925,VINTAGE,vintage,"[vintage, vintage, vintage]","[100, 100, 100]","[100, 100, 100]"
926,VINTAGE PHARMS,vintage,"[vintage, vintage, vintage]","[100, 100, 100]","[100, 100, 100]"
927,VINTAGE PHARMS LLC,vintage,"[vintage, vintage, vintage]","[100, 100, 100]","[100, 100, 100]"
928,VIRTUS PHARM,virtus,"[virtus, virtus]","[100, 100]","[100, 100]"
929,VIRTUS PHARMS,virtus,"[virtus, virtus]","[100, 100]","[100, 100]"
930,VISTAPHARM,vistapharm,[vistapharm],[100],[100]
931,VIVA HLTHCARE,viva,[viva],[100],[100]
932,VIVIMED GLOBAL,vivimed global,[vivimed global],[100],[100]


<h3>DNA Matrix subset</h3>

In [85]:
dna = pd.read_csv("../data/working/dna_clean.csv", index_col = [0])

In [86]:
dna.shape

(64005, 4)

In [87]:
dna.fillna("-",inplace = True)

In [88]:
del dna['Unnamed: 0.1']

In [89]:
del dna['Code']

In [92]:
dna

Unnamed: 0,Description,cleaned_companies
0,AA PLC,aa
1,"Emperial Americas, Inc.",emperial americas
2,"American Academy of Allergy, Asthma and Immuno...",academy allergy asthma immunology
3,Bird Studies Canada,bird studies canada
4,Aesculap AG & Co. KG,aesculap&co
...,...,...
64000,Boost Brent Oil 3x Leverage Daily ETP,boost brent oil x leverage daily etp
64001,Boost Brent Oil 3x Short Daily ETP,boost brent oil x short daily etp
64002,Nuveen High Income November 2021 Target Term Fund,nuveen high income november target term fund
64003,SPDR SSGA Gender Diversity Index ETF,spdr ssga gender diversity index etf


In [129]:
first= dna.iloc[0:4267]
second = dna.iloc[4267:8534]
third = dna.iloc[8534:12801]
fourth = dna.iloc[12801:17068]
fifth = dna.iloc[17068:21335]
sixth = dna.iloc[21335:25602]
seventh = dna.iloc[25602:29869]
eighth = dna.iloc[29869:34136]
ninth = dna.iloc[34136:38403]
tenth = dna.iloc[38403:42670]
eleventh = dna.iloc[42670:46937]
twelth = dna.iloc[46937:51204]
thirteenth = dna.iloc[51204:55471]
fourteenth = dna.iloc[55471:59738]
fifteenth = dna.iloc[59738:]

In [115]:
matrix_dna_1 = matrix_score(first['cleaned_companies'].tolist(), first['cleaned_companies'].tolist(), 4267, 64005)

progress: 0.00%
progress: 9.98%
progress: 19.97%
progress: 29.95%
progress: 39.93%
progress: 49.92%
progress: 59.90%
progress: 69.89%
progress: 79.87%
progress: 89.85%
progress: 99.84%


In [117]:
x = convertMatrixToDf(matrix_dna_1, dna['cleaned_companies'].tolist())

In [156]:
x.to_csv("../data/working/dna_matrix_1.csv")

KeyboardInterrupt: 

21318

In [72]:
#3553
partOne = dna_subset[0:3553]
partTwo = dna_subset[3553:7016]
partThree = dna_subset[7016:10659]
partFour = dna_subset[10659:14212]
partFive = dna_subset[14212:17765]
partSix = dna_subset[17765:21319]

In [73]:
partSix.tail()

Unnamed: 0,Unnamed: 0.1,Code,Description,cleaned_companies,row
42631,70581,OLOG,Bristow Group Inc.,bristow,21313
42632,70584,OLONII,Olon Industries Inc,olon industries,21314
42633,70586,OLOUNZ,eSolutions Inc,esolutions,21315
42634,70587,OLPAEC,Oliver Packaging and Equipment Company,oliver packaging equipment,21316
42635,70588,OLPART,Olympus Partners,olympus,21317


In [81]:
dna_subset_matrix = matrix_score(partOne['cleaned_companies'].tolist(), dna['cleaned_companies'].tolist(), 3553,63954)

KeyboardInterrupt: 

In [49]:
dna_subset_matrix

NameError: name 'dna_subset_matrix' is not defined