In [2]:
import numpy as np
import pandas as pd
import nltk
from fuzzywuzzy import fuzz
from nltk.metrics import jaccard_distance
from numpy import save



<h3>Functions</h33>

In [2]:
#function that uses fuzzy matching against two lists
#takes two arguments: df1.columnName.tolist() as l1 and df2.columnName.tolist() as l2
#This function does two things. First, it print the current word in list 1 and then all the words its being matched to, along with the fuzzy ratio score
#Second, it stores each score in an array and returns the array 

def matchWithFuzzy(l1, l2):
    score = []
    for i in l1:
        print("\nWord in list 1 {} matches:".format(i))
        for j in l2:
            score.append(fuzz.ratio(i, j))
            print("{} with a score of {}".format(j,fuzz.ratio(i,j)))
    
    return score

In [3]:
#Function that creates a matrix of fuzzy scores
#l1 and l2 are the lists
#nrows will be the number of rows you will need in the matrix
#ncols will be the number of columns you will need


#First try to incorporate pandas and numpy; the final output should be a pandas dataframe that is a matrix. If the code is still too slow, then break up the dna dataset
#into three 20000x60000 have three seperate jupyters running, using this function for each third of the data(dna), and get 
#put write.csv into function(will have three csvs, which will need to combined)

#try to get a frequency distribution of the matches


def matrix_score(l1, l2, file_name):
    #creates a matrix of zeros with the dimensions specified from the parameters
    #So if you had a list that had 5 elements in it, your matrix would be a 5x5 matrix; nrows would be 5 and ncols would be 5
    #matrix = np.zeros(shape = (nrows, ncols))
    matrix = np.zeros(shape = (len(l1), len(l2)))
    
    #initialize row and col values to 0
    row = 0
    col = 0
    
    for i in l1:
        if row % (len(l1)//10) == 0:       
            print("progress: %.2f" % (100*row/len(l1)) + "%")
        for j in l2:
            score = fuzz.ratio(i,j)
            matrix[row][col] = score
            col+=1
        col = 0
        row+=1
    
    save(file_name, matrix)
    
    return matrix 

In [4]:
#Isabels function
def matchWithFuzzyNames(l1, l2, original):
    match = pd.DataFrame(columns = ['original company names', 'clean company name', 'company matches', 'fuzz ratio'])
    for i in l1:
        matches = []
        score = []
        score_partial = []
        for j in l2:
            if (fuzz.ratio(i,j)) == 100:
                matches.append(j)
                score.append(fuzz.ratio(i,j))
                score_partial.append((fuzz.partial_ratio(i,j)))
        match = match.append({ 'clean company name': i,'company matches': matches, 'fuzz ratio': score, 'fuzz partial ratio': score_partial},ignore_index=True)
        
    match['original company names'] = original
        

 


    return match

In [5]:
#This function takes the matrix and list of cleaned company names as arguments and returns a dataframe representing the martrix passed
#ls should be the cleaned company names of the second company you passed into matrix_score
#so if you were checking the matrix score of matrix_score(x.clean.tolist(), y.clean.tolist(), 10, 5), then you would use convertMatrixToDf(matrix, y.clean.tolist())
def convertMatrixToDf(matrix, ls):
    #In order for the code to work, the matrix passed must first be converted from float to int
    matrix = matrix.astype(int)
    #Then the matrix needs to be converted to str
    matrix = matrix.astype(str)
    cleaned_matrix = [[int(item.split()[0]) for item in row] for row in matrix]
    matrix_scores = pd.DataFrame(cleaned_matrix, columns = ls)
    
    #returns the matrix as a dataframe, with type int
    return matrix_scores.astype(int)

In [6]:
#This function works just like pd.read_csv(), but where pd.read_csv() adds numbers to duplicated column names, this function doesn't
#path is the file path of the csv you want to open
def readCsv(path):
    df = pd.read_csv(path, index_col = [0])
    df.columns = df.columns.str.split(".").str[0]
    return df

<h2>NDC Matching</h2>

In [3]:
ndc = pd.read_csv("../data/working/ndc_clean.csv", index_col = [0])

In [6]:
ndc.tail()

Unnamed: 0_level_0,original_company,originalRow,cleaned_name
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7018,Zydus Pharmaceuticals USA Inc,7043,zydus
7019,Zydus Pharmaceuticals USA Inc.,7044,zydus
7020,Zydus Technologies Limited,7045,zydus
7021,ZYGONE,7046,zygone
7022,Zyla Life Sciences US Inc.,7047,zyla life sciences


In [8]:
ndc.head()

Unnamed: 0_level_0,original_company,originalRow,cleaned_name
row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,SPIRONOLACTONE 2%,25,spironolactone
1,-L'Oreal USA Products Inc,26,-loreal
2,.Cardinal Health,27,cardinal
3,.Church & Dwight Canada Corp,28,church&dwight canada
4,{Preferred Pharmaeutials INC.,29,preferred pharmaeutials


In [9]:
#After cleaning, some companies were removed entirely. This left some na's that need to be dealt with
ndc.dropna(inplace = True)
#reseting the index will renumber the new rows after dropping the index
ndc.reset_index(inplace = True)

In [10]:
#assings the original uncleaned names from ndc to new_original_company
new_original_company = ndc['original_company']

In [20]:
#Creates an 6717x6717 identity matrix of ndc against itself and stores it in matrix
matrix = matrix_score(ndc.cleaned_name.tolist(), ndc.cleaned_name.tolist(), 6717, 6717)

progress: 0.00%
progress: 9.99%
progress: 19.98%
progress: 29.97%
progress: 39.96%
progress: 49.95%
progress: 59.94%
progress: 69.93%
progress: 79.92%
progress: 89.91%
progress: 99.90%


In [24]:
#Viewing the ndc matrix
matrix

array([[100.,  19.,  27., ...,  11.,  30.,  31.],
       [ 29., 100.,  40., ...,   0.,  31.,  16.],
       [ 18.,  40., 100., ...,  15.,  14.,  23.],
       ...,
       [ 11.,   0.,  15., ..., 100.,  36.,  26.],
       [ 30.,  31.,  14., ...,  36., 100.,  33.],
       [ 19.,  24.,   8., ...,  26.,  25., 100.]])

In [39]:
#Takes the matrix above and converts it to a csv, where the names of each cleaned company are the column headers
matrixDf = convertMatrixToDf(matrix, ndc['cleaned_name'].tolist())

In [41]:
#Viewing the last 5 rows of the matrixDf
matrixDf.tail()

Unnamed: 0,spironolactone,-loreal,cardinal,church&dwight canada,preferred pharmaeutials,†wal-mart stores†,veterans,medco,st class,st medx,...,zoono,zoono.1,zrg detox,zunyi jici bio-health,zydus,zydus.1,zydus.2,zydus.3,zygone,zyla life sciences
6712,11,0,15,8,21,9,15,20,15,17,...,20,20,29,15,100,100,100,100,36,26
6713,11,0,15,8,21,9,15,20,15,17,...,20,20,29,15,100,100,100,100,36,26
6714,11,0,15,8,21,9,15,20,15,17,...,20,20,29,15,100,100,100,100,36,26
6715,30,31,14,15,7,17,14,18,0,15,...,55,55,40,30,36,36,36,36,100,33
6716,19,24,8,11,20,34,15,17,31,16,...,17,17,22,36,26,26,26,26,25,100


In [42]:
#writing the matrix dataframe to working
matrixDf.to_csv("../data/working/matrix_ndc.csv")

In [11]:
#Running Isabels code
#The output will be a dataframe with the original company name, the cleaned company name, all of the cleaned company name's corresponding matches, 
#the fuzz ratio, and the partial fuzz ratio for ndc companies

ndc_matching = matchWithFuzzyNames(ndc['cleaned_name'].tolist(), ndc['cleaned_name'].tolist(), new_original_company)

In [12]:
ndc_matching.head()

Unnamed: 0,original company names,clean company name,company matches,fuzz ratio,fuzz partial ratio
0,SPIRONOLACTONE 2%,spironolactone,[spironolactone],[100],[100]
1,-L'Oreal USA Products Inc,-loreal,[-loreal],[100],[100]
2,.Cardinal Health,cardinal,"[cardinal, cardinal, cardinal, cardinal, cardi...","[100, 100, 100, 100, 100, 100, 100, 100, 100, ...","[100, 100, 100, 100, 100, 100, 100, 100, 100, ..."
3,.Church & Dwight Canada Corp,church&dwight canada,"[church&dwight canada, church&dwight canada, c...","[100, 100, 100]","[100, 100, 100]"
4,{Preferred Pharmaeutials INC.,preferred pharmaeutials,[preferred pharmaeutials],[100],[100]


<h2>FDA Matching</h2>

In [13]:
fda = pd.read_csv("../data/working/fda_clean.csv", index_col = [0])

In [14]:
fda.dropna(inplace = True)
fda.reset_index(inplace = True)
new_fda_original = fda['FDA Companies ']

In [15]:
fda.shape

(973, 3)

In [49]:
fda_matrix = matrix_score(fda['Company Clean'].tolist(), fda['Company Clean'].tolist(), 973, 973)

progress: 0.00%
progress: 9.95%
progress: 19.90%
progress: 29.85%
progress: 39.79%
progress: 49.74%
progress: 59.69%
progress: 69.64%
progress: 79.59%
progress: 89.54%
progress: 99.49%


In [51]:
fda_matrix_df = convertMatrixToDf(fda_matrix, fda['Company Clean'].tolist())

In [52]:
fda_matrix_df.head()

Unnamed: 0,d imaging drug,m,m drug delivery,aaipharma,abbott,abbvie,abbvie endocrine,abbvie endocrine.1,abbvie.1,abhai,...,yabao,yaopharma,yung shin,zambon,zevacor,zo skin,zydus,zydus.1,zydus.2,zydus worldwide
0,100,13,41,26,10,10,20,20,10,11,...,11,17,26,20,19,29,21,21,21,28
1,13,100,12,20,0,0,0,0,0,0,...,0,20,0,29,0,0,0,0,0,0
2,41,12,100,8,0,10,26,26,10,10,...,10,8,33,10,9,18,20,20,20,27
3,26,20,17,100,13,27,24,24,27,43,...,29,78,11,27,25,12,0,0,0,8
4,10,0,0,13,100,50,36,36,50,36,...,55,27,0,50,31,15,0,0,0,10


In [16]:
matchFDA = matchWithFuzzyNames(fda['Company Clean'].tolist(), fda['Company Clean'].tolist(), new_fda_original)

In [17]:
matchFDA.tail(50)

Unnamed: 0,original company names,clean company name,company matches,fuzz ratio,fuzz partial ratio
923,VIFOR FRESENIUS,vifor fresenius,[vifor fresenius],[100],[100]
924,VIIV HLTHCARE,viiv,[viiv],[100],[100]
925,VINTAGE,vintage,"[vintage, vintage, vintage]","[100, 100, 100]","[100, 100, 100]"
926,VINTAGE PHARMS,vintage,"[vintage, vintage, vintage]","[100, 100, 100]","[100, 100, 100]"
927,VINTAGE PHARMS LLC,vintage,"[vintage, vintage, vintage]","[100, 100, 100]","[100, 100, 100]"
928,VIRTUS PHARM,virtus,"[virtus, virtus]","[100, 100]","[100, 100]"
929,VIRTUS PHARMS,virtus,"[virtus, virtus]","[100, 100]","[100, 100]"
930,VISTAPHARM,vistapharm,[vistapharm],[100],[100]
931,VIVA HLTHCARE,viva,[viva],[100],[100]
932,VIVIMED GLOBAL,vivimed global,[vivimed global],[100],[100]


<h3>DNA Matrix subset (rows 21318:42636)</h3>

In [7]:
dna = pd.read_csv("../data/working/dna_clean.csv", index_col = [0])

In [8]:
dna.shape

(64005, 4)

In [9]:
dna.dropna(inplace = True)

In [10]:
del dna['Unnamed: 0.1']

In [11]:
del dna['Code']

In [12]:
dna

Unnamed: 0,Description,cleaned_companies
0,AA PLC,aa
1,"Emperial Americas, Inc.",emperial americas
2,"American Academy of Allergy, Asthma and Immuno...",academy allergy asthma immunology
3,Bird Studies Canada,bird studies canada
4,Aesculap AG & Co. KG,aesculap&co
...,...,...
64000,Boost Brent Oil 3x Leverage Daily ETP,boost brent oil x leverage daily etp
64001,Boost Brent Oil 3x Short Daily ETP,boost brent oil x short daily etp
64002,Nuveen High Income November 2021 Target Term Fund,nuveen high income november target term fund
64003,SPDR SSGA Gender Diversity Index ETF,spdr ssga gender diversity index etf


In [13]:
subset = dna.iloc[21318:42636]

In [14]:
subset.tail()

Unnamed: 0,Description,cleaned_companies
42673,NanoSphere Health Sciences Inc.,nanosphere sciences
42674,Omnicomm Systems Inc,omnicomm
42675,OmniAmerican Credit Union,omniamerican credit union
42676,1Malaysia Development Bhd,malaysia development
42677,OMD Chicago Inc.,omd chicago


In [15]:
#Here I am just creating a new column numbered 0 to 21318; Now the index goes from 0 to 21318 and I can split this in half (i.e 0-10659, 10659:)
subset['row'] = np.arange(0, len(subset))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
#break subset into halves
first_half = subset.iloc[0:10659]
second_half = subset.iloc[10659:]

In [17]:
#My portion ends at OMD Chicago Inc. Isabel's subset picks up with the next company 
second_half.tail()

Unnamed: 0,Description,cleaned_companies,row
42673,NanoSphere Health Sciences Inc.,nanosphere sciences,21313
42674,Omnicomm Systems Inc,omnicomm,21314
42675,OmniAmerican Credit Union,omniamerican credit union,21315
42676,1Malaysia Development Bhd,malaysia development,21316
42677,OMD Chicago Inc.,omd chicago,21317


In [18]:
second_half_matrix = matrix_score(second_half['cleaned_companies'].tolist(), dna['cleaned_companies'].tolist(), '../data/working/dnaMatrix2b')

progress: 0.00%
progress: 19.98%
progress: 29.97%
progress: 39.97%
progress: 49.96%
progress: 59.95%
progress: 69.94%
progress: 79.93%
progress: 89.92%


<h4>FDA x NDC</h4>

In [18]:
#Getting unique clean companies from ndc_matching (see ndc matrix section above to see where this variable came from)
ndc_unique_clean = ndc_matching['clean company name']
#drops any duplicates so that the result is a series of unique companies that matched 100%
ndc_unique_clean.drop_duplicates(inplace = True)
#convert series to a df
ndc_unique_clean = pd.DataFrame(ndc_unique_clean)
#renaming column
ndc_unique_clean = ndc_unique_clean.rename(columns = {"clean company name": "ndc clean name"})
#reset index so numbers are in order
ndc_unique_clean.reset_index(inplace = True)
#delete index column
del ndc_unique_clean['index']

In [19]:
ndc_unique_clean

Unnamed: 0,ndc clean name
0,spironolactone
1,-loreal
2,cardinal
3,church&dwight canada
4,preferred pharmaeutials
...,...
5204,zrg detox
5205,zunyi jici bio-health
5206,zydus
5207,zygone


In [20]:
#Getting unique clean companies from fda (see fda matrix section above to see where this variable came from)
fda_unique_clean = matchFDA['clean company name']
fda_unique_clean.drop_duplicates(inplace = True)
fda_unique_clean = pd.DataFrame(fda_unique_clean)
fda_unique_clean = fda_unique_clean.rename(columns = {"clean company name": "fda clean name"})
fda_unique_clean.reset_index(inplace = True)
del fda_unique_clean['index']

In [21]:
fda_unique_clean

Unnamed: 0,fda clean name
0,d imaging drug
1,m
2,m drug delivery
3,aaipharma
4,abbott
...,...
791,zambon
792,zevacor
793,zo skin
794,zydus


In [94]:
#creates a 796 x 5209 matrix
fdaAgainstndc = matrix_score(fda_unique_clean['fda clean name'].tolist(), ndc_unique_clean['ndc clean name'].tolist(), 796, 5209)

progress: 0.00%
progress: 9.92%
progress: 19.85%
progress: 29.77%
progress: 39.70%
progress: 49.62%
progress: 59.55%
progress: 69.47%
progress: 79.40%
progress: 89.32%
progress: 99.25%


In [95]:
fdaAgainstndc

array([[ 21.,  10.,  27., ...,  21.,  20.,  25.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [ 14.,  27.,  17., ...,  20.,  19.,  24.],
       ...,
       [ 19.,  14.,  27., ...,  33.,  46.,  40.],
       [ 11.,   0.,  15., ..., 100.,  36.,  26.],
       [ 28.,  27.,  17., ...,  50.,  38.,  30.]])

In [96]:
#converts to a dataframe
#ndc_unique_clean is being passed because the columns of the matrix are the ndc cleaned companies (796 x 5209 matrix)
fdaAgainstndcmatrixDf = convertMatrixToDf(fdaAgainstndc, ndc_unique_clean['clean company name'].tolist())

In [99]:
#writes csv to working
fdaAgainstndcmatrixDf.to_csv("../data/working/matrix_fdaxndc.csv")

In [22]:
#Modifying Isabel's code a bit
#Outputs the clean name of both datasets, and the fuzz ratio

def matchWithFuzz(l1,l2):
    clean_fda = []
    clean_ndc = []
    score = []
    match = pd.DataFrame(columns = ['clean fda', 'clean ndc', 'fuzz ratio'])
    for i in l1:
        #matches = []
        #score = []
        for j in l2:
            if fuzz.ratio(i,j) == 100:
                #matches.append(j)
                score.append(fuzz.ratio(i,j))
                clean_fda.append(i)
                clean_ndc.append(j)
        #match = match.append({'company matches': matches}, ignore_index = True)
    match['clean fda'] = clean_fda
    match['clean ndc'] = clean_ndc
    match['fuzz ratio'] = score
    return match

In [23]:
x = matchWithFuzz(fda_unique_clean['fda clean name'].tolist(), ndc_unique_clean['ndc clean name'].tolist())

In [216]:
x.head(50)
#Obviously not a whole lot here, but it does show which companies are in both datasets that match 100%

Unnamed: 0,clean fda,clean ndc,fuzz ratio
0,m,m,100
1,abbvie,abbvie,100
2,abraxis bioscience,abraxis bioscience,100
3,accord,accord,100
4,aci,aci,100
5,actavis,actavis,100
6,actelion,actelion,100
7,adapt,adapt,100
8,aegerion,aegerion,100
9,agouron,agouron,100


In [25]:
print("There are {} companies in both ndc and fda that match 100%".format(len(x)))

There are 356 companies in both ndc and fda that match 100%
