In [4]:
import numpy as np
import pandas as pd
import nltk
from fuzzywuzzy import fuzz
from nltk.metrics import jaccard_distance



<h4>Functions</h4>

In [190]:
#function that uses fuzzy matching against two lists
#takes two arguments: df1.columnName.tolist() as l1 and df2.columnName.tolist() as l2
#This function does two things. First, it print the current word in list 1 and then all the words its being matched to, along with the fuzzy ratio score
#Second, it stores each score in an array and returns the array 

def matchWithFuzzy(l1, l2):
    score = []
    for i in l1:
        print("\nWord in list 1 {} matches:".format(i))
        for j in l2:
            score.append(fuzz.ratio(i, j))
            print("{} with a score of {}".format(j,fuzz.ratio(i,j)))
    
    return score

In [1]:
#Function that creates a matrix of fuzzy scores
#l1 and l2 are the lists
#nrows will be the number of rows you will need in the matrix
#ncols will be the number of columns you will need


#First try to incorporate pandas and numpy; the final output should be a pandas dataframe that is a matrix. If the code is still too slow, then break up the dna dataset
#into three 20000x60000 have three seperate jupyters running, using this function for each third of the data(dna), and get 
#put write.csv into function(will have three csvs, which will need to combined)

#try to get a frequency distribution of the matches


def matrix_score(l1, l2, nrows, ncols):
    #creates a matrix of zeros with the dimensions specified from the parameters
    #So if you had a list that had 5 elements in it, your matrix would be a 5x5 matrix; nrows would be 5 and ncols would be 5
    matrix = np.zeros(shape = (nrows, ncols))
    
    #initialize row and col values to 0
    row = 0
    col = 0
    
    for i in l1:
        for j in l2:
            score = fuzz.ratio(i,j)
            matrix[row][col] = score
            col+=1
        col = 0
        row+=1
    
    return matrix 

In [29]:
#Isabels function
def matchWithFuzzyNames(l1, l2, original):
    match = pd.DataFrame(columns = ['original company names', 'clean company name', 'company matches', 'fuzz ratio'])
    for i in l1:
        matches = []
        score = []
        score_partial = []
        for j in l2:
            if (fuzz.ratio(i,j)) == 100:
                matches.append(j)
                score.append(fuzz.ratio(i,j))
                score_partial.append((fuzz.partial_ratio(i,j)))
        match = match.append({ 'clean company name': i,'company matches': matches, 'fuzz ratio': score, 'fuzz partial ratio': score_partial},ignore_index=True)
        
    match['original company names'] = original
        

 


    return match

In [2]:
#This function takes the matrix and list of cleaned company names as arguments and returns a dataframe representing the martrix passed
#ls should be the cleaned company names of the second company you passed into matrix_score
#so if you were checking the matrix score of matrix_score(x.clean.tolist(), y.clean.tolist(), 10, 5), then you would use convertMatrixToDf(matrix, y.clean.tolist())
def convertMatrixToDf(matrix, ls):
    #In order for the code to work, the matrix passed must first be converted from float to int
    matrix = matrix.astype(int)
    #Then the matrix needs to be converted to str
    matrix = matrix.astype(str)
    cleaned_matrix = [[int(item.split()[0]) for item in row] for row in matrix]
    matrix_scores = pd.DataFrame(cleaned_matrix, columns = ls)
    
    #returns the matrix as a dataframe, with type int
    return matrix_scores.astype(int)

In [None]:
#This function works just like pd.read_csv(), but where pd.read_csv() adds numbers to duplicated column names, this function doesn't
#path is the file path of the csv you want to open
def readCsv(path):
    df = pd.read_csv(path, index_col = [0])
    df.columns = df.columns.str.split(".").str[0]
    return df

<h3>NDC Matrix</h3>

In [92]:
ndc = pd.read_csv("../data/working/cleanNDC.csv", index_col = [0])

<h6>Drop any rows that had NaN</h6>

In [49]:
ndc.fillna("-",inplace = True)

NameError: name 'ndc' is not defined

In [132]:
ndc.shape

(6718, 2)

In [138]:
ndc.columns

Index(['original_company', 'cleaned_name'], dtype='object')

In [134]:
matrix = matrix_score(ndc.cleaned_name.tolist(), ndc.cleaned_name.tolist(), 6718, 6718)

In [165]:
matrix

array([[100.,  17.,  25., ...,  27.,  24.,  30.],
       [ 26., 100.,  40., ...,  31.,  16.,  18.],
       [ 17.,  40., 100., ...,  14.,  23.,  33.],
       ...,
       [ 27.,  31.,  14., ..., 100.,  33.,  40.],
       [ 24.,  24.,   8., ...,  25., 100.,  18.],
       [ 30.,  18.,  33., ...,  40.,  18., 100.]])

In [178]:
matrixDf = convertMatrixToDf(matrix, ndc['cleaned_name'].tolist())

In [193]:
matrixDf.head()

Unnamed: 0,spironolactone %,-loreal,cardinal,church&dwight canada,preferred pharmaeutials,†wal-mart stores†,veterans,medco,st class,st medx,...,zoono,zrg detox,zunyi jici bio-health,zydus,zydus.1,zydus.2,zydus.3,zygone,zyla life sciences,name
0,100,17,25,17,5,24,8,10,25,26,...,29,24,22,10,10,10,10,27,24,30
1,26,100,40,15,27,33,27,17,27,14,...,17,12,29,0,0,0,0,31,16,18
2,17,40,100,29,32,16,25,15,25,13,...,15,24,28,15,15,15,15,14,23,33
3,17,15,29,100,19,16,21,8,29,22,...,8,21,15,8,8,8,8,15,11,17
4,21,27,32,28,100,15,32,14,19,13,...,0,19,14,21,21,21,21,7,24,7


<h3>FDA Matrix</h3>

In [50]:
fda = pd.read_csv("../data/working/fda_clean.csv", index_col = [0])

In [51]:
fda.columns

Index(['FDA Companies ', 'Company Clean'], dtype='object')

In [52]:
fda.fillna("-",inplace = True)

In [53]:
fda.shape

(975, 2)

In [55]:
fda_matrix = matrix_score(fda['Company Clean'].tolist(), fda['Company Clean'].tolist(), 975, 975)

In [56]:
fda_matrix

array([[100.,  13.,  41., ...,  21.,  21.,  28.],
       [ 13., 100.,  12., ...,   0.,   0.,   0.],
       [ 41.,  12., 100., ...,  20.,  20.,  27.],
       ...,
       [ 21.,   0.,  10., ..., 100., 100.,  50.],
       [ 21.,   0.,  10., ..., 100., 100.,  50.],
       [ 14.,   0.,  33., ...,  50.,  50., 100.]])

In [57]:
fda_matrix_df = convertMatrixToDf(fda_matrix, fda['Company Clean'].tolist())

In [58]:
fda_matrix_df.head()

Unnamed: 0,d imaging drug,m,m drug delivery,aaipharma,abbott,abbvie,abbvie endocrine,abbvie endocrine.1,abbvie.1,abhai,...,yabao,yaopharma,yung shin,zambon,zevacor,zo skin,zydus,zydus.1,zydus.2,zydus worldwide
0,100,13,41,26,10,10,20,20,10,11,...,11,17,26,20,19,29,21,21,21,28
1,13,100,12,20,0,0,0,0,0,0,...,0,20,0,29,0,0,0,0,0,0
2,41,12,100,8,0,10,26,26,10,10,...,10,8,33,10,9,18,20,20,20,27
3,26,20,17,100,13,27,24,24,27,43,...,29,78,11,27,25,12,0,0,0,8
4,10,0,0,13,100,50,36,36,50,36,...,55,27,0,50,31,15,0,0,0,10


In [25]:
fda_c = readCsv("../data/working/matrix_fda.csv")

In [26]:
f.head()

Unnamed: 0,d imaging drug,m,m drug delivery,aaipharma,abbott,abbvie,abbvie endocrine,abbvie endocrine.1,abbvie.1,abhai,...,yabao,yaopharma,yung shin,zambon,zevacor,zo skin,zydus,zydus.1,zydus.2,zydus worldwide
0,100,13,41,26,10,10,20,20,10,11,...,11,17,26,20,19,29,21,21,21,28
1,13,100,12,20,0,0,0,0,0,0,...,0,20,0,29,0,0,0,0,0,0
2,41,12,100,8,0,10,26,26,10,10,...,10,8,33,10,9,18,20,20,20,27
3,26,20,17,100,13,27,24,24,27,43,...,29,78,11,27,25,12,0,0,0,8
4,10,0,0,13,100,50,36,36,50,36,...,55,27,0,50,31,15,0,0,0,10


In [59]:
x = matchWithFuzzyNames(fda['Company Clean'].tolist(), fda['Company Clean'].tolist(), fda['FDA Companies '])

In [60]:
x.tail(50)

Unnamed: 0,original company names,clean company name,company matches,fuzz ratio,fuzz partial ratio
925,VIFOR FRESENIUS,vifor fresenius,[vifor fresenius],[100],[100]
926,VIIV HLTHCARE,viiv,[viiv],[100],[100]
927,VINTAGE,vintage,"[vintage, vintage, vintage]","[100, 100, 100]","[100, 100, 100]"
928,VINTAGE PHARMS,vintage,"[vintage, vintage, vintage]","[100, 100, 100]","[100, 100, 100]"
929,VINTAGE PHARMS LLC,vintage,"[vintage, vintage, vintage]","[100, 100, 100]","[100, 100, 100]"
930,VIRTUS PHARM,virtus,"[virtus, virtus]","[100, 100]","[100, 100]"
931,VIRTUS PHARMS,virtus,"[virtus, virtus]","[100, 100]","[100, 100]"
932,VISTAPHARM,vistapharm,[vistapharm],[100],[100]
933,VIVA HLTHCARE,viva,[viva],[100],[100]
934,VIVIMED GLOBAL,vivimed global,[vivimed global],[100],[100]


In [47]:
x = pd.read_csv("../data/working/fda_clean.csv", index_col = [0])

In [48]:
x.tail(50)

Unnamed: 0,FDA Companies,Company Clean
925,VIFOR FRESENIUS,vifor fresenius
926,VIIV HLTHCARE,viiv
927,VINTAGE,vintage
928,VINTAGE PHARMS,vintage
929,VINTAGE PHARMS LLC,vintage
930,VIRTUS PHARM,virtus
931,VIRTUS PHARMS,virtus
932,VISTAPHARM,vistapharm
933,VIVA HLTHCARE,viva
934,VIVIMED GLOBAL,vivimed global


<h3>DNA Matrix subset</h3>

In [39]:
dna = pd.read_csv("../data/working/dna_clean.csv", index_col = [0])

In [40]:
dna.shape

(64005, 4)

In [41]:
dna.dropna(inplace = True)

In [70]:
dna_subset = dna[21318:42636]

In [71]:
dna_subset

Unnamed: 0,Unnamed: 0.1,Code,Description,cleaned_companies
21341,35411,FIRBUS,First Busey Corporation,first busey
21342,35415,FIRCPT,Emerald Health Therapeutics Inc.,emerald therapeutics
21343,35416,FIRDER,First Derivatives PLC,first derivatives
21344,35418,FIRDIA,Firestone Diamonds PLC,firestone diamonds
21345,35419,FIREEI,FireEye Inc.,fireeye
...,...,...,...,...
42673,70649,OMCCAP,NanoSphere Health Sciences Inc.,nanosphere sciences
42674,70650,OMCMST,Omnicomm Systems Inc,omnicomm
42675,70651,OMCRED,OmniAmerican Credit Union,omniamerican credit union
42676,70652,OMDBZZ,1Malaysia Development Bhd,malaysia development


In [72]:
dna_subset['row'] = np.arange(0,len(dna_subset))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [73]:
len(dna_subset)

21318

In [79]:
#3553
partOne = dna_subset[0:3553]
partTwo = dna_subset[3553:7016]
partThree = dna_subset[7016:10659]
partFour = dna_subset[10659:14212]
partFive = dna_subset[14212:17765]
partSix = dna_subset[17765:21318]

In [80]:
partSix.tail()

Unnamed: 0,Unnamed: 0.1,Code,Description,cleaned_companies,row
42673,70649,OMCCAP,NanoSphere Health Sciences Inc.,nanosphere sciences,21313
42674,70650,OMCMST,Omnicomm Systems Inc,omnicomm,21314
42675,70651,OMCRED,OmniAmerican Credit Union,omniamerican credit union,21315
42676,70652,OMDBZZ,1Malaysia Development Bhd,malaysia development,21316
42677,70655,OMDCI,OMD Chicago Inc.,omd chicago,21317


In [81]:
dna_subset_matrix = matrix_score(partOne['cleaned_companies'].tolist(), dna['cleaned_companies'].tolist(), 3553,63954)

KeyboardInterrupt: 

In [49]:
dna_subset_matrix

NameError: name 'dna_subset_matrix' is not defined

<h4>Jaccard</h4>

 Formula: 1 - (|intersection of A,B| / |A| + |B| - |A union B|)

In [40]:
def matchWithJaccard(l1, l2):
    l1 = set(l1)
    l2 = set(l2)
    for i in l1:
        print("\nCompany in list 1 {} matches".format(i))
        i = set(i)
        for j in l2:
            j = set(j)
            print("{} with a jaccard score of {}".format(j, jaccard_distance(i, j)))

In [41]:
mylist = ["nike", "tesla", "johnson&johnson"]
mylist2 = ['nike', "tesla", "john"]

In [42]:
matchWithJaccard(mylist, mylist2)


Company in list 1 nike matches
{'e', 'k', 'n', 'i'} with a jaccard score of 0.0
{'o', 'n', 'h', 'j'} with a jaccard score of 0.8571428571428571
{'a', 's', 'l', 'e', 't'} with a jaccard score of 0.875

Company in list 1 johnson&johnson matches
{'e', 'k', 'n', 'i'} with a jaccard score of 0.8888888888888888
{'o', 'n', 'h', 'j'} with a jaccard score of 0.3333333333333333
{'a', 's', 'l', 'e', 't'} with a jaccard score of 0.9

Company in list 1 tesla matches
{'e', 'k', 'n', 'i'} with a jaccard score of 0.875
{'o', 'n', 'h', 'j'} with a jaccard score of 1.0
{'a', 's', 'l', 'e', 't'} with a jaccard score of 0.0


In [43]:
s1 = set([1,2,3,4])
s2 = set([3,4,5])

In [49]:
print(jaccard_distance(s1, s2))

0.6


In [51]:
s1 = {1, 2, 3, 4, 5}
s2 = {4, 5, 6, 7, 8, 9, 10}

In [52]:
print(jaccard_distance(s1,s2))

0.8


In [53]:
print(s1 & s2)

{4, 5}
