In [1]:
import numpy as np
import pandas as pd
import nltk
from fuzzywuzzy import fuzz
from nltk.metrics import jaccard_distance



<h4>Read in the data</h4>

In [37]:
ndc = pd.read_csv("../data/working/cleanNDC.csv", index_col = [0])

<h4>Drop any rows that had NaN</h4>

In [38]:
ndc.dropna(inplace = True)

In [39]:
ndc.shape

(6725, 2)

<h4>Function that uses fuzzy matching against two lists</h4>

In [83]:
#function that takes two arguments: df1.columnName.tolist() as l1 and df2.columnName.tolist() as l2
#This function does two things. First, it print the current word in list 1 and then all the words its being matched to, along with the fuzzy ratio score
#Second, it stores each score in an array and returns the array 


#Step 1 is making the matrix, where each row is the collection of scores for one word compared to all other words
#[50, 100, 50, 60, 100] this row would be the fuzzratio scores for the first word against the all the others
#[75, 100, 40, 50, 100] this row would be the fuzz ratio scores for the second word against all the others
#and so on for the entire dataset


#---------------------------
#Build a matix for all three datasets

#Come up with a csv file with the original company name, and the cleaned name

#if score is 100, then the company names in the cleaned version are the same; keep track of the row number for each company
#in order to go back to the raw list and see what the actual companies were
#abvee and abvee



#when the matrix is made, come up with a threshold. If a match falls outside of the threshold, then we change it to 



def matchWithFuzzy(l1, l2):
    score = []
    for i in l1:
        print("\nWord in list 1 {} matches:".format(i))
        for j in l2:
            score.append(fuzz.ratio(i, j))
            print("{} with a score of {}".format(j,fuzz.ratio(i,j)))
    
    return score 

In [84]:
subset = ndc.iloc[200:205]

In [118]:
print(matchWithFuzzy(subset.company.tolist(), subset.company.tolist()))


Word in list 1 air chemicals matches:
air chemicals with a score of 100
air source industries with a score of 47
air source industries with a score of 47
air north texas with a score of 57
air techniques with a score of 59

Word in list 1 air source industries matches:
air chemicals with a score of 29
air source industries with a score of 100
air source industries with a score of 100
air north texas with a score of 28
air techniques with a score of 40

Word in list 1 air source industries matches:
air chemicals with a score of 29
air source industries with a score of 100
air source industries with a score of 100
air north texas with a score of 28
air techniques with a score of 40

Word in list 1 air north texas matches:
air chemicals with a score of 57
air source industries with a score of 44
air source industries with a score of 44
air north texas with a score of 100
air techniques with a score of 48

Word in list 1 air techniques matches:
air chemicals with a score of 59
air source 

<h4>Function that creates a fuzzy score matrix</h4>

In [35]:
#Function that creates a matrix of fuzzy scores
#l1 and l2 are the lists
#nrows will be the number of rows you will need in the matrix
#ncols will be the number of columns you will need


#First try to incorporate pandas and numpy; the final output should be a pandas dataframe that is a matrix. If the code is still too slow, then break up the dna dataset
#into three 20000x60000 have three seperate jupyters running, using this function for each third of the data(dna), and get 
#put write.csv into function(will have three csvs, which will need to combined)

#try to get a frequency distribution of the matches


def matrix_score(l1, l2, nrows, ncols):
    #creates a matrix of zeros with the dimensions specified from the parameters
    #So if you had a list that had 5 elements in it, your matrix would be a 5x5 matrix; nrows would be 5 and ncols would be 5
    matrix = np.zeros(shape = (nrows, ncols))
    
    #initialize row and col values to 0
    row = 0
    col = 0
    
    for i in l1:
        for j in l2:
            score = fuzz.ratio(i,j)
            matrix[row][col] = score
            col+=1
        col = 0
        row+=1
    
    return matrix 

In [40]:
matrix = matrix_score(ndc.cleaned_name.tolist(), ndc.cleaned_name.tolist(), 6725, 6725)

In [41]:
matrix

array([[100.,  17.,  25., ...,  22.,  30.,  30.],
       [ 26., 100.,  40., ...,  14.,  18.,  33.],
       [ 17.,  40., 100., ...,  21.,  33.,  42.],
       ...,
       [ 22.,  21.,   7., ..., 100.,  16.,   6.],
       [ 30.,  18.,  33., ...,  16., 100.,  27.],
       [ 30.,  33.,  42., ...,  19.,  13., 100.]])

<h4>Converting matrix into a dataframe</h4>

In [50]:
#This function takes the matrix and list of cleaned company names as arguments and returns a dataframe representing the martrix passed
def convertMatrixToDf(matrix, ls):
    #In order for the code to work, the matrix passed must first be converted from float to int
    matrix = matrix.astype(int)
    #Then the matrix needs to be converted to str
    matrix = matrix.astype(str)
    cleaned_matrix = [[int(item.split()[0]) for item in row] for row in matrix]
    matrix_scores = pd.DataFrame(cleaned_matrix, columns = ls)
    #Since there are no company name rows, this function makes them
    matrix_scores['companies'] = ls
    #Sets the company name rows to the df index
    matrix_scores.set_index('companies', inplace = True)
    
    #returns the matrix as a dataframe, with type int
    return matrix_scores.astype(int)

In [51]:
matrixDf = convertMatrixToDf(matrix, ndc.cleaned_name.tolist())

In [53]:
matrixDf.to_csv("../data/working/ndc_matrix.csv")

<h4>Isabel's Function</h4>

In [111]:
def matchWithFuzzyNames(l1, l2):
    match = pd.DataFrame(columns = ['company name', 'company matches', 'fuzz ratio'])
    for i in l1:
        matches = []
        score = []
        score_partial = []
        for j in l2:
            if (fuzz.ratio(i,j)) > 45:
                matches.append(j)
                score.append(fuzz.ratio(i,j))
                score_partial.append((fuzz.partial_ratio(i,j)))
                match = match.append({'company name': i,
                                      'company matches': matches, 
                                    'fuzz ratio': score,
                                    'fuzz partial ratio': score_partial},
                                    ignore_index=True)

 

    return match.head(len(match))

In [2]:
#print(matchWithFuzzyNames(ndc.cleaned_name.tolist(), ndc.cleaned_name.tolist()))

<h4>Breaking up DNA into a subset</h4>

In [29]:
dna = pd.read_csv("../data/working/dna_clean.csv", index_col = [0])

In [30]:
dna.dropna(inplace = True)

In [31]:
dna_subset = dna.iloc[21318:42636]

In [32]:
dna_subset.shape

(21318, 4)

In [34]:
dna_subset_matrix = matrix_score(dna_subset['cleaned_companies'].tolist(), dna_subset['cleaned_companies'].tolist(), 21318,21318)

KeyboardInterrupt: 

<h4>Jaccard</h4>

 Formula: 1 - (|intersection of A,B| / |A| + |B| - |A union B|)

In [40]:
def matchWithJaccard(l1, l2):
    l1 = set(l1)
    l2 = set(l2)
    for i in l1:
        print("\nCompany in list 1 {} matches".format(i))
        i = set(i)
        for j in l2:
            j = set(j)
            print("{} with a jaccard score of {}".format(j, jaccard_distance(i, j)))

In [41]:
mylist = ["nike", "tesla", "johnson&johnson"]
mylist2 = ['nike', "tesla", "john"]

In [42]:
matchWithJaccard(mylist, mylist2)


Company in list 1 nike matches
{'e', 'k', 'n', 'i'} with a jaccard score of 0.0
{'o', 'n', 'h', 'j'} with a jaccard score of 0.8571428571428571
{'a', 's', 'l', 'e', 't'} with a jaccard score of 0.875

Company in list 1 johnson&johnson matches
{'e', 'k', 'n', 'i'} with a jaccard score of 0.8888888888888888
{'o', 'n', 'h', 'j'} with a jaccard score of 0.3333333333333333
{'a', 's', 'l', 'e', 't'} with a jaccard score of 0.9

Company in list 1 tesla matches
{'e', 'k', 'n', 'i'} with a jaccard score of 0.875
{'o', 'n', 'h', 'j'} with a jaccard score of 1.0
{'a', 's', 'l', 'e', 't'} with a jaccard score of 0.0


In [43]:
s1 = set([1,2,3,4])
s2 = set([3,4,5])

In [49]:
print(jaccard_distance(s1, s2))

0.6


In [51]:
s1 = {1, 2, 3, 4, 5}
s2 = {4, 5, 6, 7, 8, 9, 10}

In [52]:
print(jaccard_distance(s1,s2))

0.8


In [53]:
print(s1 & s2)

{4, 5}
