Let's begin by importing the Excel file that contains the company name list.

In [1]:
import pandas as pd


# Location of the FDA NDC company spreadsheet.
pathToFDAList = '/home/dnb3k/Downloads/NDC_Company_Dataset.xls'

# Read the sheet, rename the 'Row Labels' column to 'company', and cast
# every column to str, all in one chain.
fdaCompanyList = (
    pd.read_excel(pathToFDAList)
    .rename(columns={'Row Labels': 'company'})
    .astype(str)
)

# Preview the first 30 rows of what was loaded.
fdaCompanyList.head(30)



Unnamed: 0,company
0,19750101
1,20090901
2,20141217
3,20150301
4,20150801
5,20160601
6,20161012
7,20161026
8,20181130
9,20190501


Now that we have taken a quick look at it, let's look at the unique subtokens (i.e. the "words" that are found in these entries). We'll use a strategy adopted in a previous notebook.

In [2]:
# Join every company name into one long, space-separated string.
longString = fdaCompanyList['company'].str.cat(sep=' ')

# Break that string on single spaces to get one flat list of "words".
# NOTE: splitting on ' ' (rather than any whitespace) means consecutive
# spaces produce empty-string tokens, which are counted like any other token.
longStringSeparated = longString.split(' ')

# Put the tokens in a one-column frame so pandas can tally them.
uniqueSubTokenFrame = pd.DataFrame(longStringSeparated)

# Frequency of each distinct token, most frequent first.
columnUniqueCounts = uniqueSubTokenFrame.iloc[:, 0].value_counts()

# Convert the counts into a two-column table.  The index and the value
# column are named explicitly instead of renaming the default labels
# ('index' and 0): those defaults depend on the installed pandas version,
# and renaming them blindly can yield two columns both called 'count'.
tableUniqueCounts = columnUniqueCounts.rename_axis('token').reset_index(name='count')

print(tableUniqueCounts.shape)
print('number of unique string tokens in this dataset')

tableUniqueCounts.head(20)

(7418, 2)
number of unique string tokens in this dataset


Unnamed: 0,token,count
0,Inc.,1636
1,LLC,1168
2,Inc,651
3,"Co.,",295
4,Medical,290
5,Pharmaceuticals,279
6,Ltd.,269
7,"Pharmaceuticals,",242
8,&,203
9,Products,168


As we can see, there are a number of very common legal entity substrings which could be leading to some confusion as we try to merge and de-duplicate this data set (i.e. rows 0, 1, 2, 3, 6, 10, 11, 13, 15, 17 -- that's half!).  Let's remove these using a curated list of legal entities.

In [16]:
import os
# Directory that holds ossPyFuncs.py; the key files are located relative to it.
# (Hard-coded for now -- open to suggestions on how to infer this better.)
currentDir = '/home/dnb3k/git/dspg20oss/ossPy'

# Move into that directory before the import below, presumably so that
# ossPyFuncs can be resolved from the working directory.
os.chdir(currentDir)

import ossPyFuncs
import numpy as np

# Load the curated legal-entity list (single-quote quoted, no header row).
LElist = pd.read_csv(
    os.path.join(currentDir, 'keyFiles/curatedLegalEntitesRaw.csv'),
    quotechar="'",
    header=None,
)

# Erase the listed legal-entity strings from the company column.
LEoutput, LEeraseList = ossPyFuncs.eraseFromColumn(
    fdaCompanyList['company'],
    LElist,
)

# Wrap the cleaned column in a frame and order the erase report so the
# entries with the largest 'changeNum' come first.
LEoutput = pd.DataFrame(LEoutput)
LEeraseList = LEeraseList.sort_values(by='changeNum', ascending=False)

# Peek at the top of the report (not rendered: it is not the cell's last expression).
LEeraseList.head(15)

print(f"{np.sum(LEeraseList['changeNum'])} total entries changed")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


6070


Now that we've removed those legal entity items, let's also remove some extraneous symbols.

In [4]:
# Load the custom symbol patterns (single-quote quoted, no header row).
symbollist = pd.read_csv(
    os.path.join(currentDir, 'keyFiles/fdaCustomSymbols.csv'),
    quotechar="'",
    header=None,
)

# Show the first ten patterns.
symbollist.head(10)

Unnamed: 0,0
0,[ \t]+$
1,^[ \t]
2,\A\s+
3,\s+\Z
4,/$
5,","
6,\.$
7,[ \t]+$


In [5]:
# Erase the symbol patterns from the company names that already had their
# legal-entity strings removed.
Symboloutput, symbolEraseList = ossPyFuncs.eraseFromColumn(
    LEoutput['company'],
    symbollist,
)

# Show the first ten rows of the per-pattern erase report.
symbolEraseList.head(10)

Unnamed: 0,0,changeNum,changeIndexes
0,[ \t]+$,6,"[25, 85, 1134, 5794, 5877, 6304]"
1,^[ \t],1,[25]
2,\A\s+,0,[]
3,\s+\Z,0,[]
4,/$,0,[]
5,",",2919,"[30, 33, 38, 43, 44, 49, 50, 53, 55, 59, 70, 7..."
6,\.$,2494,"[29, 33, 44, 51, 52, 58, 59, 64, 67, 68, 71, 7..."
7,[ \t]+$,8,"[86, 1069, 1159, 3396, 3433, 3537, 5417, 5954]"


Now that we've performed this cleaning, we can get a sense of how close some of these strings are to one another using a fuzzy match algorithm.

In [7]:
# Run the fuzzy match over the cleaned names, order the result by 'count'
# and then 'company' (both descending), and renumber the rows from zero.
fuzzyMatchFrame = (
    ossPyFuncs.iterativeFullFuzzyMatch(pd.DataFrame(Symboloutput))
    .sort_values(by=['count', 'company'], ascending=False)
    .reset_index(drop=True)
)

# Lift the row-display limit so the whole table is rendered.
pd.set_option("display.max_rows", None)

# Display every row.
fuzzyMatchFrame.head(len(fuzzyMatchFrame))

Unnamed: 0,company,count,guesses
0,Valu Merchandisers,8,"[Value Merchandisers, Valu Merchandiser, Valu ..."
1,Walgreens,7,"[Wallgreens, Walgreen's, Walgreen]"
2,Dr. Reddy's Laboratories,7,"[Dr.Reddy's Laboratories, Dr. Reddys Laborator..."
3,Wal-Mart Stores,6,"[†Wal-Mart Stores.†, Wal-Mart StoresInc, Walma..."
4,Target,6,[]
5,Shopko Stores Operating,6,"[Shopko Stores Operating., Shopko Store Operat..."
6,Rejoice International,6,"[Rece International, Ningbo Rejoice Internatio..."
7,Kroger,6,[Krogers]
8,Great Lakes Wholesale Marketing and Sales,6,[]
9,Family Dollar Services,6,[Family Dollar Servicesm]


Having performed that fuzzy match overview, we can also begin trying to assess the internal relations of the various unique entries in this data column.

In [8]:
# Build the substring matrix for the cleaned names, together with the
# reference frame returned alongside it.
referenceFrame, matrixOut = ossPyFuncs.createSubstringMatrix(
    pd.DataFrame(Symboloutput)
)

# Report the matrix dimensions.
print(matrixOut.shape)

(6024, 6024)


In [17]:
# Display every row of the reference frame.
referenceFrame.head(len(referenceFrame))

Unnamed: 0,company,count,outdegree,indegree
4824,Chain Drug,1,19,1
421,McKesson,2,13,1
3711,Home Health,1,12,1
4818,Chain Drug Marketing,1,10,2
1691,Select,1,9,1
11,Cardinal Health,6,9,1
2357,Oxygen Service,1,8,1
1628,Shopko,1,8,1
4572,DeMoulas,1,7,1
4139,Fougera Pharmaceuticals,1,7,1


In [10]:
# Compute each node's degree counts from the matrix and store them as
# columns of referenceFrame.
#
# The counts are written as whole columns.  Element-wise chained assignment
# (referenceFrame['outdegree'].loc[index] = ...) triggers pandas'
# SettingWithCopyWarning and, with copy-on-write enabled, writes to a
# temporary object and leaves referenceFrame unchanged.
#
# NOTE(review): index labels of referenceFrame are used directly as row and
# column positions of matrixOut, exactly as before -- this assumes the two
# line up; confirm against createSubstringMatrix.

# outdegree: number of non-zero entries in the node's row
referenceFrame['outdegree'] = [
    matrixOut[index, :].count_nonzero() for index in referenceFrame.index
]

# indegree: number of non-zero entries in the node's column
referenceFrame['indegree'] = [
    matrixOut[:, index].count_nonzero() for index in referenceFrame.index
]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Let's view the high indegree nodes.  These are the nodes (company names) which have the highest number of *other* entries mapping on to them.

In [18]:
# Order the nodes from highest to lowest indegree.
referenceFrame = referenceFrame.sort_values('indegree', ascending=False)

# Show the 20 nodes with the highest indegree.
referenceFrame.head(20)

Unnamed: 0,company,count,outdegree,indegree
1087,United Exchange.Chain Drug Marketing Association,1,1,7
1359,TOP CARE (Topco Associates),1,0,6
1358,TOPCARE (Topco Associates),1,0,6
5467,AmerisourceBergen Drug (Good Neighbor Pharmacy...,1,1,5
89,Topco Associates,3,4,5
4815,Chain Drug Marketing Association (CDMA),1,0,5
2050,QUALITY CHOICE (Chain Drug Marketing Associati...,1,0,5
1188,Topco associates,1,4,5
4368,EQUATE (Walmart Stores.),1,0,5
2051,QUALITY CHOICE (Chain Drug Marketing Association),1,0,5


And now the high outdegree nodes. These are the nodes (company names) which are mapping on to the greatest number of *other* company names.  In this sense they are somewhat "nonspecific".

In [12]:
# Order the nodes from highest to lowest outdegree.
referenceFrame = referenceFrame.sort_values('outdegree', ascending=False)

# Show the 20 nodes with the highest outdegree.
referenceFrame.head(20)

Unnamed: 0,company,count,outdegree,indegree
4824,Chain Drug,1,19,1
421,McKesson,2,13,1
3711,Home Health,1,12,1
4818,Chain Drug Marketing,1,10,2
1691,Select,1,9,1
11,Cardinal Health,6,9,1
2357,Oxygen Service,1,8,1
1628,Shopko,1,8,1
4572,DeMoulas,1,7,1
4139,Fougera Pharmaceuticals,1,7,1


In [13]:
# Convert the matrix to dense form for the row/column slicing done below.
# The hasattr guard makes this cell safe to re-run: once converted, the
# dense result no longer has a .todense() method, so an unguarded second
# call would raise AttributeError.
if hasattr(matrixOut, 'todense'):
    matrixOut = matrixOut.todense()

Finally, we'll implement a widget that will allow us to inspect each company, and its indegree and outdegree nodes.  These will be selected by index, as sorted by the most common in the input column (after cleaning).

In [20]:
import numpy as np
import ipywidgets as widgets

# Put the rows back in index order (earlier cells re-sorted the frame by
# indegree/outdegree) before the frame is used for lookups by index below.
referenceFrame=referenceFrame.sort_index()

def displayInDOutDItems(iIndex):
    """Print one company name together with its out-nodes and in-nodes.

    Reads the module-level ``matrixOut`` and ``referenceFrame``.  Row
    ``iIndex`` of ``matrixOut`` selects the names this entry maps to;
    column ``iIndex`` selects the names that map to it.

    NOTE(review): the row/column vectors are passed straight to ``.loc``,
    so they presumably hold booleans aligned with ``referenceFrame``'s
    rows -- confirm against ``createSubstringMatrix``.
    """
    companyNames = referenceFrame['company']

    # Flatten the selected row and column of the matrix into 1-D selectors.
    outSelector = np.asarray(matrixOut[iIndex, :]).squeeze()
    inSelector = np.asarray(matrixOut[:, iIndex]).squeeze()

    outNames = companyNames.loc[outSelector]
    inNames = companyNames.loc[inSelector]

    print('company name')
    print(companyNames.loc[iIndex])
    print('\n')

    # The two neighbour listings share the same heading/body/spacer layout.
    for heading, names in (
        ('out nodes (nodes mapped to)', outNames),
        ('in nodes (nodes that map to this)', inNames),
    ):
        print(heading)
        print(names)
        print('\n')
    
# Build an interactive control that calls displayInDOutDItems with an
# iIndex chosen from referenceFrame's index values.
# NOTE(review): interact treats keyword arguments as widget abbreviations for
# the wrapped function's parameters; displayInDOutDItems has no
# continuous_update parameter, so confirm this argument has any effect here.
widgets.interact(displayInDOutDItems,iIndex=referenceFrame.index,continuous_update=False)

interactive(children=(Dropdown(description='iIndex', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14…

<function __main__.displayInDOutDItems(iIndex)>