In [23]:
import numpy as np
import pandas as pd
import string
import re
from fuzzywuzzy import fuzz 
from nltk.metrics import *

<h2>Reading in the datasets</h2>

In [2]:
dna = pd.read_csv("../data/working/validcompaniesdictionary.csv", index_col = [0])

In [3]:
fda = pd.read_excel("../data/original/fda_companies.xlsx")

In [24]:
ndc = pd.read_excel("../data/original/BI DSPG Company Datasets/NDC_Company_Dataset.xls")

<h2>Neil's code for cleaning</h2>

In [5]:
removeset=string.punctuation
removeset=removeset.replace("-","") #Don't remove dashes
removeset=removeset.replace("&","") #Don't remove ampersand
removeset=removeset.replace("_","") #Don't remove underscore
removeset=removeset.replace("%","") #Don't remove percent
removeset=removeset.replace("$","") #Don't remove dollar
   
print(removeset)

!"#'()*+,./:;<=>?@[\]^`{|}~


In [183]:
# remove all single characters (This step is done first, because later there are single chars we want to retain.)
#document = re.sub(r'\s+[a-zA-Z]\s+', ' ', str(X[sen]))
string = "hello i world"
string = re.sub(r'\s+[a-zA-Z]\s+', ' ', string)
print(string)

# remove all numbers
#document = re.sub(r'[0-9]','', document)
string2 = "h3ll0"
string2 = re.sub(r'[0-9]','', string)
print(string2)

# Substituting multiple spaces with single space
#document = re.sub(r'\s+', ' ', document, flags=re.I)
string3 = "hello      world"
string3 = re.sub(r'\s+', ' ', string3, flags=re.I)
print(string3)

#Converting to lowercase
string4 = "HEllo WORLD"
string4 = string4.lower()
print(string4)

#Removing prefixed 'b'
#document = re.sub(r'^b\s+', '', document)
string5 = "b hello world"
string5 = re.sub(r'^b\s+', '', string5)
print(string5)

#Make dashes into combined words
#document = re.sub(r'\s-\s+', '-', document)
string6 = "hello - world"
string6 = re.sub(r'\s-\s+', '-', string6)
print(string6)

#Make ampersand into combined words
#document = re.sub(r'\s&\s+', '&', document)
string7 = "hel & lo & world"
string7 = re.sub(r'\s&\s+', '&', string7)
print(string7)

#Make underscore into combined words
#document = re.sub(r'\s_\s+', '_', document)
string8 = "hel _ lo wo _ rld"
string8 = re.sub(r'\s_\s+', '_', string8)
print(string8)

hello world
hello world
hello world
hello world
hello world
hello-world
hel&lo&world
hel_lo wo_rld


In [77]:
#removes all punctuation in string that is in removeset
document = "Johnson+;Johnson!"
for i in removeset:
    document=re.sub(re.escape(i),"",document)
print(document)

JohnsonJohnson


<h2>Cleaning NDC</h2>

<h4>Removing the first 25 since they are just numbers</h4>

In [25]:
#Getting rid of the first 25 since those are just numbers
ndc = ndc.iloc[25:]
#renaming column
ndc = ndc.rename(columns = {'Row Labels':'company'})

<h4>Lowercase everything</h4>

In [26]:
#Converting to lower first
ndc.company = ndc.company.str.lower()

<h4>Remove content that are in parentheses</h4>

In [27]:
#Function for removing parentheses content
def removeParenthesesContent(string):
    return re.sub(r'\([^)]*\)', '', string)


In [28]:
ndc['companiesWithoutParensContent'] = ndc['company'].apply(removeParenthesesContent)

In [29]:
del ndc['company']

In [30]:
ndc.head()

Unnamed: 0,companiesWithoutParensContent
25,spironolactone 2%
26,-l'oreal usa products inc
27,.cardinal health
28,.church & dwight canada corp
29,{preferred pharmaeutials inc.


<h4>Remove Unwanted Punctuation</h4>

In [31]:
"""
Ignore 
#Function that uses regex to remove parentheses and square brackets
def removeParenthesis(string):
    return re.sub('[()\{}]', '', string)
"""
#function that gets rid of unwanted punctuation
#This does get rid of ' within a string (ex. l'oreal becomes l oreal) so maybe recheck?
def removeUnwantedPunc(string):
    return re.sub('[!"#\'()*+,./:;<=>?@[\]^`{|}~]', '', string)

In [32]:
ndc['companiesWithoutUnwantedPunc'] = ndc['companiesWithoutParensContent'].apply(removeUnwantedPunc)

In [33]:
del ndc['companiesWithoutParensContent']

In [34]:
ndc.rename(columns = {"companiesWithoutUnwantedPunc": "company"}, inplace = True)

In [35]:
#At this point all the companies are lowercased, don't have parenthetical content, and don't have unwanted punctuation
ndc.head()

Unnamed: 0,company
25,spironolactone 2%
26,-loreal usa products inc
27,cardinal health
28,church & dwight canada corp
29,preferred pharmaeutials inc


<h4>Removing numbers</h4>

In [36]:
#Function that will remove numbers
def removeNumbers(string):
    return re.sub(r'[0-9+]','', string)

In [37]:
ndc['companiesWithNoNumbers'] = ndc.company.apply(removeNumbers)

In [38]:
del ndc['company']

In [39]:
ndc = ndc.rename(columns = {"companiesWithNoNumbers": 'company'})

In [40]:
ndc.head()

Unnamed: 0,company
25,spironolactone %
26,-loreal usa products inc
27,cardinal health
28,church & dwight canada corp
29,preferred pharmaeutials inc


<h4>Removing Single Chars</h4>

In [41]:
def removeSingleChar(string):
    return re.sub(r'\s+[a-zA-Z]\s+', ' ', string)

In [42]:
ndc['companiesWithNoSingleChar'] = ndc.company.apply(removeSingleChar)

In [43]:
del ndc['company']
ndc = ndc.rename(columns={"companiesWithNoSingleChar": 'company'})

<h4>Substituting Multiple Spaces with Single Space</h4>

In [44]:
def subMultipleSpacesForOne(string):
    return re.sub(r'\s+', ' ', string, flags=re.I)

In [45]:
ndc['noMultipleSpaces'] = ndc.company.apply(subMultipleSpacesForOne)

In [46]:
del ndc['company']
ndc = ndc.rename(columns = {"noMultipleSpaces": 'company'})

<h4>Removing Prefix b</h4>

In [47]:
def removePrefix(string):
    return re.sub(r'^b\s+', '', string)

In [48]:
ndc['removedPrefix'] = ndc.company.apply(removePrefix)

In [49]:
ndc.columns

Index(['company', 'removedPrefix'], dtype='object')

In [50]:
del ndc['company']
ndc = ndc.rename(columns = {"removedPrefix":"company"})

<h4>Make dashes into combined words</h4>

In [51]:
#start here
def makeDashCombined(string):
    return re.sub(r'\s-\s+', '-', string)

In [52]:
ndc['combinedDash'] = ndc.company.apply(makeDashCombined)

In [53]:
del ndc['company']
ndc = ndc.rename(columns = {"combinedDash": 'company'})

<h4>Combine Ampersand</h4>

In [54]:
def combineAmpersand(string):
    return re.sub(r'\s&\s+', '&', string)

In [55]:
ndc['combineAmp'] = ndc.company.apply(combineAmpersand)

In [56]:
ndc.columns

Index(['company', 'combineAmp'], dtype='object')

In [57]:
del ndc['company']
ndc = ndc.rename(columns = {"combineAmp": 'company'})

In [58]:
ndc.columns

Index(['company'], dtype='object')

<h4>Combine '_'</h4>

In [59]:
def combinedUnderScore(string):
    return re.sub(r'\s_\s+', '_', string)

In [60]:
ndc.columns

Index(['company'], dtype='object')

In [61]:
ndc['combinedUnder'] = ndc.company.apply(combinedUnderScore)

In [62]:
del ndc['company']
ndc = ndc.rename(columns = {"combinedUnder": 'company'})

<h2>End Initial Cleaning</h2>

<h4>Grabbing the list of legal entities from os github</h4>

In [63]:
legalEntities = pd.read_csv("https://raw.githubusercontent.com/DSPG-Young-Scholars-Program/dspg20oss/danBranch/ossPy/keyFiles/curatedLegalEntitesRaw.csv", quotechar = "'",header = None)
legalEntities.head()

Unnamed: 0,0
0,(?i) Inc\b
1,(?i) Ltd\b
2,(?i) LLC\b
3,(?i) GmbH\b
4,(?i) Corporation\b


<h4>Using Daniel's code to get remove legal entities</h4>

In [64]:
def eraseFromColumn(inputColumn, eraseList):
    "iteratively delete regex query matches from input list"
    
    """
    inputColumn -- a column from a pandas dataframe, this will be the set of
    target words/entries that deletions will be made from
    eraseList -- a column containing strings (regex expressions) which will be
    deleted from the inputColumn, in an iterative fashion
    """
    eraseList['changeNum'] = 0
    eraseList['changeIndexes'] = ''
    
    inputColumn = inputColumn.replace(regex=True, to_replace = "\\\\", value='/')
    
    for index, row in eraseList.iterrows():
        curReplaceVal = row[0]
        currentRegexExpression=re.compile(curReplaceVal)
        CurrentBoolVec=inputColumn.str.contains(currentRegexExpression, na= False)
        eraseList['changeIndexes'].iloc[index]=[i for i, x in enumerate(CurrentBoolVec) if x]
        eraseList['changeNum'].iloc[index] = len(eraseList['changeIndexes'].iloc[index])
        inputColumn.replace(regex=True, to_replace=currentRegexExpression,value='', inplace = True)
    
    return inputColumn, eraseList

In [65]:
output, eraseList = eraseFromColumn(ndc.company, legalEntities)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [66]:
#This dataframe is lowercased, doesn't have any of the unwanted punctuation, and has removed all legal entity labels
output= pd.DataFrame(output)

In [67]:
output.head(50)

Unnamed: 0,company
25,spironolactone %
26,-loreal usa products
27,cardinal health
28,church&dwight canada
29,preferred pharmaeutials
30,†wal-mart stores†
31,veterans health
32,medco
33,st class pharmaceuticals
34,st medx


<h4>Counting Unique String Tokens using Daniel's code, then adding more to the list from Isabel and Susweta's lists</h4>

In [68]:
ndc_cleaned = output

In [69]:
longString = ndc_cleaned['company'].str.cat(sep = " ")

In [70]:
longStringSeperated = longString.split(' ')

In [71]:
uniqueSubTokenFrame = pd.DataFrame(longStringSeperated)

In [72]:
columnUniqueCounts = uniqueSubTokenFrame.iloc[:,0].value_counts()

In [73]:
tableUniqueCounts = columnUniqueCounts.reset_index()

In [74]:
tableUniqueCounts.rename(columns = {0: "count", "index":"token"}, inplace = True)

In [75]:
top20 = tableUniqueCounts.head(20).token.tolist()

In [76]:
#top 20 tokens in ndc
top20

['pharmaceuticals',
 'medical',
 'products',
 'laboratories',
 'pharma',
 'supply',
 'anda',
 'health',
 'pharmaceutical',
 'usa',
 'international',
 'care',
 'and',
 'nda',
 'coltd',
 'the',
 'home',
 'healthcare',
 '',
 'of']

In [77]:
#top occuring tokens from fda and dna
top20.append("group")
top20.append("holdings")
top20.append("capital")
top20.append("technologies")
top20.append("association")
top20.append('us')
top20.append('services')
top20.append("university")
top20.append("bank")
top20.append("partners")
top20.append("energy")
top20.append("systems")
top20.append("intl")
top20.append("pharms")
top20.append("american")
top20.append("national")

In [79]:
#Top 20 occuring tokens in ndc plus some top occuring tokens in dna and fda
top20

['pharmaceuticals',
 'medical',
 'products',
 'laboratories',
 'pharma',
 'supply',
 'anda',
 'health',
 'pharmaceutical',
 'usa',
 'international',
 'care',
 'and',
 'nda',
 'coltd',
 'the',
 'home',
 'healthcare',
 '',
 'of',
 'group',
 'holdings',
 'capital',
 'technologies',
 'association',
 'us',
 'services',
 'university',
 'bank',
 'partners',
 'energy',
 'systems',
 'intl',
 'pharms',
 'american',
 'national']

<h4>Getting rid of top occuring tokens</h4>

In [86]:
ndc_cleaned = ndc_cleaned['company'].apply(lambda x: ' '.join([word for word in x.split() if word not in (top20)])) #Isabel's code

In [87]:
ndc_cleaned = pd.DataFrame(ndc_cleaned)

In [88]:
ndc_cleaned

Unnamed: 0,company
25,spironolactone %
26,-loreal
27,cardinal
28,church&dwight canada
29,preferred pharmaeutials
...,...
7046,zygone
7047,zyla life sciences
7048,name
7049,


<h4>Adding csv to working data</h4>

In [89]:
#match these datasets together

In [90]:
og = pd.read_excel("../data/original/BI DSPG Company Datasets/NDC_Company_Dataset.xls")

In [91]:
og = og.iloc[25:]

In [93]:
og['cleaned_name'] = ndc_cleaned.company.tolist()

In [94]:
og.head(20)

Unnamed: 0,Row Labels,cleaned_name
25,SPIRONOLACTONE 2%,spironolactone %
26,-L'Oreal USA Products Inc,-loreal
27,.Cardinal Health,cardinal
28,.Church & Dwight Canada Corp,church&dwight canada
29,{Preferred Pharmaeutials INC.,preferred pharmaeutials
30,"†Wal-Mart Stores, Inc.†",†wal-mart stores†
31,1 Veterans Health,veterans
32,111 Medco,medco
33,"1st Class Pharmaceuticals, Inc.",st class
34,1ST MEDX LLC,st medx


In [95]:
og = og.rename(columns = {"Row Labels": "original_company"})

In [96]:
og.tail()

Unnamed: 0,original_company,cleaned_name
7046,ZYGONE,zygone
7047,Zyla Life Sciences US Inc.,zyla life sciences
7048,#NAME?,name
7049,(blank),
7050,Grand Total,grand total


In [97]:
len(og)

7026

In [98]:
og.dropna(inplace = True)

In [100]:
og.drop([7049], inplace= True)

In [102]:
og.drop([7050], inplace = True)

In [103]:
og.tail()

Unnamed: 0,original_company,cleaned_name
7044,Zydus Pharmaceuticals USA Inc.,zydus
7045,Zydus Technologies Limited,zydus
7046,ZYGONE,zygone
7047,Zyla Life Sciences US Inc.,zyla life sciences
7048,#NAME?,name


In [107]:
og.to_csv("../data/working/ndc_clean.csv")

OSError: [Errno 28] No space left on device: '../data/working/ndc_clean.csv'

In [105]:
x = pd.read_csv("../data/working/ndc_clean.csv", index_col = [0])

In [106]:
x.tail(10)

Unnamed: 0,original_company,cleaned_name
7039,"ZOONO USA, LLC",zoono
7040,"ZRG DETOX, INC.",zrg detox
7041,Zunyi Jici Bio-Health Products Co Ltd,zunyi jici bio-health
7042,Zydus Pharmaceuticals (USA) Inc.,zydus
7043,Zydus Pharmaceuticals USA Inc,zydus
7044,Zydus Pharmaceuticals USA Inc.,zydus
7045,Zydus Technologies Limited,zydus
7046,ZYGONE,zygone
7047,Zyla Life Sciences US Inc.,zyla life sciences
7048,#NAME?,name
