In [62]:
import numpy as np
import pandas as pd
import string
import re
from fuzzywuzzy import fuzz

<h2>Reading in the datasets</h2>

In [2]:
dna = pd.read_csv("../data/working/validcompaniesdictionary.csv", index_col = [0])

In [3]:
fda = pd.read_excel("../data/original/fda_companies.xlsx")

In [129]:
ndc = pd.read_excel("../data/original/BI DSPG Company Datasets/NDC_Company_Dataset.xls")

<h2>Neil's code for cleaning</h2>

In [5]:
removeset=string.punctuation
removeset=removeset.replace("-","") #Don't remove dashes
removeset=removeset.replace("&","") #Don't remove ampersand
removeset=removeset.replace("_","") #Don't remove underscore
removeset=removeset.replace("%","") #Don't remove percent
removeset=removeset.replace("$","") #Don't remove dollar
   
print(removeset)

!"#'()*+,./:;<=>?@[\]^`{|}~


In [76]:
# remove all single characters (This step is done first, because later there are single chars we want to retain.)
#document = re.sub(r'\s+[a-zA-Z]\s+', ' ', str(X[sen]))
string = "hello i world"
string = re.sub(r'\s+[a-zA-Z]\s+', ' ', string)
print(string)

# remove all numbers
#document = re.sub(r'[0-9]','', document)
string2 = "h3ll0"
string2 = re.sub(r'[0-9]','', string)
print(string2)

# Substituting multiple spaces with single space
#document = re.sub(r'\s+', ' ', document, flags=re.I)
string3 = "hello      world"
string3 = re.sub(r'\s+', ' ', string3, flags=re.I)
print(string3)

#Converting to lowercase
string4 = "HEllo WORLD"
string4 = string4.lower()
print(string4)

#Removing prefixed 'b'
#document = re.sub(r'^b\s+', '', document)
string5 = "b hello world"
string5 = re.sub(r'^b\s+', '', string5)
print(string5)

#Make dashes into combined words
#document = re.sub(r'\s-\s+', '-', document)
string6 = "hello - world"
string6 = re.sub(r'\s-\s+', '-', string6)
print(string6)

#Make ampersand into combined words
#document = re.sub(r'\s&\s+', '&', document)
string7 = "hel & lo & world"
string7 = re.sub(r'\s&\s+', '&', string7)
print(string7)

#Make underscore into combined words
#document = re.sub(r'\s_\s+', '_', document)
string8 = "hel _ lo wo _ rld"
string8 = re.sub(r'\s_\s+', '_', string8)
print(string8)

hello world
hello world
hello world
hello world
hello world
hello-world
hel&lo&world
hel_lo wo_rld


In [77]:
#removes all punctuation in string that is in removeset
document = "Johnson+;Johnson!"
for i in removeset:
    document=re.sub(re.escape(i),"",document)
print(document)

JohnsonJohnson


<h2>Cleaning NDC</h2>

<h4>Removing the first 25 since they are just numbers</h4>

In [130]:
#Getting rid of the first 25 since those are just numbers
ndc = ndc.iloc[25:]
#renaming column
ndc = ndc.rename(columns = {'Row Labels':'company'})

<h4>Lowercase everything</h4>

In [131]:
#Converting to lower first
ndc.company = ndc.company.str.lower()

<h4>Remove content that are in parentheses</h4>

In [132]:
#Function for removing parentheses content
def removeParenthesesContent(string):
    return re.sub(r'\([^)]*\)', '', string)


In [133]:
ndc['companiesWithoutParensContent'] = ndc['company'].apply(removeParenthesesContent)

In [134]:
del ndc['company']

In [135]:
ndc.head()

Unnamed: 0,companiesWithoutParensContent
25,spironolactone 2%
26,-l'oreal usa products inc
27,.cardinal health
28,.church & dwight canada corp
29,{preferred pharmaeutials inc.


<h4>Remove Unwanted Punctuation</h4>

In [136]:
"""
Ignore 
#Function that uses regex to remove parentheses and square brackets
def removeParenthesis(string):
    return re.sub('[()\{}]', '', string)
"""
#function that gets rid of unwanted punctuation
#This does get rid of ' within a string (ex. l'oreal becomes l oreal) so maybe recheck?
def removeUnwantedPunc(string):
    return re.sub('[!"#\'()*+,./:;<=>?@[\]^`{|}~]', ' ', string)

In [137]:
ndc['companiesWithoutUnwantedPunc'] = ndc['companiesWithoutParensContent'].apply(removeUnwantedPunc)

In [138]:
del ndc['companiesWithoutParensContent']

In [139]:
ndc.rename(columns = {"companiesWithoutUnwantedPunc": "company"}, inplace = True)

In [140]:
#At this point all the companies are lowercased, don't have parenthetical content, and don't have unwanted punctuation
ndc.head()

Unnamed: 0,company
25,spironolactone 2%
26,-l oreal usa products inc
27,cardinal health
28,church & dwight canada corp
29,preferred pharmaeutials inc


<h4>Removing numbers</h4>

In [141]:
#Function that will remove numbers
def removeNumbers(string):
    return re.sub(r'[0-9+]','', string)

In [142]:
ndc['companiesWithNoNumbers'] = ndc.company.apply(removeNumbers)

In [143]:
del ndc['company']

In [144]:
ndc = ndc.rename(columns = {"companiesWithNoNumbers": 'company'})

In [145]:
ndc.head()

Unnamed: 0,company
25,spironolactone %
26,-l oreal usa products inc
27,cardinal health
28,church & dwight canada corp
29,preferred pharmaeutials inc


<h4>Removing Single Chars</h4>

In [146]:
def removeSingleChar(string):
    return re.sub(r'\s+[a-zA-Z]\s+', ' ', string)

In [147]:
ndc['companiesWithNoSingleChar'] = ndc.company.apply(removeSingleChar)

In [148]:
del ndc['company']
ndc = ndc.rename(columns={"companiesWithNoSingleChar": 'company'})

<h4>Substituting Multiple Spaces with Single Space</h4>

In [150]:
ndc.iloc[300:350]

Unnamed: 0,company
325,allure labs inc
326,allure labs inc
327,alma laboratories inc
328,almaject inc
329,almatica pharma inc
330,almay
331,almay inc
332,almirall llc
333,almus
334,alnylam pharmaceuticals inc


<h4>Grabbing the list of legal entities from os github</h4>

In [190]:
legalEntities = pd.read_csv("https://raw.githubusercontent.com/DSPG-Young-Scholars-Program/dspg20oss/danBranch/ossPy/keyFiles/curatedLegalEntitesRaw.csv", quotechar = "'",header = None)
legalEntities.head()

Unnamed: 0,0
0,(?i) Inc\b
1,(?i) Ltd\b
2,(?i) LLC\b
3,(?i) GmbH\b
4,(?i) Corporation\b


<h4>Using Daniel's code to get remove legal entities</h4>

In [173]:
def eraseFromColumn(inputColumn, eraseList):
    "iteratively delete regex query matches from input list"
    
    """
    inputColumn -- a column from a pandas dataframe, this will be the set of
    target words/entries that deletions will be made from
    eraseList -- a column containing strings (regex expressions) which will be
    deleted from the inputColumn, in an iterative fashion
    """
    eraseList['changeNum'] = 0
    eraseList['changeIndexes'] = ''
    
    inputColumn = inputColumn.replace(regex=True, to_replace = "\\\\", value='/')
    
    for index, row in eraseList.iterrows():
        curReplaceVal = row[0]
        currentRegexExpression=re.compile(curReplaceVal)
        CurrentBoolVec=inputColumn.str.contains(currentRegexExpression, na= False)
        eraseList['changeIndexes'].iloc[index]=[i for i, x in enumerate(CurrentBoolVec) if x]
        eraseList['changeNum'].iloc[index] = len(eraseList['changeIndexes'].iloc[index])
        inputColumn.replace(regex=True, to_replace=currentRegexExpression,value='', inplace = True)
    
    return inputColumn, eraseList

In [191]:
output, eraseList = eraseFromColumn(ndc.company, legalEntities)

In [204]:
#This dataframe is lowercased, doesn't have any of the unwanted punctuation, and has removed all legal entity labels
output= pd.DataFrame(output)

<h4>Counting Unique String Tokens using Daniel's code</h4>

Unnamed: 0,company
25,spironolactone 2%
26,-l'oreal usa products inc
27,.cardinal health
28,.church & dwight canada corp
29,{preferred pharmaeutials inc.
