## String matching with Fuzzywuzzy
### Loading required packages

In [3]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
from collections import Counter



In [8]:
fuzz.ratio("Albert Thompson", "Albert G. Thompson") 

91

### Read in xlsx file

In [5]:
fda = pd.read_excel('../data/original/fda_companies.xlsx',
              sheet_name='FDA Company List')
#There are 975 entries

In [4]:

dna = pd.read_csv('../data/working/validcompaniesdictionary.csv')

In [43]:
dna.head(35)

Unnamed: 0.1,Unnamed: 0,Code,Description
0,0,AA,AA PLC
1,2,AAAADJ,"Emperial Americas, Inc."
2,3,AAAAIY,"American Academy of Allergy, Asthma and Immuno..."
3,5,AAABBB,Bird Studies Canada
4,6,AAACK,Aesculap AG & Co. KG
5,8,AAACT,AAA Cooper Transportation
6,9,AAAEMG,ARA Asset Management Limited
7,12,AAAFES,Army and Air Force Exchange Service
8,13,AAAFTS,AAA Foundation for Traffic Safety
9,14,AAAHC,Accreditation Association for Ambulatory Healt...


### Splitting the data frame in half 

In [6]:
fda_sub = fda[487:]
fda_sub.columns

Index(['FDA Companies '], dtype='object')

## Two different methods for fuzzy matching:

The first method does not include any data cleaning, instead we will be using a weighted fuzzymatching to matching the company names from FDA to DNA data. 

The second method will only look at the first word of the company name and group them based on the first word of the company name, the assumption is that first word is representative of the same company. 

### What are the most common words in the entire data frame? 

In [8]:
word_freq = Counter()
for words in fda['FDA Companies '] and dna['Description']:
    word_freq.update(str(words).split(" "))

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [6]:
dna_word_freq = Counter()
for words in dna['Description']:
    dna_word_freq.update(str(words).split(" "))

In [31]:
word_freq.most_common(30)

[('PHARMS', 190),
 ('INC', 162),
 ('LLC', 82),
 ('PHARMA', 81),
 ('LABS', 62),
 ('LTD', 58),
 ('PHARM', 51),
 ('CO', 30),
 ('USA', 21),
 ('US', 20),
 ('HLTHCARE', 16),
 ('AND', 15),
 ('TEVA', 13),
 ('CORP', 13),
 ('INTL', 13),
 ('ACTAVIS', 12),
 ('THERAP', 12),
 ('MYLAN', 12),
 ('MEDCL', 11),
 ('THERAPS', 10),
 ('PERRIGO', 10),
 ('VALEANT', 10),
 ('AIR', 7),
 ('LP', 7),
 ('AMERICA', 7),
 ('HOLDINGS', 7),
 ('MEDICAL', 7),
 ('RES', 7),
 ('UNIV', 7),
 ('NORTH', 7)]

In [7]:
dna_word_freq.most_common(30)

[('Inc', 10371),
 ('Inc.', 9261),
 ('Ltd', 5618),
 ('of', 3159),
 ('Group', 2952),
 ('LLC', 2800),
 ('Corporation', 2723),
 ('Limited', 2330),
 ('Corp', 2239),
 ('&', 2086),
 ('Company', 1927),
 ('Ltd.', 1895),
 ('International', 1704),
 ('Holdings', 1652),
 ('PLC', 1527),
 ('Co', 1508),
 ('and', 1228),
 ('Capital', 1222),
 ('The', 1145),
 ('Health', 1118),
 ('Medical', 1076),
 ('Corp.', 1069),
 ('Bank', 1066),
 ('SA', 1041),
 ('University', 1018),
 ('Technologies', 977),
 ('Energy', 951),
 ('AG', 922),
 ('Association', 901),
 ('Partners', 877)]

In [39]:
#This takes only the first word of the company name and adds to a list
abb_company = set()
for company in fda_sub['FDA Companies ']:
    abb_company.add(company.split(" ", 1)[0])
    

len(abb_company)
#fda_sub.head(10)

323

In [47]:
innov_comp = []
for name in dna['Description']:
    match = process.extractOne(name, abb_company)
    innov_comp.append(name)
    innov_comp.append(match)
#    if  fuzz.ratio(name, abb_company) > 85:

In [53]:
innov_comp[0:100]

['AA PLC',
 ('PLX', 60),
 'Emperial Americas, Inc.',
 ('NCM', 72),
 'American Academy of Allergy, Asthma and Immunology',
 ('MERIDIAN', 68),
 'Bird Studies Canada',
 ('SECAN', 72),
 'Aesculap AG & Co. KG',
 ('UCLA', 68),
 'AAA Cooper Transportation',
 ('TAIHO', 72),
 'ARA Asset Management Limited',
 ('TEDOR', 68),
 'Army and Air Force Exchange Service',
 ('VANDA', 72),
 'AAA Foundation for Traffic Safety',
 ('WATSON', 60),
 'Accreditation Association for Ambulatory Health Care',
 ('RELIANT', 64),
 'Anhui Anli Material Technology Co Ltd',
 ('MATRIX', 75),
 'AAA Mid-Atlantic Inc',
 ('NCM', 72),
 'Cheniere Energy Partners, L.P.',
 ('NEOS', 68),
 'A&A Pharmachem Inc',
 ('PHARM', 90),
 'Assa Abloy Australia Pacific Pty Ltd',
 ('PACIFIC', 90),
 'American Association of Advertising Agencies',
 ('RISING', 75),
 'Al al-Bayt University',
 ('UNIV', 90),
 'AABB',
 ('MUSTAFA', 51),
 'AaB A/S',
 ('SB', 60),
 'Kreditbanken A/S',
 ('NEW', 60),
 'ABN Amro Bank (Luxembourg) S.A.',
 ('TARO', 68),
 'Allis