# Program to clean assignee data from Patent View
Input files:
rawassignee.tsv

Output files:
rawpv_clean_23apr20.csv


In [1]:
import pandas as pd
import numpy as np
from cleanco import cleanco
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 
import  re
import urllib.request as urllib2
import json
import datetime

### rawassignee.tsv

In [2]:
df = pd.read_csv("rawassignee.tsv", delimiter='\t')

In [3]:
len(df) # no of observations in assignee.tsv is 6,387,373

6387373

In [4]:
df.head()

Unnamed: 0,uuid,patent_id,assignee_id,rawlocation_id,type,name_first,name_last,organization,sequence
0,0000p94wkezw94s8cz7dbxlvz,5856666,org_fijoKOoRhIzrkYzecWF9,orskbf54s58e97lkmw8na5rpx,2,,,U.S. Philips Corporation,0
1,00013vk881wap9u4mbo7lwwhp,5204210,org_UrbE3xev7LUsnuvRjbep,mue862v5lcjdhzqqk86ei75kj,2,,,Xerox Corporation,0
2,000192sn2u10kzpikl4s7h3r0,5302149,org_JcXwBlJtb1uvcPKHeaYX,o1h9dqdv0yq7dt1b1vmrcal9h,3,,,Commonwealth Scientific & Industrial Research ...,1
3,0001ycvv6sz1ju07ss99nhxi1,9104354,org_7fE5f5nnY6dbOc3vSaXb,rspbpqcajvm09r1ew9mgnpx37,3,,,Canon Kabushiki Kaisha,0
4,0001z7ws4m14aqdb3tv99u550,6584517,org_sj7olrHxASyJDNVGczBe,l1gyelp5jcg0hakk9smmhsdgr,2,,,Cypress Semiconductor Corp.,0


In [5]:
df[['uuid','organization','patent_id','assignee_id']].nunique()

uuid            6387373
organization     682984
patent_id       6177084
assignee_id      486381
dtype: int64

In [6]:
df.columns

Index(['uuid', 'patent_id', 'assignee_id', 'rawlocation_id', 'type',
       'name_first', 'name_last', 'organization', 'sequence'],
      dtype='object')

In [7]:
# remove redundant columns
del df['name_last'], df['name_first'], df['uuid'], df['rawlocation_id']

In [8]:
# remove organization is nan
df = df[~df['organization'].isna()]

In [9]:
df.nunique()

patent_id       6126629
assignee_id      438318
type                 18
organization     682984
sequence             16
dtype: int64

assignee type (1- Unassigned, 2 - US Company or Corporation, 3 - Foreign Company or Corporation, 4 - US Individual, 5 - Foreign Individual, 6 - US Federal Government, 7 - Foreign Government, 8 - US County Government, 9 - US State Government. Note: A "1" appearing before any of these codes signifies part interest)

Sequence: order in which assignee appears in patent file

In [10]:
len(df)

6318286

In [11]:
df_corp = df[df['type'].isin([2,3])]

In [12]:
df_corp.nunique() 

patent_id       6070872
assignee_id      436099
type                  2
organization     678463
sequence             16
dtype: int64

In [13]:
df.nunique()

patent_id       6126629
assignee_id      438318
type                 18
organization     682984
sequence             16
dtype: int64

In [14]:
df_corp.isna().sum()

patent_id       0
assignee_id     0
type            0
organization    0
sequence        0
dtype: int64

In [15]:
# remove any duplicates
df2 = df_corp[['assignee_id','organization']]

In [16]:
df2.nunique() #assignee_id 436099; irganization = 678463

assignee_id     436099
organization    678463
dtype: int64

In [17]:
df2 = df2.drop_duplicates(keep = 'first')

In [18]:
len(df2)

684849

In [19]:
df2['organization'] = df2['organization'].str.encode('ascii', 'ignore').str.decode('ascii')

In [20]:
df2.nunique() #677775

assignee_id     436099
organization    677775
dtype: int64

In [21]:
#################################################

In [22]:
# remove text from braces
df2['brace'] = df2['organization'].apply(lambda x: re.findall(r"\(.*\)",x))

In [23]:
df2['bracelen'] = df2['brace'].apply(lambda x: len(x))
df2['bracelen'].value_counts() # bracelen text in braces

0    670356
1     14493
Name: bracelen, dtype: int64

In [24]:
#extract text from brace as string
df2.loc[df2['bracelen']==1, 'bractext']  = df2[df2['bracelen']==1]['brace'].apply(lambda x: x[0][1:-1])

In [25]:
frq = df2[~df2['bractext'].isnull()]['bractext'].value_counts()

In [26]:
frq[(frq==5)]

ATO-DLO          5
Huizhou          5
San Diego        5
Cird Galderma    5
FPD              5
                ..
ICREA            5
A-P              5
Belgium          5
KGaA             5
UEB              5
Name: bractext, Length: 78, dtype: int64

In [27]:
# ANVAR, Kobe Steel, Ltd. ,Henkel KGaA ,USINOR,SEPM,KIGAM,RWTH,,APHP

In [28]:
df2['bractext'].fillna("", inplace=True)

In [29]:
# remove text from braces
df2['organization2'] = df2['organization'].str.replace(r"\(.*\)","")

In [30]:
df2[df2['organization2']==""]

Unnamed: 0,assignee_id,organization,brace,bracelen,bractext,organization2
841917,org_4LpvuqQrSxPINNkwHmIG,"(Kobe Steel, Ltd.)","[(Kobe Steel, Ltd.)]",1,"Kobe Steel, Ltd.",
2592662,org_m1wjmqxUyvyUHgBZoadC,(Sasol Technology (Proprietary) Limited),[(Sasol Technology (Proprietary) Limited)],1,Sasol Technology (Proprietary) Limited,
6132223,org_IWh3ZcciYsd6R9mI8Nkm,(INSERM) (Institut National de la Sante et de ...,[(INSERM) (Institut National de la Sante et de...,1,INSERM) (Institut National de la Sante et de l...,


In [31]:
df2.loc[df2.organization2=="","organization2" ] = df2.loc[df2.organization2=="","bractext"]

In [32]:
df2[df2['organization2']==""]

Unnamed: 0,assignee_id,organization,brace,bracelen,bractext,organization2


In [33]:
df2.columns

Index(['assignee_id', 'organization', 'brace', 'bracelen', 'bractext',
       'organization2'],
      dtype='object')

In [34]:
df2.bractext.value_counts()

                                  670356
UK                                   485
Proprietary                          386
Singapore                            264
Shanghai                             259
                                   ...  
Jiashan                                1
PIMS                                   1
a French Societe Anonyme               1
Idemitsu Kosan Co., Ltd.               1
Design Screen Haute Definition         1
Name: bractext, Length: 4316, dtype: int64

In [35]:
df2[(df2['bractext']=="UK")]['organization'] #Trailmor [Proprietary] Limited #Oxford Instruments (UK) Limited

954                Oxford Instruments (UK) Limited
13272                    Premium Genetics (UK) LTD
26509                         Meggitt (UK) Limited
27081            Dialog Semiconductor (UK) Limited
27230                Oxford Biomedica (UK) Limited
                            ...                   
6262472              Martin Manufacturing (UK) PLC
6266809              Wagon Automotive (UK) Limited
6270064    Bowe Systems and Machinery (UK) Limited
6290531                     Equipbaby (UK) Limited
6306129                     ITF Licensing (UK) LTD
Name: organization, Length: 485, dtype: object

In [36]:
# remove text from braces
df2['organization3'] = df2['organization2'].str.replace(r"\(.*\)","")

In [37]:
df2.loc[df2['bractext'].str.contains("Kobe Steel"), "organization3"] = "Kobe Steel, Ltd."

In [38]:
df2[df2['bractext'].str.contains("Kobe")]

Unnamed: 0,assignee_id,organization,brace,bracelen,bractext,organization2,organization3
7031,org_B2jlMPhNiLEbDrkLzyUa,"Kabushiki Kaisha Kobe Seiko Sho (Kobe Steel, L...","[(Kobe Steel, Ltd.)]",1,"Kobe Steel, Ltd.",Kabushiki Kaisha Kobe Seiko Sho,"Kobe Steel, Ltd."
102358,org_4LpvuqQrSxPINNkwHmIG,"Kabushiki Kaisha (Kobe Steel, Ltd.)","[(Kobe Steel, Ltd.)]",1,"Kobe Steel, Ltd.",Kabushiki Kaisha,"Kobe Steel, Ltd."
799330,org_B2jlMPhNiLEbDrkLzyUa,Kabushiki Kaisha Kobe Seiko Sho (Kobe Steel Ltd.),[(Kobe Steel Ltd.)],1,Kobe Steel Ltd.,Kabushiki Kaisha Kobe Seiko Sho,"Kobe Steel, Ltd."
841917,org_4LpvuqQrSxPINNkwHmIG,"(Kobe Steel, Ltd.)","[(Kobe Steel, Ltd.)]",1,"Kobe Steel, Ltd.","Kobe Steel, Ltd.","Kobe Steel, Ltd."
1748079,org_VnBMyp004Q1wOnzQ0ohl,"Kabushiki Kaisha Kobe Sieko Sho (Kobe Steel, L...","[(Kobe Steel, Ltd.)]",1,"Kobe Steel, Ltd.",Kabushiki Kaisha Kobe Sieko Sho,"Kobe Steel, Ltd."
3212511,org_uuvhltTWQnh0HjyWEX5u,"Kabuhsiki Kaisha Kobe Seiko Sho (Kobe Steel, L...","[(Kobe Steel, Ltd.)]",1,"Kobe Steel, Ltd.",Kabuhsiki Kaisha Kobe Seiko Sho,"Kobe Steel, Ltd."
4411669,org_B2jlMPhNiLEbDrkLzyUa,"Kabushiki Kaisha Kobe Seiko Sho (Kobe Steel, Ltd)","[(Kobe Steel, Ltd)]",1,"Kobe Steel, Ltd",Kabushiki Kaisha Kobe Seiko Sho,"Kobe Steel, Ltd."
4767778,org_B2jlMPhNiLEbDrkLzyUa,Kabushiki Kaisha Kobe Seiko Sho (Kobe Steel Ltd).,[(Kobe Steel Ltd)],1,Kobe Steel Ltd,Kabushiki Kaisha Kobe Seiko Sho .,"Kobe Steel, Ltd."
5326159,org_VnBMyp004Q1wOnzQ0ohl,"Kabushiki Kaisha Kobe Seiko Sho (Kobe Steel, L...","[(Kobe Steel, Ltd.)]",1,"Kobe Steel, Ltd.",Kabushiki Kaisha Kobe Seiko Sho Steel,"Kobe Steel, Ltd."


In [39]:
###check text from square braces

In [40]:
# remove text from braces
df2['brace1'] = df2['organization3'].apply(lambda x: re.findall(r"\[.*\]",x))

In [41]:
df2['bracelen1'] = df2['brace1'].apply(lambda x: len(x))
df2['bracelen1'].value_counts() # bracelen text in braces

0    684824
1        25
Name: bracelen1, dtype: int64

In [42]:
# extract text from brace as string
df2.loc[df2['bracelen1']==1, 'bractext1']  = df2[df2['bracelen1']==1]['brace1'].apply(lambda x: x[0][1:-1])

In [43]:
df2[~df2['bractext1'].isnull()]['organization3'].value_counts().index

Index(['[27]7.ai, Inc.', 'Rand Steel Technology [Proprietary] Limited',
       'Reuter Chemische Apparatebau KG [DE/DE] ', 'Generics [UK] Limited',
       'Generics [UK]Limited', 'Generics [UK]  Limited',
       'Pacific Biomedical Research, Inc. [Cell Mart, Inc.]',
       'P. Robertet & Cie [Societe Anomyme]',
       'BASF Lacke + Farben Aktiengesellschaft[DE/DE]',
       '[Bu:st]GmbH Beratungsunternehmen fr Systeme und Technologien',
       'Syminex [Societe Anonyme]',
       'Kinetic Energy Corporation [a wholly owned subsidiary of SolarWindow Technologies, Inc.]',
       '[X+1] Solutions, Inc.', '[24] 7.ai, Inc.',
       'DIRECTOR GENERAL, DEFENCE RESEARCH & DEVELOPMENT ORGANIZATION [DRDO]',
       'BRY AIR [ASIA] PVT. LTD.', '[24]7 .AI, INC.', '[24]7.AI, INC.',
       'True[X] Media Inc.', 'Textil Copper Andino S.A. [CL/CL]',
       '[24] 7 .ai, Inc.', 'Reuter Chemische Apparatebau KG [DE/DE]',
       '[24]7.ai, Inc.', '[24]7.AI, Inc.', 'Trailmor [Proprietary] Limited'],
      dty

In [44]:
df2['bractext1'].value_counts().index

Index(['24', 'DE/DE', 'UK', 'Proprietary',
       'a wholly owned subsidiary of SolarWindow Technologies, Inc.', 'Bu:st',
       'Societe Anonyme', 'X+1', 'X', 'Cell Mart, Inc.', 'Societe Anomyme',
       'ASIA', '27', 'CL/CL', 'DRDO'],
      dtype='object')

In [45]:
lst = ['UK', 'DE/DE', 'ASIA',  'Proprietary',
       'a wholly owned subsidiary of SolarWindow Technologies, Inc.',
       'Societe Anomyme', 'Bu:st', 'DRDO', 'Cell Mart, Inc.', 'CL/CL',
       'Societe Anonyme']

df2.loc[df2['bractext1'].isin(lst),"organization3"] = df2.loc[df2['bractext1'].isin(lst),"organization3"].str.replace(r"\[.*\]","")

In [46]:
df2[df2['bracelen1']==1][['organization','organization3']]

Unnamed: 0,organization,organization3
101286,Trailmor [Proprietary] Limited,Trailmor Limited
114949,Generics [UK] Limited,Generics Limited
273573,[Bu:st]GmbH Beratungsunternehmen fr Systeme un...,GmbH Beratungsunternehmen fr Systeme und Techn...
297148,"[X+1] Solutions, Inc.","[X+1] Solutions, Inc."
405573,"[24]7.ai, Inc.","[24]7.ai, Inc."
477425,Generics [UK]Limited,Generics Limited
583327,"[24]7.AI, Inc.","[24]7.AI, Inc."
1594156,Textil Copper Andino S.A. [CL/CL],Textil Copper Andino S.A.
1619350,"[24] 7.ai, Inc.","[24] 7.ai, Inc."
1812279,"[24]7.AI, INC.","[24]7.AI, INC."


In [47]:
df2.columns

Index(['assignee_id', 'organization', 'brace', 'bracelen', 'bractext',
       'organization2', 'organization3', 'brace1', 'bracelen1', 'bractext1'],
      dtype='object')

In [48]:
del df2['brace'], df2['bracelen'], df2['brace1'], df2['bracelen1']

In [49]:
df2.columns

Index(['assignee_id', 'organization', 'bractext', 'organization2',
       'organization3', 'bractext1'],
      dtype='object')

In [50]:
df2['bractext1'].fillna("", inplace = True)

In [51]:
df2.isna().sum()

assignee_id      0
organization     0
bractext         0
organization2    0
organization3    0
bractext1        0
dtype: int64

In [52]:
#####################

In [53]:
df2.columns

Index(['assignee_id', 'organization', 'bractext', 'organization2',
       'organization3', 'bractext1'],
      dtype='object')

In [54]:
#convert to lower case

In [55]:
df2['orglow'] = df2['organization3'].apply(lambda x: x.lower()) #601225

In [56]:
df2.orglow.nunique() #601225

601225

In [57]:
df2['orglow2'] = df2['orglow'].str.replace("\br&d\b","research and development")
df2['orglow2'] = df2['orglow2'].str.replace("\br & d\b","research and development")

In [58]:
df2.orglow2.nunique() #601143

601225

In [59]:
df2['orglow2'] = df2['orglow2'].str.replace('\.','', regex=True)
df2['orglow2'] = df2['orglow2'].str.replace(',',' ', regex=True)
df2['orglow2'] = df2['orglow2'].str.replace("'",'', regex=True)
df2['orglow2'] = df2['orglow2'].str.replace("/",' ', regex=True)
df2['orglow2'] = df2['orglow2'].str.replace('"','', regex=True)
df2['orglow2'] = df2['orglow2'].str.replace('-',' ', regex=True)
df2['orglow2'] = df2['orglow2'].str.replace('+',' ', regex=True)
df2['orglow2'] = df2['orglow2'].apply(lambda x: x.strip())

In [60]:
df2.orglow2.nunique() #544223

544305

In [61]:
###join single characters
regex = re.compile('(?<![a-zA-Z0-9]{2})(?<=[a-zA-Z0-9]{1}) +(?=[a-zA-Z0-9] |.$)')
df2['orglow2'] = df2['orglow2'].str.replace(regex, "")

In [62]:
df2['orglow2'].nunique() #541410

541491

In [63]:
df2['orglow2'] = df2['orglow2'].str.replace(' +', ' ')
df2['orglow2'].nunique() #504908

504990

In [64]:
df2["orglow2"] = df2["orglow2"].str.replace(" univ$",' university')
df2["orglow2"] = df2["orglow2"].str.replace(" univ ",' university ')
df2["orglow2"] = df2["orglow2"].str.replace(" mfg$",' manufacturing')
df2["orglow2"] = df2["orglow2"].str.replace(" mfg ",' manufacturing ')
df2["orglow2"] = df2["orglow2"].str.replace(" mfrs$",' manufacturers')
df2["orglow2"] = df2["orglow2"].str.replace(" mfrs ",' manufacturers ')

In [65]:
df2.orglow2.nunique() #504355

504437

In [66]:
df2['last'] = df2['orglow2'].apply(lambda x: x.split()[-1])

In [67]:
cnt = df2['last'].value_counts() #kaisha, kaisa

In [68]:
# cnt[(cnt>20)&(cnt<=23)].index

In [69]:
df2['orglow3'] = df2['orglow2']

In [70]:
df2[df2.orglow3.str.endswith(" corportion")]['orglow3']

7738       international business machines corportion
15377                                  emc corportion
37732                          nitto denko corportion
44217                             schering corportion
188444                            broadcom corportion
                              ...                    
6207869                          mobil oil corportion
6214519                        dow corning corportion
6240517                          albemarle corportion
6301344                united technologies corportion
6319922                       smith corona corportion
Name: orglow3, Length: 182, dtype: object

In [71]:
compend = ['public limited company','et al','ab', 'ag', 'akteingesellschaft', 'aktiebolag', 'aktiegesellschaft', 'aktiengesellchaft', 
               'aktiengesellschaft', 'aktiengesellshaft', 'aktiengessellschaft','aktiengesselschaft',
             'aktiengsellschaft','aps','arl','as',  'asa','assoc','associates','association',
             'atkiengesellschaft','bv', 'bvba','et cie',
               'sab de cv','limited company'
               'sa de cv',
               'ca','et cie', 'cie', 'cie ltee', '& co', '& c',
               'co', 'co l','comany',
               'cc','comapny','incorporee',
             'company', 'compnay', 'cooperation','coporation', 'coproration', 'cororation', 'corp','corporaiton',
             'corporated','corporatiion', 'corporatioin','corporation', 'corporation', 'corporations',
               'corportion',
            'corporatoin', 'corporaton','corportation','corproation','cooperatieve vennootschap','cv',
               'doo','ec',
               'ehf','ev',
             'gbr','gesmbh','gmbh','hf','ilp','in','inc','incoporated','incorporated','incorporates'
             'incorporation','incorported','ind','international','ip','is','intl','internatinal',
             'kabushiki kaisa','kabushiki kaisha', 'kabushiki kaishi','kabushiki kaisya','kabushikigaisha',
             'kabushikikaisha','kabushiki','kft','kg','kgaa','kk','kommanditgesellschaft','koncernovy podnik',
               'pty lt','pty ltd',
               'lc','lcc',
             'lda','limitada','limite','limited','limitee','limted','limtied','llc','lllp','llp','lp','lt','ltc',
            'ltd','ltda','lte','ltee','mbh','na','narodni podnik','naamloze vennootschap','nv','ohg','oy','oyj',
               'partnership','pc',
             'plc','pllc','podnik','pte','pty','roc','rt','s ar l','sa rl'
               's arl','sa','sarl',
               'sas','sau','sdi','sdn bhd',
             'se','sec','seisakusho','sho','slu','snc','societa per azioni','sp zoo','spa','srl','sro','ssr',
             'sssr','sud','the','ua','ulc','unlimited','vof','vzw','zo o','z oo']  
    

In [72]:
compend

['public limited company',
 'et al',
 'ab',
 'ag',
 'akteingesellschaft',
 'aktiebolag',
 'aktiegesellschaft',
 'aktiengesellchaft',
 'aktiengesellschaft',
 'aktiengesellshaft',
 'aktiengessellschaft',
 'aktiengesselschaft',
 'aktiengsellschaft',
 'aps',
 'arl',
 'as',
 'asa',
 'assoc',
 'associates',
 'association',
 'atkiengesellschaft',
 'bv',
 'bvba',
 'et cie',
 'sab de cv',
 'limited companysa de cv',
 'ca',
 'et cie',
 'cie',
 'cie ltee',
 '& co',
 '& c',
 'co',
 'co l',
 'comany',
 'cc',
 'comapny',
 'incorporee',
 'company',
 'compnay',
 'cooperation',
 'coporation',
 'coproration',
 'cororation',
 'corp',
 'corporaiton',
 'corporated',
 'corporatiion',
 'corporatioin',
 'corporation',
 'corporation',
 'corporations',
 'corportion',
 'corporatoin',
 'corporaton',
 'corportation',
 'corproation',
 'cooperatieve vennootschap',
 'cv',
 'doo',
 'ec',
 'ehf',
 'ev',
 'gbr',
 'gesmbh',
 'gmbh',
 'hf',
 'ilp',
 'in',
 'inc',
 'incoporated',
 'incorporated',
 'incorporatesincorporatio

In [73]:
def clean(colname, col):

    compend = ['public limited company','et al','ab', 'ag', 'akteingesellschaft', 'aktiebolag', 'aktiegesellschaft', 'aktiengesellchaft', 
               'aktiengesellschaft', 'aktiengesellshaft', 'aktiengessellschaft','aktiengesselschaft',
             'aktiengsellschaft','aps','arl','as',  'asa','assoc','associates','association',
             'atkiengesellschaft','bv', 'bvba','et cie',
               'sab de cv','limited company'
               'sa de cv',
               'ca','et cie', 'cie', 'cie ltee', '& co', '& c',
               'co', 'co l','comany',
               'cc','comapny',
             'company', 'compnay', 'cooperation','coporation', 'coproration', 'cororation', 'corp','corporaiton',
             'corporated','corporatiion', 'corporatioin','corporation', 'corporation', 'corporations',
               'corportion',
            'corporatoin', 'corporaton','corportation','corproation','cooperatieve vennootschap','cv',
               'doo','ec',
               'ehf','ev',
             'gbr','gesmbh','gmbh','hf','ilp','in','inc','incoporated','incorporated','incorporates'
             'incorporation','incorported','ind','international','ip','is','intl','internatinal',
             'kabushiki kaisa','kabushiki kaisha', 'kabushiki kaishi','kabushiki kaisya','kabushikigaisha',
             'kabushikikaisha','kabushiki','kft','kg','kgaa','kk','kommanditgesellschaft','koncernovy podnik',
               'pty lt','pty ltd',
               'lc','lcc',
             'lda','limitada','limite','limited','limitee','limted','limtied','llc','lllp','llp','lp','lt','ltc',
            'ltd','ltda','lte','ltee','mbh','na','narodni podnik','naamloze vennootschap','nv','ohg','oy','oyj',
               'partnership','pc',
             'plc','pllc','podnik','pte','pty','roc','rt','s ar l','sa rl'
               's arl','sa','sarl',
               'sas','sau','sdi','sdn bhd',
             'se','sec','seisakusho','sho','slu','snc','societa per azioni','sp zoo','spa','srl','sro','ssr',
             'sssr','sud','the','ua','ulc','unlimited','vof','vzw','zo o','z oo']  
    
    for each in compend:
        if (len(df2[df2[colname].str.endswith(each)]) >0 ):
            
            print(each,len(df2[df2[colname].str.endswith(each)]) )
            
            y  = " "+each + "$"
            
            df2.loc[df2[colname].str.endswith(each),col] = df2.loc[df2[colname].str.endswith(each),colname].str.replace(y,"")

            df2[col] = df2[col].apply(lambda x: x.strip())
            
#             return (df_corp)
            

In [74]:
clean('orglow2','orglow3')

public limited company 120
et al 91
ab 7398
ag 10800
akteingesellschaft 29
aktiebolag 682
aktiegesellschaft 28
aktiengesellchaft 22
aktiengesellschaft 1721
aktiengesellshaft 49
aktiengessellschaft 45
aktiengesselschaft 24
aktiengsellschaft 35
aps 992
arl 1017
as 6798
asa 189
assoc 29
associates 716
association 244
atkiengesellschaft 31
bv 6750
bvba 216
et cie 102
sab de cv 17
ca 1051
et cie 102
cie 445
cie ltee 7
& co 2810
& c 227
co 9971
co l 4
comany 28
cc 143
comapny 32
company 13807
compnay 31
cooperation 77
coporation 263
coproration 39
cororation 39
corp 12956
corporaiton 60
corporated 7447
corporatiion 23
corporatioin 29
corporation 40025
corporation 40025
corporations 72
corportion 182
corporatoin 21
corporaton 107
corportation 34
corproation 80
cooperatieve vennootschap 2
cv 707
doo 153
ec 544
ehf 66
ev 1165
gbr 122
gesmbh 171
gmbh 22752
hf 157
ilp 9
in 958
inc 190257
incoporated 45
incorporated 7421
incorported 30
ind 67
international 1460
ip 1040
is 771
intl 32
kabushiki kai

In [75]:
df2['orglow3'].isna().sum()

0

In [76]:
df2['orglow3'].nunique() #459460

459542

In [77]:
df2['orglow4'] = df2['orglow3']

In [78]:
clean('orglow4','orglow4')

public limited company 1
et al 4
ab 1074
ag 855
aktiebolag 7
aktiengesellschaft 26
aktiengesselschaft 1
aps 105
arl 37
as 2508
asa 249
assoc 59
associates 1952
association 109
bv 42
et cie 16
ca 4864
cie 174
& co 8179
& c 259
co 59122
comany 16
cc 134
comapny 28
company 11772
compnay 24
cooperation 10
cororation 1
corp 575
corporated 40
corporatioin 29
corporation 1552
corporation 274
corporations 11
corportion 4
corporatoin 21
corporaton 1
cv 42
doo 9
ec 3775
ehf 66
ev 228
gbr 5
gesmbh 181
gmbh 29790
hf 27
ilp 9
in 3446
inc 304
incorporated 37
ind 787
international 11047
ip 1906
is 3360
intl 203
internatinal 20
kabushiki kaisa 16
kabushiki kaisha 42
kabushikikaisha 1
kabushiki 10
kft 2
kg 172
kgaa 5
kk 95
kommanditgesellschaft 4
lc 114
lcc 5
lda 14
limite 1
limited 1207
limitee 1
limted 9
llc 56
lllp 49
llp 705
lp 100
lt 773
ltc 5
ltd 535
ltda 2
lte 39
ltee 10
mbh 377
na 2178
nv 72
ohg 11
oy 590
oyj 2
partnership 102
pc 166
plc 7
pllc 1
pte 1909
pty 8041
roc 61
rt 2571
sa 3618
sarl 5


In [79]:
df2['orglow4'].isna().sum()

0

In [80]:
clean('orglow4','orglow4')

ab 1202
ag 1243
aktiebolag 3
aktiengesellschaft 15
aps 123
arl 48
as 2891
asa 100
associates 107
association 1
bv 26
ca 5049
cie 43
& co 213
& c 10
co 5286
comany 1
cc 141
company 235
compnay 1
cooperation 1
corp 285
corporated 12
corporation 448
corporation 274
corporations 7
cv 26
doo 9
ec 4302
ev 235
gbr 5
gesmbh 7
gmbh 65
hf 27
ilp 2
in 3605
inc 131
incorporated 9
ind 278
international 394
ip 1083
is 3425
intl 8
internatinal 1
kabushiki kaisha 26
kabushikikaisha 1
kft 2
kg 157
kgaa 8
kk 94
kommanditgesellschaft 2
lc 90
lcc 5
lda 14
limite 1
limited 40
limitee 1
limted 3
llc 35
lllp 3
llp 3
lp 91
lt 787
ltc 5
ltd 87
ltda 1
lte 38
ltee 1
mbh 95
na 2150
nv 27
ohg 8
oy 430
partnership 2
pc 155
plc 5
pllc 1
pte 8
pty 5
roc 60
rt 2579
sa 3559
sarl 2
sas 120
sau 10
sdi 11
se 4256
sec 102
seisakusho 32
sho 560
snc 22
societa per azioni 8
spa 68
srl 40
sro 6
ssr 13
sud 21
the 30
ua 144
unlimited 1
vzw 2
z oo 3


In [81]:
df2['orglow4'].isna().sum()

0

In [82]:
df2['orglow4'].nunique() #443399

443846

In [83]:
df2['orglow5'] = df2['orglow4']

In [84]:
df2[df2.orglow5.str.endswith("illinois")]['orglow5'].unique()

array(['owens illinois',
       'the board of trustees of the university of illinois',
       'board of trustees of the university of illinois',
       'university of illinois', 'the board of trustees of the illinois',
       'its academic of illinois',
       'the board of trustrees of the university of illinois',
       'the board of trustees of university of illinois',
       'board of trustees of university of illinois',
       'board of trustees university of illinois',
       'the board of trustess of the university of illinois',
       'printpack illinois', 'isotech of illinois',
       'the board of trustees of the of the university of illinois',
       'the board of trustees of the universtiy of illinois',
       'the board of trustees of the univerity of illinois',
       'the board of trustees of the university of illinois a body corporate and politic of the state of illinois',
       'maploca of illinois', 'conservation technology of illinois',
       'the university of ill

In [85]:
def geoclean(colname, col):
    
    compend = ['north america','america', 'berlin','denmark','finland','france',
             'il',
             'ind','india','ireland','japan',
             'ky','nederland','new zealand','ny', 'uk','us','usa','italia',"israel", "netherlands","atlanta",
               "pennsylvania","new england" "new holland",'united states'] 

   
    for each in compend:

        if (len(df2[df2[colname].str.endswith(each)]) >0 ):
            
            print(each,len(df2[df2[colname].str.endswith(each)]) )

            y  = " "+each + "$"
            oe = "of "+ each
            ot = 'of the '+ each
            df2.loc[(df2[colname].str.endswith(each))&(~(df2[colname].str.endswith(oe))&(~df2[colname].str.endswith(ot))),
                    col] = df2.loc[(df2[colname].str.endswith(each))&(~(df2[colname].str.endswith(oe))&(~df2[colname].str.endswith(ot))),colname].str.replace(y,"")

            df2[col] = df2[col].apply(lambda x: x.strip())
            
#             return (df_corp)

In [86]:
geoclean('orglow5','orglow5')

north america 913
america 2049
berlin 189
denmark 69
finland 125
france 973
il 1630
ind 277
india 75
ireland 260
japan 762
ky 257
nederland 202
new zealand 100
ny 735
uk 824
us 2968
usa 2783
italia 544
israel 274
netherlands 176
atlanta 32
pennsylvania 74
united states 11


In [87]:
df2['orglow5'].isna().sum()

0

In [88]:
df2.orglow5.nunique() #441065

441534

In [89]:
df2[df2['orglow4'].str.endswith("united states")]['orglow5'].unique()

array(['ici', 'wolters kluwer', 'mayreder consult of the united states',
       'esb', 'nasa an agency of the united states',
       'sb pharmco puerto rico inc of the united states',
       'the equitable life assurance society of the united states'],
      dtype=object)

In [90]:
df2.orglow5.nunique() #441083

441534

In [91]:
df2['orglow6'] = df2['orglow5']

In [92]:
clean('orglow6','orglow6')

ab 1221
ag 971
aktiebolag 3
aktiengesellschaft 12
aps 125
arl 48
as 2956
asa 100
associates 8
association 1
bv 20
ca 3164
cie 15
& co 2
& c 1
co 5304
comany 1
cc 153
company 54
cooperation 1
coporation 1
corp 296
corporated 8
corporation 342
corporation 276
corporations 7
cv 27
doo 9
ec 4411
ev 249
gbr 5
gesmbh 7
gmbh 72
hf 27
ilp 2
in 3526
inc 143
incorporated 5
ind 278
international 65
ip 907
is 3511
kabushiki kaisha 27
kabushikikaisha 1
kabushiki 2
kft 2
kg 157
kgaa 3
kk 95
kommanditgesellschaft 2
lc 97
lcc 5
lda 15
limite 1
limited 30
limitee 1
limted 3
llc 39
lllp 3
llp 3
lp 94
lt 802
ltc 5
ltd 91
ltda 1
lte 39
ltee 1
mbh 95
na 2186
nv 27
ohg 8
oy 435
partnership 2
pc 165
plc 5
pllc 1
pte 8
pty 5
roc 60
rt 2611
sa 891
sarl 2
sas 121
sau 10
sdi 14
se 4305
sec 102
seisakusho 31
sho 561
snc 22
societa per azioni 8
spa 68
srl 40
sro 6
ssr 13
sud 21
the 38
ua 148
ulc 1
unlimited 1
vzw 2
z oo 3


In [93]:
df2['orglow6'].isna().sum()

0

In [94]:
df2.orglow6.nunique() #441329

441411

In [95]:
df2['orglow6'] = df2["orglow6"].str.replace("&",' ')
df2['orglow6'] = df2['orglow6'].apply(lambda x: x.strip())
df2['orglow6'] = df2['orglow6'].str.replace(' +', ' ')
regex = re.compile('(?<![a-zA-Z0-9]{2})(?<=[a-zA-Z0-9]{1}) +(?=[a-zA-Z0-9] |.$)')
df2['orglow6'] = df2['orglow6'].str.replace(regex, "")

In [96]:
df2.orglow6.nunique() #439249

439606

In [97]:
df2.columns

Index(['assignee_id', 'organization', 'bractext', 'organization2',
       'organization3', 'bractext1', 'orglow', 'orglow2', 'last', 'orglow3',
       'orglow4', 'orglow5', 'orglow6'],
      dtype='object')

In [98]:
del  df2['last']

In [99]:
df2['orglow6'] = df2['orglow6'].apply(lambda x: x.strip())

In [100]:
df2.orglow6.nunique() #439249

439606

In [101]:
df2['last'] = df2['orglow6'].apply(lambda x: x.split()[-1])

In [102]:
cnt = df2['last'].value_counts()

In [103]:
df2[df2['last']=="e"][['orglow','orglow5']]

Unnamed: 0,orglow,orglow5
15255,"ste d'application plastique, mecanique et elec...",ste dapplication plastique mecanique et electr...
55646,"shuang-ho-e co., ltd.",shuang ho e
100370,"prime research alliance e., inc.",prime research alliance e
123152,"dipl.-ing. schultz, wolfgang e.",dipl ing schultz wolfgang e
152803,"prime research alliance e, inc.",prime research alliance e
...,...,...
5829449,at & e corporation,at & e
5943490,"sphere e, llc",sphere e
6082676,columbus e. aps,columbus e
6201278,crs srl centro ricerche e,crs srl centro ricerche e


In [104]:
cnt[(cnt>100)&(cnt<=115)]

information    115
seal           115
aircraft       115
board          114
safe           114
              ... 
pa             101
assets         101
luxembourg     101
measurement    101
direct         101
Name: last, Length: 66, dtype: int64

In [105]:
# split

In [106]:
df2['orglow7'] = df2['orglow6']

In [107]:
splt = [' gmbh',' corp ',' co ',' limited ',' pty ltd ',' inc '," ab ", " ag ", " ohg ",' company'
        'kabishiki kaisha', 'kabushiki ku kaisha', 'kabnushiki kaisha', 'kab kaisha', ' inc ',' et Compagnie'
         ]
endst = ['ip','die','cokg',' c', 'switzerland', 'texas','deutschland']


for each in splt:
    if (len(df2[df2['orglow7'].str.contains(each)])>0): 
            print(each,len(df2[df2['orglow7'].str.contains(each)]) )
            df2['orglow7'] = df2['orglow7'].apply(lambda x : x.split(each)[0])
            df2['orglow7'] = df2['orglow7'].apply(lambda x: x.strip())

for each in endst:
    if (len(df2[df2['orglow6'].str.endswith(each)])>0): 
            print(each,len(df2[df2['orglow7'].str.endswith(each)]) )
            y = " "+each + "$"
            df2['orglow7'] = df2['orglow7'].apply(lambda x : x.replace(y,""))
            df2['orglow7'] = df2['orglow7'].apply(lambda x: x.strip())

 gmbh 4189
 corp  241
 co  1174
 limited  797
 pty ltd  44
 inc  745
 ab  97
 ag  732
 ohg  26
kabushiki ku kaisha 1
kabnushiki kaisha 1
kab kaisha 1
ip 895
die 242
cokg 6
 c 90
switzerland 145
texas 124
deutschland 472


In [108]:
df2.loc[df2['orglow7'].str.endswith(" of"),"orglow7"] = df2.loc[df2['orglow7'].str.endswith(" of"),"orglow6"]

In [109]:
df2.loc[df2['orglow3'].str.endswith(" de france"),"orglow7"] = df2.loc[df2['orglow3'].str.endswith(" de france"),"orglow7"]



In [110]:
df2.orglow7.nunique()  #435878

436177

In [111]:
print(df2.loc[254,'orglow'])

compagnie internationale pour l'informatique cii-honeywell bull 


In [112]:
#split after cii

In [113]:
for i in df2[df2['orglow7'].str.contains(" cii ")].index:
        df2.loc[i,"orglow7"] = df2.loc[i]["orglow7"].split(" cii ")[-1]

In [114]:
df2.orglow7.nunique() #435878

436166

In [115]:
df2['last'] = df2['orglow7'].apply(lambda x:x.split()[-1])

In [116]:
df2['orglen'] = df2['orglow7'].apply(lambda x: len(x.split()))

In [117]:
lst = df2[df2['orglen']>1]['last'].value_counts()

In [118]:
#mfg to manufacturing; of; ulc,  naamloze vennootschap, cooperatieve vennootschap, et cie, finland, new zealand
#cc, denmark,comapny, il, sdi, arl
# to technology

In [119]:
lst[(lst>45)&(lst<=50)].index

Index(['processes', 'mines', 'camera', 'diamonds', 'smelting', 'clinic',
       'merchandising', 'switch', 'nurnberg', 'norway', 'md', 'all',
       'irrigation', 'produktions', 'clean', 'manfacturing', 'fusion',
       'washington', 'iron', 'benz', 'squibb', 'lumber', 'white', 'harris',
       'nutzfahrzeuge', 'braun', 'metallurgie', 'auto', 'antenna', 'vac',
       'delivery', 'phones', 'city', 'missouri', 'compagnie', 'telegraph',
       'professional', 'robot', 'scientifiques', 'blue', 'kogaku', 'jet',
       'vertriebsgesellschaft', 'electronique', 'tactical', 'mechatronics',
       'biomaterials', 'unico', 'ai', 'tubes', 'screw', 'investors',
       'northern', '3', 'hall', 'microtechnique', 'seimitsu', 'energia',
       'crane', 'go', 'jena', 'wind', 'entwicklung', 'garden', 'int',
       'broadcasting', 'v', 'tecnologia', 'contact', 'doptique', 'tape',
       'methods', 'south', 'lines', 'island', 'pet', 'kunststoff', 'bath',
       'sc', 'teknik', 'angeletti', 'shokai', 'bever

In [120]:
df2[df2['last']=="hk"][['organization','orglow6']]

Unnamed: 0,organization,orglow6
72950,ConvenientPower HK Ltd.,convenientpower hk
118470,Prime View HK Limited,prime view hk
125469,WOW! CREATIONS HK LIMITED,wow! creations hk
159761,MRM HK Ltd.,mrm hk
292077,CM HK LIMITED,cm hk
313423,"BP Children's Product HK Co., Limited",bp childrens product hk
529997,"BP CHILDREN'S PRODUCTS HK CO., LIMITED",bp childrens products hk
562159,vitroTV HK Ltd,vitrotv hk
568498,"BP Children's Products HK Co., Ltd.",bp childrens products hk
593005,BP Children's Products HK Co. Limited,bp childrens products hk


In [121]:
df2[df2['orglow7'].str.contains("british teleco")]

Unnamed: 0,assignee_id,organization,bractext,organization2,organization3,bractext1,orglow,orglow2,orglow3,orglow4,orglow5,orglow6,last,orglow7,orglen
836,org_r8feDy6loObCsF87M7TK,British Telecommunications public limited company,,British Telecommunications public limited company,British Telecommunications public limited company,,british telecommunications public limited company,british telecommunications public limited company,british telecommunications public limited,british telecommunications public,british telecommunications public,british telecommunications public,public,british telecommunications public,3
5993,org_r8feDy6loObCsF87M7TK,"British Telecommuncations, plc",,"British Telecommuncations, plc","British Telecommuncations, plc",,"british telecommuncations, plc",british telecommuncations plc,british telecommuncations,british telecommuncations,british telecommuncations,british telecommuncations,telecommuncations,british telecommuncations,2
18619,org_r8feDy6loObCsF87M7TK,British Telecommunications,,British Telecommunications,British Telecommunications,,british telecommunications,british telecommunications,british telecommunications,british telecommunications,british telecommunications,british telecommunications,telecommunications,british telecommunications,2
32141,org_r8feDy6loObCsF87M7TK,BRITISH TELECOMMUNICATIONS public limited company,,BRITISH TELECOMMUNICATIONS public limited company,BRITISH TELECOMMUNICATIONS public limited company,,british telecommunications public limited company,british telecommunications public limited company,british telecommunications public limited,british telecommunications public,british telecommunications public,british telecommunications public,public,british telecommunications public,3
53185,org_r8feDy6loObCsF87M7TK,British Telecommunications plc,,British Telecommunications plc,British Telecommunications plc,,british telecommunications plc,british telecommunications plc,british telecommunications,british telecommunications,british telecommunications,british telecommunications,telecommunications,british telecommunications,2
53366,org_r8feDy6loObCsF87M7TK,British Telecommunications public limited comp...,,British Telecommunications public limited comp...,British Telecommunications public limited comp...,,british telecommunications public limited comp...,british telecommunications public limited company,british telecommunications public limited,british telecommunications public,british telecommunications public,british telecommunications public,public,british telecommunications public,3
70120,org_r8feDy6loObCsF87M7TK,British Telecommunications Public Limited Company,,British Telecommunications Public Limited Company,British Telecommunications Public Limited Company,,british telecommunications public limited company,british telecommunications public limited company,british telecommunications public limited,british telecommunications public,british telecommunications public,british telecommunications public,public,british telecommunications public,3
88593,org_r8feDy6loObCsF87M7TK,BRITISH TELECOMMUNICATIONS PUBLIC LIMITED COMPANY,,BRITISH TELECOMMUNICATIONS PUBLIC LIMITED COMPANY,BRITISH TELECOMMUNICATIONS PUBLIC LIMITED COMPANY,,british telecommunications public limited company,british telecommunications public limited company,british telecommunications public limited,british telecommunications public,british telecommunications public,british telecommunications public,public,british telecommunications public,3
238057,org_r8feDy6loObCsF87M7TK,British Telecommunications PLC,,British Telecommunications PLC,British Telecommunications PLC,,british telecommunications plc,british telecommunications plc,british telecommunications,british telecommunications,british telecommunications,british telecommunications,telecommunications,british telecommunications,2
268546,org_r8feDy6loObCsF87M7TK,British Telecommunications public company limited,,British Telecommunications public company limited,British Telecommunications public company limited,,british telecommunications public company limited,british telecommunications public company limited,british telecommunications public company,british telecommunications public,british telecommunications public,british telecommunications public,public,british telecommunications public,3


In [122]:
splt = ['sl','gmbh','companies','pvt','private','co','ag','ip','company','sp',
        'sa rl','et',
       'societa consortile per azioni','gmbh u','hb','et m','hk','incorp',
       'public limited company']

for each in splt:
    y = " "+ each + "$"
    df2['orglow7'] = df2['orglow7'].str.replace(y,"")

In [123]:
df2['orglow7'] = df2['orglow7'].str.replace("incorporated$","")
df2['orglow7'] = df2['orglow7'].str.replace("corporated$","")

In [124]:
########first#########

In [125]:
df2[df2.orglow7.str.endswith("technolgy")]

Unnamed: 0,assignee_id,organization,bractext,organization2,organization3,bractext1,orglow,orglow2,orglow3,orglow4,orglow5,orglow6,last,orglow7,orglen
469408,org_DMu8NH1j9MVIxDJBSQjQ,Massachusetts Institute of Technolgy,,Massachusetts Institute of Technolgy,Massachusetts Institute of Technolgy,,massachusetts institute of technolgy,massachusetts institute of technolgy,massachusetts institute of technolgy,massachusetts institute of technolgy,massachusetts institute of technolgy,massachusetts institute of technolgy,technolgy,massachusetts institute of technolgy,4
520114,org_m1wjmqxUyvyUHgBZoadC,Sasol Technolgy (Proprietary) Limited,Proprietary,Sasol Technolgy Limited,Sasol Technolgy Limited,,sasol technolgy limited,sasol technolgy limited,sasol technolgy,sasol technolgy,sasol technolgy,sasol technolgy,technolgy,sasol technolgy,2
528948,org_1XEPONiowauaN4VG5FNN,"Cisco Technolgy, Inc.",,"Cisco Technolgy, Inc.","Cisco Technolgy, Inc.",,"cisco technolgy, inc.",cisco technolgy inc,cisco technolgy,cisco technolgy,cisco technolgy,cisco technolgy,technolgy,cisco technolgy,2
547020,org_NTtD1DcRfYUaAKesU3t8,"Lavenir Technolgy, Inc.",,"Lavenir Technolgy, Inc.","Lavenir Technolgy, Inc.",,"lavenir technolgy, inc.",lavenir technolgy inc,lavenir technolgy,lavenir technolgy,lavenir technolgy,lavenir technolgy,technolgy,lavenir technolgy,2
584799,org_rnddWFedUWaEyFCXXTP3,Union Carbide Chemicals & Plastics Technolgy LLC,,Union Carbide Chemicals & Plastics Technolgy LLC,Union Carbide Chemicals & Plastics Technolgy LLC,,union carbide chemicals & plastics technolgy llc,union carbide chemicals & plastics technolgy llc,union carbide chemicals & plastics technolgy,union carbide chemicals & plastics technolgy,union carbide chemicals & plastics technolgy,union carbide chemicals plastics technolgy,technolgy,union carbide chemicals plastics technolgy,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6057680,org_0GC3A8lEiDv12UcxiylW,"Avid Technolgy, Inc.",,"Avid Technolgy, Inc.","Avid Technolgy, Inc.",,"avid technolgy, inc.",avid technolgy inc,avid technolgy,avid technolgy,avid technolgy,avid technolgy,technolgy,avid technolgy,2
6140461,org_5TjGMsLVqdRFjaIiUNoS,Microsoft Technolgy,,Microsoft Technolgy,Microsoft Technolgy,,microsoft technolgy,microsoft technolgy,microsoft technolgy,microsoft technolgy,microsoft technolgy,microsoft technolgy,technolgy,microsoft technolgy,2
6277130,org_189EkbiT2zT7L1wyvLxY,"Owens Corning Fiberglas Technolgy, Inc.",,"Owens Corning Fiberglas Technolgy, Inc.","Owens Corning Fiberglas Technolgy, Inc.",,"owens corning fiberglas technolgy, inc.",owens corning fiberglas technolgy inc,owens corning fiberglas technolgy,owens corning fiberglas technolgy,owens corning fiberglas technolgy,owens corning fiberglas technolgy,technolgy,owens corning fiberglas technolgy,4
6345328,org_hBWRkvtFeXFzoaFptnza,Wuxi Wio Technolgy Co.,,Wuxi Wio Technolgy Co.,Wuxi Wio Technolgy Co.,,wuxi wio technolgy co.,wuxi wio technolgy co,wuxi wio technolgy,wuxi wio technolgy,wuxi wio technolgy,wuxi wio technolgy,technolgy,wuxi wio technolgy,3


In [126]:
df2['orglow7'] = df2['orglow7'].str.replace("technolgy","technology")

In [127]:
df2['orglow7'].nunique() #435715

435716

In [128]:
df2['first'] = df2['orglow7'].apply(lambda x: x.split()[0])

In [129]:
cnt = df2['first'].value_counts()

In [130]:
cnt[(cnt>180)&(cnt<=200)]

solvay         200
northern       199
big            199
agfa           199
total          199
solar          197
health         195
imperial       194
xiamen         193
le             193
x              192
henkel         192
metal          192
mobile         191
sk             191
specialty      191
rhone          190
boehringer     189
deutsches      189
safety         189
nederlandse    189
computer       189
automated      188
star           188
brown          187
baker          186
dong           186
oxford         186
alfa           185
medtronic      184
showa          184
parker         184
gd             183
seiko          182
g              182
Name: first, dtype: int64

In [131]:
len(df2[df2['orglow7'].str.startswith("the")]) >0

True

In [132]:
lst = ['the', "kabushiki gaisha" , "kabushiki kaisha", "oy", "sa","nv",'aktiebolaget']

for i in lst:
    
    if len(df2[df2['orglow7'].str.startswith("the")])>0:
        print(i,len(df2[df2['orglow7'].str.startswith(i)]))
        y = '^' + i+" "
        df2['orglow7'] = df2['orglow7'].str.replace(y,"")

the 12693
kabushiki gaisha 13
kabushiki kaisha 2040
oy 362
sa 9154
nv 298
aktiebolaget 145


In [133]:
df2['orglow7'].isna().sum()

0

In [134]:
###join single characters
regex = re.compile('(?<![a-zA-Z0-9]{2})(?<=[a-zA-Z0-9]{1}) +(?=[a-zA-Z0-9] |.$)')
df2['orglow7'] = df2['orglow7'].str.replace(regex, "")
df2['orglow7'] = df2['orglow7'].apply(lambda x: x.strip())

In [135]:
df2['orglow7'].nunique() #432466

432728

In [136]:
df2.columns

Index(['assignee_id', 'organization', 'bractext', 'organization2',
       'organization3', 'bractext1', 'orglow', 'orglow2', 'orglow3', 'orglow4',
       'orglow5', 'orglow6', 'last', 'orglow7', 'orglen', 'first'],
      dtype='object')

In [137]:
del df2['first'], df2['orglen']

In [138]:
df2['orglow8'] = df2['orglow7']

In [139]:
df2['orglow8'] = df2['orglow8'].str.replace("telefonaktiebolaget","telephone")
df2['orglow8'] = df2['orglow8'].str.replace(r'(\bat\b\s*\bt\b)', 'att',regex=True)
df2['orglow8'] = df2['orglow8'].str.replace(" and "," ")
df2['orglow8'] = df2['orglow8'].str.replace(" and$","")
df2['orglow8'] = df2['orglow8'].str.replace(" labs$"," laboratories")
df2['orglow8'] = df2['orglow8'].str.replace(" labs "," laboratories ")
df2['orglow8'] = df2['orglow8'].str.replace(" lab$"," laboratory")
df2['orglow8'] = df2['orglow8'].str.replace(" lab "," laboratory ")
df2['orglow8'] = df2['orglow8'].str.replace(" tele communications$"," telecommunications")
df2['orglow8'] = df2['orglow8'].str.replace(" tele communications "," telecommunications ")
df2['orglow8'] = df2['orglow8'].str.replace(" telecom$"," telecommunications")
df2['orglow8'] = df2['orglow8'].str.replace(" bros$"," brothers")
df2['orglow8'] = df2['orglow8'].str.replace(" ill "," illinois ")
df2['orglow8'] = df2['orglow8'].apply(lambda x: x.strip())

In [140]:
df2['orglow8'] = df2['orglow8'].str.replace(" telecom$"," telecommunications")



In [141]:
df2[df2.orglow8.str.endswith(" telecom")]

Unnamed: 0,assignee_id,organization,bractext,organization2,organization3,bractext1,orglow,orglow2,orglow3,orglow4,orglow5,orglow6,last,orglow7,orglow8


In [142]:
df2.orglow8.nunique() #430290

430525

In [143]:
df2['orglow9'] = df2['orglow8']

In [144]:
# lst = ["8  8, Inc.", "8 x 8, Inc.", '8.times.8 Inc.',
#        '8.times.8, Inc.', '8.times.8, Inc.','8x8, Inc', '8x8, Inc.','88, Inc', '88, Inc.']

# df2.loc[df2['organization'].isin(lst),"orglow9"] = "8times8"

In [145]:
df2[df2.orglow9.str.contains("international tel")]

Unnamed: 0,assignee_id,organization,bractext,organization2,organization3,bractext1,orglow,orglow2,orglow3,orglow4,orglow5,orglow6,last,orglow7,orglow8,orglow9
285,org_U2TRh9WomWWWDaH5ybhG,International Telephone & Telegraph Corporation,,International Telephone & Telegraph Corporation,International Telephone & Telegraph Corporation,,international telephone & telegraph corporation,international telephone & telegraph corporation,international telephone & telegraph,international telephone & telegraph,international telephone & telegraph,international telephone telegraph,telegraph,international telephone telegraph,international telephone telegraph,international telephone telegraph
1640,org_U2TRh9WomWWWDaH5ybhG,International Telephone and Telegraph Corporation,,International Telephone and Telegraph Corporation,International Telephone and Telegraph Corporation,,international telephone and telegraph corporation,international telephone and telegraph corporation,international telephone and telegraph,international telephone and telegraph,international telephone and telegraph,international telephone and telegraph,telegraph,international telephone and telegraph,international telephone telegraph,international telephone telegraph
16821,org_qEk3LlB6IHFyTEuk6fZa,"International Teletext Communications, Inc.",,"International Teletext Communications, Inc.","International Teletext Communications, Inc.",,"international teletext communications, inc.",international teletext communications inc,international teletext communications,international teletext communications,international teletext communications,international teletext communications,communications,international teletext communications,international teletext communications,international teletext communications
66015,org_xfDxYDHkMr76Jk43Pn0f,International Telecommunication Corp.,,International Telecommunication Corp.,International Telecommunication Corp.,,international telecommunication corp.,international telecommunication corp,international telecommunication,international telecommunication,international telecommunication,international telecommunication,telecommunication,international telecommunication,international telecommunication,international telecommunication
89140,org_uk6w4xkwHSR1q7LAPoYr,International Teleservice Corporation,,International Teleservice Corporation,International Teleservice Corporation,,international teleservice corporation,international teleservice corporation,international teleservice,international teleservice,international teleservice,international teleservice,teleservice,international teleservice,international teleservice,international teleservice
95522,org_U2TRh9WomWWWDaH5ybhG,International Telephone and Telegraph Corp.,,International Telephone and Telegraph Corp.,International Telephone and Telegraph Corp.,,international telephone and telegraph corp.,international telephone and telegraph corp,international telephone and telegraph,international telephone and telegraph,international telephone and telegraph,international telephone and telegraph,telegraph,international telephone and telegraph,international telephone telegraph,international telephone telegraph
212768,org_U2TRh9WomWWWDaH5ybhG,International Telephone & Telegraph Corp.,,International Telephone & Telegraph Corp.,International Telephone & Telegraph Corp.,,international telephone & telegraph corp.,international telephone & telegraph corp,international telephone & telegraph,international telephone & telegraph,international telephone & telegraph,international telephone telegraph,telegraph,international telephone telegraph,international telephone telegraph,international telephone telegraph
227458,org_U2TRh9WomWWWDaH5ybhG,International Telephone and Telegraph,,International Telephone and Telegraph,International Telephone and Telegraph,,international telephone and telegraph,international telephone and telegraph,international telephone and telegraph,international telephone and telegraph,international telephone and telegraph,international telephone and telegraph,telegraph,international telephone and telegraph,international telephone telegraph,international telephone telegraph
291822,org_CXOQeXWZBbtSZa7UbbCW,International Telesystems Corp.,,International Telesystems Corp.,International Telesystems Corp.,,international telesystems corp.,international telesystems corp,international telesystems,international telesystems,international telesystems,international telesystems,telesystems,international telesystems,international telesystems,international telesystems
413734,org_U2TRh9WomWWWDaH5ybhG,International Telephone & Telegraph Corporation,,International Telephone & Telegraph Corporation,International Telephone & Telegraph Corporation,,international telephone & telegraph corporation,international telephone & telegraph corporation,international telephone & telegraph,international telephone & telegraph,international telephone & telegraph,international telephone telegraph,telegraph,international telephone telegraph,international telephone telegraph,international telephone telegraph


In [146]:
df2.loc[df2['orglow9'].str.contains("communications satellite"), "orglow9"] = "comsat"
df2.loc[df2['orglow9'].str.contains("general refractories"), "orglow9"] = "grefco"
df2.loc[df2['orglow9'].str.contains("american colloid"), "orglow9"] = "amcol"
df2.loc[df2['orglow9'].str.contains("aluminum company of america"), "orglow9"] = "alcoa"
df2.loc[df2['orglow9'].str.contains("minnesota min"), "orglow9"] = "3m"
df2.loc[df2['orglow9'].str.contains("3m innovative"), "orglow9"] = "3m"
df2['orglow9'] = df2['orglow9'].str.replace(" comm$"," communications")
# df2['orglow9'] = df2['orglow9'].str.replace("united states", "us")
df2.loc[df2['organization'].str.contains("Tampa Electric Company"), "orglow9"]= "teco"
df2['orglow9'] = df2['orglow9'].str.replace("america online", "aol")
df2['orglow9'] = df2['orglow9'].str.replace("atlantic richfield", "arco")
df2['orglow9'] = df2['orglow9'].str.replace("advanced technology material", "atmi")
df2.loc[df2['organization']=="Bell Canada","orglow9"] = "bce"
df2.loc[df2['organization']=="Bucklee-Mears Company","orglow9"] = "bmc"
df2.loc[df2['orglow9'].str.contains("federal national mortgage"), "orglow9"] = "fannie mae"
df2.loc[df2['orglow9'].str.contains("bt group"), "orglow9"] = "british telecommunications group"
df2.loc[df2['orglow9'].str.contains("crutcher resources"), "orglow9"] = "crc"
df2.loc[df2['orglow9'].str.contains("ecolab"), "orglow9"] = "economics laboratory"
df2.loc[df2['orglow9'].str.contains("educational computer"), "orglow9"] = "ecc"
df2.loc[df2['orglow9'].str.contains("electromagnetic sciences"), "orglow9"] = "ems"
df2.loc[df2['orglow9'].str.contains(" nec "), "orglow9"] = "nippon electric"
df2.loc[df2['orglow9'].str.contains("^nec "), "orglow9"] = "nippon electric"
df2.loc[df2['orglow9'].str.contains(" nec$"), "orglow9"] = "nippon electric"
df2.loc[df2['orglow9'].str.contains("^nec$"), "orglow9"] = "nippon electric"

df2.loc[df2['orglow9'].str.contains("netapp"), "orglow9"] = "network appliance"
df2.loc[df2['orglow9'].str.contains("nortel"), "orglow9"] = "northern telecommunication"
df2.loc[df2['orglow9'].str.contains("sgs thomson microele"), "orglow9"] = "stmicroelectronics"
df2.loc[df2['orglow9'].str.contains("southwest securities"), "orglow9"] = "sws"
df2.loc[df2['orglow9'].str.contains("tdk electronics"), "orglow9"] = "tokyo denki kagaku"
df2.loc[df2['orglow9'].str.contains("tesco"), "orglow9"] = "transnational energy systems"
df2.loc[df2['orglow9'].str.contains("ual corp"), "orglow9"] = "united airlines"
df2.loc[df2['orglow9'].str.contains("usg corp"), "orglow9"] = "united states gypsum"
df2.loc[df2['orglow9'].str.contains("^usf "), "orglow9"] = "usfreightways"
df2.loc[df2['orglow9'].str.contains(" usf$"), "orglow9"] = "usfreightways"
df2.loc[df2['orglow9'].str.contains(" usf "), "orglow9"] = "usfreightways"
df2.loc[df2['organization'].str.contains("Stec "), "orglow9"] = "simpletech"
df2.loc[df2['organization'].str.contains("NNA/S"), "orglow9"] = "novo nordisk"
df2.loc[df2['orglow9'].str.contains("fedex"), "orglow9"] = "federal express"
df2.loc[df2['organization'].str.contains("NMS Communications Corporation"), "orglow9"] = "natural microsystems"
df2['orglow9'] = df2['orglow9'].str.replace("gatx","general american transporation")
df2.loc[df2['organization'].str.contains("ITT Corporation"), "orglow9"] = "international telephone telegraph"
df2.loc[df2['orglow9'].str.contains("wabtec corp "), "orglow9"] = "westinghouse air brake technologies"

df2.loc[df2['orglow9'].str.contains("rhone poulenc"), "orglow9"] = "rhone poulenc"
df2.loc[df2['organization'].str.contains("IT&T Industries, Inc."), "orglow9"] = "international telephone telegraph"

df2.loc[df2['orglow9'].str.contains("alcatel"), "orglow9"] = "alcatel"






In [147]:
df2.loc[df2['organization']=="Bell Canada","orglow9"] = "bce"

In [148]:
# del df2['last']

In [149]:
df2.nunique()

assignee_id      436099
organization     677775
bractext           4316
organization2    675757
organization3    675748
bractext1            16
orglow           601225
orglow2          504437
orglow3          459542
orglow4          443846
orglow5          441534
orglow6          439606
last             136108
orglow7          432728
orglow8          430525
orglow9          429996
dtype: int64

In [150]:
df2.orglow9.nunique() #430358

429996

In [151]:
########join for patents########

In [152]:
df_corp = df_corp.drop_duplicates(keep = 'first')
df_corp['organization'] = df_corp['organization'].str.encode('ascii', 'ignore').str.decode('ascii')


In [153]:
pv = pd.merge(df_corp, df2, left_on = ["assignee_id","organization"], 
              right_on = ["assignee_id","organization"], 
              how = "left")

In [154]:
pv.nunique() #430358

patent_id        6070872
assignee_id       436099
type                   2
organization      677775
sequence              16
bractext            4316
organization2     675757
organization3     675748
bractext1             16
orglow            601225
orglow2           504437
orglow3           459542
orglow4           443846
orglow5           441534
orglow6           439606
last              136108
orglow7           432728
orglow8           430525
orglow9           429996
dtype: int64

In [155]:
pv.isna().sum()

patent_id        0
assignee_id      0
type             0
organization     0
sequence         0
bractext         0
organization2    0
organization3    0
bractext1        0
orglow           0
orglow2          0
orglow3          0
orglow4          0
orglow5          0
orglow6          0
last             0
orglow7          0
orglow8          0
orglow9          0
dtype: int64

In [156]:
pv['orglow9'] = pv['orglow9'].str.replace("wm wrigley", "william wrigley")

In [157]:
#############################

In [158]:
pv.columns

Index(['patent_id', 'assignee_id', 'type', 'organization', 'sequence',
       'bractext', 'organization2', 'organization3', 'bractext1', 'orglow',
       'orglow2', 'orglow3', 'orglow4', 'orglow5', 'orglow6', 'last',
       'orglow7', 'orglow8', 'orglow9'],
      dtype='object')

In [159]:
pv[pv['organization'].str.contains("AT&T")]

Unnamed: 0,patent_id,assignee_id,type,organization,sequence,bractext,organization2,organization3,bractext1,orglow,orglow2,orglow3,orglow4,orglow5,orglow6,last,orglow7,orglow8,orglow9
231,9565216,org_pSsLdPF4NrnYVGt5DrRI,2,"AT&T INTELLECTUAL PROPERTY I, L.P.",0,,"AT&T INTELLECTUAL PROPERTY I, L.P.","AT&T INTELLECTUAL PROPERTY I, L.P.",,"at&t intellectual property i, l.p.",at&t intellectual property i lp,at&t intellectual property i,at&t intellectual property i,at&t intellectual property i,at t intellectual property i,i,at t intellectual property i,att intellectual property i,att intellectual property i
266,8631163,org_pSsLdPF4NrnYVGt5DrRI,2,"AT&T Intellectual Property II, L.P.",0,,"AT&T Intellectual Property II, L.P.","AT&T Intellectual Property II, L.P.",,"at&t intellectual property ii, l.p.",at&t intellectual property ii lp,at&t intellectual property ii,at&t intellectual property ii,at&t intellectual property ii,at t intellectual property ii,ii,at t intellectual property ii,att intellectual property ii,att intellectual property ii
312,4635263,org_V1tOidwicuoWA736DE99,2,AT&T Bell Laboratories,0,,AT&T Bell Laboratories,AT&T Bell Laboratories,,at&t bell laboratories,at&t bell laboratories,at&t bell laboratories,at&t bell laboratories,at&t bell laboratories,at t bell laboratories,laboratories,at t bell laboratories,att bell laboratories,att bell laboratories
867,5257339,org_V1tOidwicuoWA736DE99,2,AT&T Bell Laboratories,0,,AT&T Bell Laboratories,AT&T Bell Laboratories,,at&t bell laboratories,at&t bell laboratories,at&t bell laboratories,at&t bell laboratories,at&t bell laboratories,at t bell laboratories,laboratories,at t bell laboratories,att bell laboratories,att bell laboratories
1377,9503684,org_pSsLdPF4NrnYVGt5DrRI,2,"AT&T Intellectual Property II, L.P.",0,,"AT&T Intellectual Property II, L.P.","AT&T Intellectual Property II, L.P.",,"at&t intellectual property ii, l.p.",at&t intellectual property ii lp,at&t intellectual property ii,at&t intellectual property ii,at&t intellectual property ii,at t intellectual property ii,ii,at t intellectual property ii,att intellectual property ii,att intellectual property ii
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6257103,7990848,org_pSsLdPF4NrnYVGt5DrRI,2,"AT&T Intellectual Property II, L.P.",0,,"AT&T Intellectual Property II, L.P.","AT&T Intellectual Property II, L.P.",,"at&t intellectual property ii, l.p.",at&t intellectual property ii lp,at&t intellectual property ii,at&t intellectual property ii,at&t intellectual property ii,at t intellectual property ii,ii,at t intellectual property ii,att intellectual property ii,att intellectual property ii
6257364,9848433,org_cqRqlQk0xbkxrNakdlEB,2,AT&T Mobility II LLC,1,,AT&T Mobility II LLC,AT&T Mobility II LLC,,at&t mobility ii llc,at&t mobility ii llc,at&t mobility ii,at&t mobility ii,at&t mobility ii,at t mobility ii,ii,at t mobility ii,att mobility ii,att mobility ii
6257673,7715412,org_tBdNUkbblSIviAl1Vcj9,2,AT&T Corp.,0,,AT&T Corp.,AT&T Corp.,,at&t corp.,at&t corp,at&t,at&t,at&t,at t,t,at t,att,att
6258093,9781184,org_cqRqlQk0xbkxrNakdlEB,2,AT&T MOBILITY II LLC,0,,AT&T MOBILITY II LLC,AT&T MOBILITY II LLC,,at&t mobility ii llc,at&t mobility ii llc,at&t mobility ii,at&t mobility ii,at&t mobility ii,at t mobility ii,ii,at t mobility ii,att mobility ii,att mobility ii


In [160]:
pv['orglow9'] = pv['orglow9'].str.replace("united states","us")


In [163]:
pv= pv[~(pv['orglow9']=="86")]

In [164]:
pv.to_csv("rawpv_clean_26Apr2020.csv", index = False)