# Program to clean CRSP data 
 Input files:



 - CRSP_Names_Data_Monthly.txt
 
 
 Output files: 


 - crsp_clean_23Apr20.csv


In [1]:
import pandas as pd
import re
from cleanco import cleanco
from fuzzywuzzy import fuzz 
import numpy as np   
import datetime

In [2]:
now = datetime.datetime.now()

today =  (now.strftime("%d%b%Y"))
today

'28Apr2020'

# CRSP data

In [3]:
crsp = pd.read_csv('CRSP_Names_Data_Monthly.txt', sep = '\t')

In [4]:
crsp.head()

Unnamed: 0,PERMNO,date,SHRCD,EXCHCD,TICKER,COMNAM,SHRCLS,PERMCO,RET
0,10000,19851231,,,,,,7952,
1,10000,19860131,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,A,7952,C
2,10000,19860228,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,A,7952,-0.257143
3,10000,19860331,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,A,7952,0.365385
4,10000,19860430,10.0,3.0,OMFGA,OPTIMUM MANUFACTURING INC,A,7952,-0.098592


In [5]:
crsp.nunique()

PERMNO     32985
date        1117
SHRCD         26
EXCHCD        17
TICKER     30616
COMNAM     40493
SHRCLS        16
PERMCO     29481
RET       432156
dtype: int64

In [6]:
del crsp['SHRCD'],crsp['EXCHCD'],crsp['TICKER'],crsp['SHRCLS'], crsp['RET']

# preparing crsp data

In [7]:
crsp['date'] =crsp['date'].apply(lambda x : str(x))
crsp['date'] = crsp['date'].apply(lambda x : x[:4]+"-"+x[4:6]+"-"+x[6:])

In [8]:
crsp['date']

0          1985-12-31
1          1986-01-31
2          1986-02-28
3          1986-03-31
4          1986-04-30
              ...    
4514425    2018-08-31
4514426    2018-09-28
4514427    2018-10-31
4514428    2018-11-30
4514429    2018-12-31
Name: date, Length: 4514430, dtype: object

In [9]:
crsp['date'] = pd.to_datetime(crsp['date'],format="%Y-%m-%d")

In [10]:
crsp['crsp_year'] = crsp['date'].dt.year

In [11]:
crsp['crsp_year'].sort_values().unique()

array([1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935,
       1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946,
       1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957,
       1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968,
       1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979,
       1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990,
       1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
       2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
       2013, 2014, 2015, 2016, 2017, 2018])

In [12]:
crsp.isna().sum() #31257 comnam are null

PERMNO           0
date             0
COMNAM       31257
PERMCO           0
crsp_year        0
dtype: int64

In [13]:
# remove na
crsp2 = crsp[~crsp['COMNAM'].isna()] #40493 comnam unique

In [14]:
crsp2.nunique() #40493

PERMNO       32985
date          1117
COMNAM       40493
PERMCO       29481
crsp_year       94
dtype: int64

In [15]:
crsp2.isna().sum()

PERMNO       0
date         0
COMNAM       0
PERMCO       0
crsp_year    0
dtype: int64

In [16]:
del crsp2['date']

In [17]:
crsp2['crsp_year'] = crsp2['crsp_year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
# select only crsp values from 1976-? 
crsp2 = crsp2[~crsp2['crsp_year'].isna()]


In [19]:
crsp2.nunique() #36656

PERMNO       32985
COMNAM       40493
PERMCO       29481
crsp_year       94
dtype: int64

In [20]:
len(crsp2)

4483173

In [21]:
crsp2 = crsp2.drop_duplicates(keep = "first")

In [22]:
len(crsp2) #413474

413474

In [23]:
##select only years after 1975

crsp2 = crsp2[crsp2['crsp_year']>1975]

In [24]:
len(crsp2) #333162

333162

In [25]:
##remove spaces between single characters. e.g. A C F INDUSTRIES == ACF INDUSTRIES
regex = re.compile('(?<![a-zA-Z0-9]{2})(?<=[a-zA-Z0-9]{1}) +(?=[a-zA-Z0-9] |.$)')

In [26]:
crsp2['COMNAM2'] = crsp2['COMNAM'].str.replace(regex, "")

In [27]:
crsp2.loc[crsp2['COMNAM2'].str.contains(' PREM MUN INC FD'),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' PREM MUN INC FD'),'COMNAM2'].str.replace(' PREM MUN INC FD'," PREMIUM MUNICIPAL INCOME FUND")
crsp2.loc[crsp2['COMNAM2'].str.contains(' PREM INC RLTY FD'),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' PREM INC RLTY FD'),'COMNAM2'].str.replace(' PREM INC RLTY FD'," PREMIUM INCOME REALTY FUND")
crsp2.loc[crsp2['COMNAM2'].str.contains(' PREM INC MUN FD'),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' PREM INC MUN FD'),'COMNAM2'].str.replace(' PREM INC MUN FD'," PREMIUM INCOME MUNICIPAL FUND")

crsp2.loc[crsp2['COMNAM2'].str.contains(' DIV INC '),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' DIV INC'),'COMNAM2'].str.replace(' DIV INC'," DIVISION INCOME")
crsp2.loc[crsp2['COMNAM2'].str.contains(' PWR & LT'),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' PWR & LT'),'COMNAM2'].str.replace(' PWR & LT'," POWER & LIGHT")
crsp2.loc[crsp2['COMNAM2'].str.contains(' POWER & LT'),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' POWER & LT'),'COMNAM2'].str.replace(' POWER & LT'," POWER & LIGHT")

crsp2.loc[crsp2['COMNAM2'].str.contains(' SVGS '),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' SVGS '),'COMNAM2'].str.replace(' SVGS '," SAVINGS ")

crsp2.loc[crsp2['COMNAM2'].str.contains(' PR DIV FD '),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' PR DIV FD '),'COMNAM2'].str.replace(' PR DIV FD '," PREMIUM DIVIDEND FUND ")

crsp2.loc[crsp2['COMNAM2'].str.contains(' MUN INC '),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' MUN INC '),'COMNAM2'].str.replace(' MUN INC '," MUNICIPAL INCOME ")

crsp2.loc[crsp2['COMNAM2'].str.contains(' INFTN-LNKD OP&IN F'),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' INFTN-LNKD OP&IN F'),'COMNAM2'].str.replace(' INFTN-LNKD OP&IN F'," INFLATION-LINKED OPTIONS & INCOME FUND")

crsp2.loc[crsp2['COMNAM2'].str.contains(' INFLTN-LNKD '),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' INFLTN-LNKD '),'COMNAM2'].str.replace(' INFLTN-LNKD '," INFLATION-LINKED ")

crsp2.loc[crsp2['COMNAM2'].str.contains(' INC FD'),'COMNAM2'] =crsp2.loc[crsp2['COMNAM2'].str.contains(' INC FD'),'COMNAM2'].str.replace(' INC FD'," INCOME FUND")



In [28]:
crsp2.nunique() #36639 COMNAM2

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      36639
dtype: int64

In [29]:
crsp2['last'] = crsp2['COMNAM2'].apply(lambda x: x.split()[-1])
l = crsp2['last'].value_counts() 


In [30]:
# l[(l>10)&(l<=15)].index

In [31]:
# TR = trust, FD = fund; split after DE; AMER = america; ASSN = association; INVS = investments; 
# mktg = marketing; rlty = realty; SYS = systems; PPTYS = properties; natl = national; assoc = associates
# svcs = services; labs = laboratories; assn = association; invt = investment;
# HLDGS = holdings; secs = securities; wks = works; incm = income; rr = railroad; PTNR = partner
# hldg = holding; brdrs = breeders ; bk = bank

In [32]:
##last
# 'INC', 'CORP', 'CO', 'LTD', 'NEW', 'PLC', 'LP',
#        , 'SA', 'COMPANY', 'DEL', 'DE', 'IN',
#        'NV', 'SAB DE CV', 'SHS', 'LIMITED''LLC','SPA',TECHNOLOGY', 'TECHNOLOGIES','AS','COR'
#      'INTERNATIONAL','FSB','AB','COMPANIES','AB','THE','INTL','COS','AKTIEBOLAGET,'INCORPORATED',
## 'AKTIENGESELLSCHAFT','AKTIEBOLAG',''


In [33]:
#CT : "GREENWICH CT","NEW HAVEN CT", "HRFD CT","PK CT","WETHERSFIELD CT","BRISTOL CT","TORRINGTON CT",
#  "NEW BRIT CT","WALLINGFORD CT","ASONIA CT"
#MA : "NORTH QUINCY MA","SOMERVILLE MA","BRIGHTON MA","LEOM MA","NWT MA","WORCESTER MA","NRTHMPT MA"
        #"BROCKTON MA", "BOSTON MA", "LOWELL MA","BROCKTON MA","MILFORD MA","MA"
#MICH: TROY MICH, DETROIT MICH, MICH
#VA: "VIENNA VA","RCH VA","RICHMOND VA","STAUNTON VA","ALEX VA","W VA","WEST VA,VA"
#NC: "CATAWBA NC","CHR NC","HICKORY NC","SALIS NC", "GASTONIA NC", "RALEIGH NC", "NC"
#OH : "CINCINNATTI OH","NORTHERN OH", "MIDDLETOWN OH", "XENIA OH", "TOLEDO OH", "OH"
#GA : "SOUTHEAST GA", "GA"
#TX: "HOUSTON TX","HOUS TX", "WACO TX", "DALLAS TX","TX"
#NJ: "S RIVER NJ","HAMM NJ","VINELAND NJ","NEPTUNE NJ","PARAMUS NJ","NEWARK NJ","SDLE BRK NJ","PLAINFIELD NJ",
        #"LINWOOD NJ", "CHERRY HILL NJ","NJ"
#WA: BELLEVUE WA, "TACOMA WA","WA"
#Mass: "QUINCY MASS", "HOLYOKE MASS","CONCORD MASS"
#MN: "MINTKA MN","MN"
#TN: "DICKSON TN", "COLUMBIA TN","CHATT TN","TULL TN","TN"
#NY: "ALBANY NY","BROOKLYN NY", "MIDDLETOWN NY","HMPSTEAD NY","NY"
#LA: "CALIF LA","LA"; FIRST INTERSTATE BANK OF S LA = retain
#AL:"DECATUR AL","MONTGOMERY AL","AL"
#NH: "DOVER NH","NASHUA NH","PETERBOROUGH NH"; PUBLIC SERVICE CO OF NH - retain
#NE: "OMAHA NE","NE"
#WI: "MILWAUKEE WI","WI" 
#AZ: "PHOENIX AZ","AZ"
#WV: "CHARLES WV","WV"
#MT : "RCK MT","ROCKEY MT","MT"
#IN: "IN"
#CA: "SAN FRANCISCO CA","PORTERVILLE CA", "IRVINE CA", "SAN FRAN CA", "SHERMAN OAKS CA","ANAHEIM CA" 
 #       "TUSTIN CA", "SAN JOSE CA", "SOUTHN CA", "GLE CA", "VISTA CA", "SHERMAN OAKS CA","IRVINE CA",
        #"SANTA ANA CA","PORTERVILLE CA", "CA"
#CALIF:  SAN JOSE CALIF, CALIF,
#WIS: NEW WIS, WIS
#OKLA: OKLA
#NL : NL
#ME ; PORTLAND ME, ME
#TENN: TENN
#CONN: NEW BRITAIN CONN, HEW HAVEN CONN, CONN
#FL : MIAMI FL, CHARL FL, PERRY FL, FT LAUD FL, DEF FL,BRKSVL FL, FL
#OHIO: OHIO
#FLA: FLA
#MINN: MINN
#ME : ME
#NEV: NEV
#WY: WY
#BERMUDA: BERMUDA
#DC: WASHINGTON DC, WASH DC, DC
#OR: COOS BAY OR, OR
#ILL : ILL
#QUEBEC: QUEBEC
#QUE: 
#CANADA: CANADA
#USA: BOSTON USA, USA
#BOSTON: BOSTON
#BOSTN: BOSTN
#NEW ORLEANS: NEW ORLEANS
#NEW ZEALAND: NEW ZEALAND
#OK: OK
#AK: AK
#S AFRICA: S AFRICA
#WASH SEATTLE: WASH SEATTLE, PUGET SOUND SEATTLE, SEATTLE
#NORTH AMERICA: NORTH AMERICA; change amer to america and then if of america or of amer - don't
#CALIFORNIA:CALIFORNIA
#CONN WATERBURY: CONN WATERBURY
#BRIDGEPORT: BRIDGEPORT
#ARIZ: ARIZ
#KANSAS: KANSAS
#PENNSYLVANIA, PENNSYLVANIA
#JAPAN: JAPAN
#ISRAEL TEL AVIV: ISRAEL TEL AVIV
#SANTA BARBARA: SANTA BARBAR
#IRELAND: IRELAND
#IOWA: IOWA
#UT: UT
#SAN FRAN: SAN FRAN
#MT: ROCKEY MT, RCK MT, MT
#ATLANTA: GA ATLANTA,  ATLANTA
#CAL: CAL
#MO: LEBANON MO, MO; of MO
#NM: CLOVIS NM, NM
#COLO, FA, MS, TAMPA

In [34]:
##standardize words
crsp2['COMNAM2'] = crsp2['COMNAM2'].str.replace(" AMER$"," AMERICA")
crsp2['COMNAM2'] = crsp2['COMNAM2'].str.replace(" AMER "," AMERICA ")

In [35]:
crsp2[crsp2['COMNAM'].str.startswith("UNITED STATES")]

Unnamed: 0,PERMNO,COMNAM,PERMCO,crsp_year,COMNAM2,last
36717,10215,UNITED STATES VIDEO VENDING CORP,8052,1986,UNITED STATES VIDEO VENDING CORP,CORP
36725,10215,UNITED STATES VIDEO VENDING CORP,8052,1987,UNITED STATES VIDEO VENDING CORP,CORP
128208,10803,UNITED STATES FACILITIES CORP,8696,1986,UNITED STATES FACILITIES CORP,CORP
128211,10803,UNITED STATES FACILITIES CORP,8696,1987,UNITED STATES FACILITIES CORP,CORP
128223,10803,UNITED STATES FACILITIES CORP,8696,1988,UNITED STATES FACILITIES CORP,CORP
...,...,...,...,...,...,...
4513569,93425,UNITED STATES BRENT OIL FUND LP,53442,2014,UNITED STATES BRENT OIL FUND LP,LP
4513581,93425,UNITED STATES BRENT OIL FUND LP,53442,2015,UNITED STATES BRENT OIL FUND LP,LP
4513593,93425,UNITED STATES BRENT OIL FUND LP,53442,2016,UNITED STATES BRENT OIL FUND LP,LP
4513605,93425,UNITED STATES BRENT OIL FUND LP,53442,2017,UNITED STATES BRENT OIL FUND LP,LP


In [36]:
# PUBLIC SERVICE CO OF NH - retain; amer to america and don't change if of america

In [37]:
## remove geographic terms at end of company names

def geoend(column, col):
    
    geo = ["NORTH QUINCY MA","SOMERVILLE MA","BRIGHTON MA","LEOM MA","NWT MA","WORCESTER MA","NRTHMPT MA"
            "BROCKTON MA", "BOSTON MA", "LOWELL MA","BROCKTON MA","MILFORD MA","MA", "VIENNA VA","RCH VA",
           "RICHMOND VA","STAUNTON VA","ALEX VA","W VA","VA","CATAWBA NC","CHR NC","HICKORY NC","SALIS NC", 
           "GASTONIA NC", "RALEIGH NC", "NC","CINCINNATTI OH","NORTHERN OH", "MIDDLETOWN OH", "XENIA OH", 
           "TOLEDO OH", "OH", "SOUTHEAST GA", "GA","HOUSTON TX","HOUS TX", "WACO TX", "DALLAS TX","TX",
           "S RIVER NJ","HAMM NJ","VINELAND NJ","NEPTUNE NJ","PARAMUS NJ","NEWARK NJ","SDLE BRK NJ","PLAINFIELD NJ",
            "LINWOOD NJ", "CHERRY HILL NJ","NJ","BELLEVUE WA", "TACOMA WA","WA","QUINCY MASS", "HOLYOKE MASS",
           "CONCORD MASS","MINTKA MN","MN","DICKSON TN", "COLUMBIA TN","CHATT TN","TULL TN","TN","ALBANY NY",
           "BROOKLYN NY", "MIDDLETOWN NY","HMPSTEAD NY","NY","CALIF LA","LA", "DECATUR AL","MONTGOMERY AL","AL",
           "DOVER NH","NASHUA NH","PETERBOROUGH NH","NH", "OMAHA NE","NE","MILWAUKEE WI","WI" , "PHOENIX AZ","AZ",
           "CHARLES WV","WV","RCK MT","ROCKEY MT","MT","IN", "SAN FRANCISCO CA","PORTERVILLE CA", "IRVINE CA", 
           "SAN FRAN CA", "SHERMAN OAKS CA","ANAHEIM CA" , "TUSTIN CA", "SAN JOSE CA", "SOUTHN CA", "GLE CA", 
           "VISTA CA", "SHERMAN OAKS CA","IRVINE CA","SANTA ANA CA","PORTERVILLE CA","CA", "SAN JOSE CALIF", "CALIF",
           "NEW WIS", "WIS", "OKLA", "NL", "PORTLAND ME", "ME", "TENN", "NEW BRITAIN CONN", "HEW HAVEN CONN", "CONN", 
           "MIAMI FL", "CHARL FL", "PERRY FL", "FT LAUD FL", "DEF FL","BRKSVL FL", "FL", "OHIO", "FLA", "MINN", "ME", 
           "NEV", "WY", "BERMUDA", "WASHINGTON DC","WASH DC", "DC", "COOS BAY OR", "OR", "ILL", "QUEBEC", "CANADA", 
           "BOSTON USA", "USA", "BOSTON", "BOSTN", "NEW ORLEANS","NEW ZEALAND",
           "OK", "AK", "S AFRICA", "WASH SEATTLE", "PUGET SOUND SEATTLE", "SEATTLE,NORTH AMERICA", "CALIFORNIA", 
           "CONN WATERBURY","BRIDGEPORT", "ARIZ", "KANSAS", "PENNSYLVANIA", "JAPAN", "ISRAEL TEL AVIV", 
           "SANTA BARBAR", "IRELAND", "IOWA", "UT", "SAN FRAN",
          "ROCKEY MT", "RCK MT", "MT", "GA ATLANTA",  "ATLANTA","WEST VA","CAL",
          "TROY MICH", "DETROIT MICH", "MICH","LEBANON MO", "MO","CLOVIS NM","NM","N A LOS ANGELES","N Y",
          "NWPT KY", "ELIZABH KY", "KY","GREENWICH CT","NEW HAVEN CT", "HRFD CT","PK CT","WETHERSFIELD CT",
           "BRISTOL CT","TORRINGTON CT", "NEW BRIT CT","WALLINGFORD CT","ASONIA CT", "CT", "IND", 
           "STHPLD MI","MI","VA BCH", "GA WINDER","DEL",'COLO','FA','MS','TAMPA','NORTH AMERICA','AMERICA','NORTH AMERICAN']

    geo2 = []
    for i in geo:
            geo2.append(" " +i)


    for each in geo2:
            if (len(crsp2[crsp2[column].str.endswith(each)]) >0 ):

                    print(each,len(crsp2[crsp2[column].str.endswith(each)]) )

                    y = each+"$"
                    crsp2.loc[(crsp2[column].str.endswith(each))&(~(crsp2[column].str.endswith("of"+ each)))
                                                                  &(~(crsp2[column].str.endswith("of the"+ each))) ,
                              col] =  crsp2.loc[(crsp2[column].str.endswith(each))&(~(crsp2[column].str.endswith("of"+ each)))
                                                                  &(~(crsp2[column].str.endswith("of the"+ each))) ,column].str.replace(y,"")

                    crsp2[column] = crsp2[column].apply(lambda x : x.strip())
                    


In [38]:
geoend('COMNAM2','COMNAM2')

 NORTH QUINCY MA 6
 SOMERVILLE MA 3
 BRIGHTON MA 5
 LEOM MA 4
 NWT MA 7
 WORCESTER MA 3
 BOSTON MA 12
 LOWELL MA 5
 BROCKTON MA 11
 MILFORD MA 5
 MA 241
 VIENNA VA 5
 RCH VA 6
 RICHMOND VA 2
 STAUNTON VA 5
 ALEX VA 6
 W VA 4
 VA 271
 CATAWBA NC 15
 CHR NC 5
 HICKORY NC 1
 SALIS NC 1
 GASTONIA NC 1
 RALEIGH NC 7
 NC 220
 CINCINNATTI OH 6
 NORTHERN OH 4
 MIDDLETOWN OH 4
 XENIA OH 4
 TOLEDO OH 5
 OH 253
 SOUTHEAST GA 8
 GA 243
 HOUSTON TX 12
 HOUS TX 4
 WACO TX 2
 DALLAS TX 12
 TX 219
 S RIVER NJ 5
 HAMM NJ 3
 VINELAND NJ 2
 NEPTUNE NJ 3
 PARAMUS NJ 5
 NEWARK NJ 10
 SDLE BRK NJ 5
 PLAINFIELD NJ 13
 LINWOOD NJ 4
 CHERRY HILL NJ 5
 NJ 280
 BELLEVUE WA 23
 TACOMA WA 5
 WA 186
 QUINCY MASS 7
 HOLYOKE MASS 4
 CONCORD MASS 9
 MINTKA MN 11
 MN 94
 DICKSON TN 2
 COLUMBIA TN 5
 CHATT TN 10
 TULL TN 6
 TN 78
 ALBANY NY 15
 BROOKLYN NY 5
 MIDDLETOWN NY 5
 HMPSTEAD NY 1
 NY 545
 CALIF LA 3
 LA 53
 DECATUR AL 6
 MONTGOMERY AL 3
 AL 39
 DOVER NH 6
 NASHUA NH 5
 PETERBOROUGH NH 3
 NH 30
 OMAHA NE 5
 NE 

In [39]:
crsp2.isna().sum()

PERMNO       0
COMNAM       0
PERMCO       0
crsp_year    0
COMNAM2      0
last         0
dtype: int64

In [40]:
geoend('COMNAM2','COMNAM2')

 LA 14


In [41]:
crsp2.isna().sum()

PERMNO       0
COMNAM       0
PERMCO       0
crsp_year    0
COMNAM2      0
last         0
dtype: int64

In [42]:
##last
# 'INC', 'CORP', 'CO', 'LTD', 'NEW', 'PLC', 'LP',
#        , 'SA', 'COMPANY', 'DEL', 'DE', 'IN',
#        'NV', 'SAB DE CV', 'SHS', 'LIMITED''LLC','SPA',TECHNOLOGY', 'TECHNOLOGIES','AS','COR'
#      'INTERNATIONAL','FSB','AB','COMPANIES','AB','THE','INTL','COS','AKTIEBOLAGET,'INCORPORATED',
## 'AKTIENGESELLSCHAFT','AKTIEBOLAG',' " societa per azioni", oyj, oy,asa
#split after co

In [43]:
#check at end of comp name
def clean(column, col):

    compend = ['AB', 'AKTIEBOLAG', 'AKTIEBOLAGET', 'AKTIENGESELLSCHAFT','AS',
            'CO', 'COMPANIES','COMPANY','COR','CORP', 'COS',
               'DE', 'DEL', 'FSB', 'IN','INC', 'INCORPORATED','INTERNATIONAL','INTL',
             'LIMITEDLLC','LP','LTD','NEW','NV','PLC','SA','SAB DE CV','SHS','SPA','THE','AND',
               'ASA','OY','OYJ']

    compend2 = []
    for i in compend:
        compend2.append(" " +i)
        

    for each in compend2:
        if (len(crsp2[crsp2[column].str.endswith(each)]) >0 ):
            
                print(each,len(crsp2[crsp2[column].str.endswith(each)]) )
                
                y = each+"$"
                crsp2.loc[crsp2[column].str.endswith(each),col] = crsp2.loc[crsp2[column].str.
                                                                                  endswith(each),column].str.replace(y,"")

                crsp2[column] = crsp2[column].apply(lambda x : x.strip())

In [44]:
crsp2.nunique() #36408

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      36408
last          1030
dtype: int64

In [45]:
clean('COMNAM2', 'COMNAM2')

 AB 127
 AKTIEBOLAG 18
 AKTIEBOLAGET 31
 AKTIENGESELLSCHAFT 27
 AS 208
 CO 17561
 COMPANIES 137
 COMPANY 1564
 COR 257
 CORP 74359
 COS 111
 DE 1468
 FSB 311
 IN 5
 INC 156960
 INCORPORATED 35
 INTERNATIONAL 5462
 INTL 1778
 LP 2427
 LTD 13183
 NEW 5886
 NV 1054
 PLC 2560
 SA 1764
 SAB DE CV 116
 SHS 607
 SPA 228
 THE 119
 ASA 52
 OY 5
 OYJ 2


In [46]:
crsp2.nunique() #35782

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      35781
last          1030
dtype: int64

In [47]:
geoend('COMNAM2','COMNAM2')

 VA 60
 NC 27
 GA 7
 TX 7
 NJ 14
 WA 49
 NY 104
 LA 19
 AL 8
 NH 2
 NE 6
 AZ 1
 CA 32
 CALIF 9
 WIS 13
 OKLA 5
 TENN 22
 FL 51
 OHIO 38
 FLA 25
 BERMUDA 23
 DC 12
 OR 11
 ILL 41
 CANADA 127
 USA 329
 BOSTON 73
 CALIFORNIA 27
 PENNSYLVANIA 24
 JAPAN 27
 IOWA 58
 ATLANTA 45
 MO 3
 KY 4
 IND 75
 DEL 124
 COLO 17
 NORTH AMERICA 54
 AMERICA 1018


In [48]:
geoend('COMNAM2','COMNAM2')

In [49]:
crsp2.isna().sum()

PERMNO       0
COMNAM       0
PERMCO       0
crsp_year    0
COMNAM2      0
last         0
dtype: int64

In [50]:
crsp2.nunique() #COMNAM2 35714

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      35713
last          1030
dtype: int64

In [51]:
clean('COMNAM2','COMNAM2')

 AB 10
 CO 3294
 COMPANIES 783
 COMPANY 335
 COR 4
 CORP 3154
 COS 663
 DE 46
 FSB 4
 INC 2647
 INTERNATIONAL 990
 INTL 230
 LP 4
 LTD 183
 AND 6


In [52]:
crsp2.nunique() #COMNAM2 35101

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      35100
last          1030
dtype: int64

In [53]:
clean('COMNAM2','COMNAM2')

 CO 43
 COMPANIES 38
 COMPANY 15
 CORP 32
 COS 9
 INTERNATIONAL 12
 INTL 9


In [54]:
clean('COMNAM2','COMNAM2')

In [55]:
crsp2.nunique() #35080 COMNAM2

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      35080
last          1030
dtype: int64

In [56]:
geoend('COMNAM2','COMNAM2')

 MA 16
 NC 20
 USA 1


In [57]:
crsp2.nunique() #35078 COMNAM2

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      35078
last          1030
dtype: int64

In [58]:
# convert to lower case
crsp2['comlow'] = crsp2['COMNAM2'].apply(lambda x : x.lower())

In [59]:
crsp2.nunique() #35078 comlow

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      35078
last          1030
comlow       35078
dtype: int64

In [60]:
crsp2.isna().sum()

PERMNO       0
COMNAM       0
PERMCO       0
crsp_year    0
COMNAM2      0
last         0
comlow       0
dtype: int64

In [61]:
# TR = trust, FD = fund; split after DE; AMER = america; ASSN = association; INVS = investments; 
# mktg = marketing; rlty = realty; SYS = systems; PPTYS = properties; natl = national; assoc = associates
# svcs = services; labs = laboratories; assn = association; invt = investment;
# HLDGS = holdings; secs = securities; wks = works; incm = income; rr = railroad; PTNR = partner
# hldg = holding; brdrs = breeders ; bk = bank
#drdg = dredgeing
# gd = gold; bldrs = builders

In [62]:
# split 

splt = [ ' company ', ' llc ', ' ltd ', 
       ' limited ',  ' unlimited ', ' corporation ', ' corp ',' plc ',
         "lat am"," n am"," ag "," co ",'societa per azioni']



inclst = ['sc', 'nw', 'il', 'mass', 'pa', 'utah','n mex', 'north','n j','ca','md',
'intl sign svc', 'mi', 'cl a', 'madison', 'idaho','honolu hi', 'tm', 'port','wash', 'nevada',
          'cayman isl',
'kan','tex', 'sc', 'florida', 'bldr','vg', "del", "cl a", "j",  "n"]

splt2 = []

for each in inclst: 
    
    splt2.append("inc "+each)

splt = splt + splt2
    

for each in splt:
    if (len(crsp2[crsp2['comlow'].str.contains(each)])>0): 
            print(each,len(crsp2[crsp2['comlow'].str.contains(each)]) )
            crsp2['comlow2'] = crsp2['comlow'].apply(lambda x : x.split(each)[0])

 company  50
 ltd  227
 limited  43
 unlimited  7
 corporation  14
 corp  1163
 plc  3
lat am 2
 n am 11
 ag  32
 co  471
societa per azioni 9
inc sc 38
inc nw 10
inc il 27
inc mass 36
inc pa 182
inc utah 3
inc n mex 12
inc ca 32
inc md 137
inc intl sign svc 8
inc mi 2
inc cl a 21
inc madison 2
inc idaho 9
inc honolu hi 5
inc tm 26
inc port 207
inc wash 5
inc nevada 3
inc cayman isl 28
inc kan 5
inc tex 7
inc sc 38
inc florida 6
inc bldr 12
inc vg 7
inc cl a 21
inc j 5
inc n 55


In [63]:
crsp2['comlow2'] = crsp2['comlow2'].apply(lambda x : x.strip())

In [64]:
crsp2.isna().sum()

PERMNO       0
COMNAM       0
PERMCO       0
crsp_year    0
COMNAM2      0
last         0
comlow       0
comlow2      0
dtype: int64

In [65]:
crsp2['comlow2'].nunique() #35076

35075

In [66]:
del crsp2['last']

In [67]:
crsp2['last'] = crsp2['comlow2'].apply(lambda x: x.split()[-1])

In [68]:
cnt = crsp2['last'].value_counts()

In [69]:
# cnt[(cnt>45)&(cnt<=50)]

In [70]:
#inds, sys, &. svcs, labs, ins, assn, hldgs,prods = products, bk = bank; invs, grp = group
#mfg = manufacturing, dev, assoc,secs, pptys,svc, mng, invt , hldg, mgmt = management, ctrs = , ptnrs = 
#rr = railroad, lab,pwr,srvcs,incm = income,instrs,cntrs; finl = financial; techs = technologies; res = resources
##educ = education, gp = group; expl = exploration; gas lt = light; else remove lt; resh = research;
#engr = engineering; utils = utilities, mach = machine; sec = securities; inv = investments, rlty = realty; 
# ln = loan; gr = group


# remove: pa, limited, ag, llc,albany,texas, of,"sa de cv",cp,md,"new york",sc,mass,midland,pr,us,inc cl a; 
#inc j,  inc n,
# split corp,co  

In [71]:
# split 

splt = [ ' corp ', " co ", " del ", " ltd "," ag "]
for each in splt:
    if (len(crsp2[crsp2['comlow2'].str.contains(each)])>0): 
            print(each,len(crsp2[crsp2['comlow2'].str.contains(each)]) )
            crsp2['comlow2'] = crsp2['comlow2'].apply(lambda x : x.split(each)[0])

 corp  1163
 co  471
 del  123
 ltd  212
 ag  32


In [72]:
crsp2['len'] = crsp2['comlow2'].apply(lambda x: len(x.split()))

In [73]:
## remove ends:

comend = [ "limited", "ag", "llc","albany","texas", "of","sa de cv","cp","md","new york","sc","mass",'associates',
          "pr","us", "&","invs", "inv", "invt", "invs","assoc", "assn", "inc","unlimited", "assc",'association']

for each in comend:
        if (len(crsp2[(crsp2['comlow2'].str.endswith(each))&(crsp2['len']>1)]) >0 ):


            y = " "+each+"$"
            print(each,len(crsp2[(crsp2['comlow2'].str.endswith(each))&(crsp2['len']>1)]) )


            crsp2.loc[(crsp2['comlow2'].str.endswith(each))&(crsp2['len']>1),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.
                                                                              endswith(each)&(crsp2['len']>1),'comlow2'].str.replace(y,"")

            crsp2['comlow2'] = crsp2['comlow2'].apply(lambda x : x.strip())


limited 474
ag 499
llc 249
albany 23
texas 49
of 498
sa de cv 333
cp 328
md 210
new york 129
sc 186
mass 72
associates 396
pr 76
us 246
& 1651
invs 474
inv 73
invt 203
assoc 379
assn 693
inc 213
unlimited 33
assc 17
association 62


In [74]:
crsp2.isna().sum()

PERMNO       0
COMNAM       0
PERMCO       0
crsp_year    0
COMNAM2      0
comlow       0
comlow2      0
last         0
len          0
dtype: int64

In [75]:
crsp2.nunique() #34880

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      35078
comlow       35078
comlow2      34880
last         11044
len              9
dtype: int64

In [76]:
crsp2[crsp2['comlow2'].str.contains(" dev ")]

Unnamed: 0,PERMNO,COMNAM,PERMCO,crsp_year,COMNAM2,comlow,comlow2,last,len
1101268,22940,AMERICAN COMMUNITY DEV GRP INC,905,1986,AMERICAN COMMUNITY DEV GRP,american community dev grp,american community dev grp,grp,4
1101271,22940,AMERICAN COMMUNITY DEV GRP INC,905,1987,AMERICAN COMMUNITY DEV GRP,american community dev grp,american community dev grp,grp,4
2580914,68128,SAXON OIL DEV PARTNERS L P,21568,1985,SAXON OIL DEV PARTNERS,saxon oil dev partners,saxon oil dev partners,partners,4
2580918,68128,SAXON OIL DEV PARTNERS L P,21568,1986,SAXON OIL DEV PARTNERS,saxon oil dev partners,saxon oil dev partners,partners,4
2580930,68128,SAXON OIL DEV PARTNERS L P,21568,1987,SAXON OIL DEV PARTNERS,saxon oil dev partners,saxon oil dev partners,partners,4
2580942,68128,SAXON OIL DEV PARTNERS L P,21568,1988,SAXON OIL DEV PARTNERS,saxon oil dev partners,saxon oil dev partners,partners,4
2580954,68128,SAXON OIL DEV PARTNERS L P,21568,1989,SAXON OIL DEV PARTNERS,saxon oil dev partners,saxon oil dev partners,partners,4
3699735,84540,HEURISTIC DEV GROUP INC,15328,1997,HEURISTIC DEV GROUP,heuristic dev group,heuristic dev group,group,3
3699746,84540,HEURISTIC DEV GROUP INC,15328,1998,HEURISTIC DEV GROUP,heuristic dev group,heuristic dev group,group,3
3699758,84540,HEURISTIC DEV GROUP INC,15328,1999,HEURISTIC DEV GROUP,heuristic dev group,heuristic dev group,group,3


In [77]:
#inds, sys, &. svcs, ins, assn, hldgs,prods = products, bk = bank; invs, grp = group
#mfg = manufacturing, dev, assoc,secs, pptys,svc, mng, invt , hldg, mgmt = management, ctrs = , ptnrs = 
#rr = railroad, lab,pwr,srvcs,incm = income,instrs,cntrs; finl = financial; techs = technologies; res = resources
##educ = education, gp = group; expl = exploration; gas lt = light; else remove lt; resh = research;
#engr = engineering; utils = utilities, mach = machine; sec = securities; inv = investments, rlty = realty; 
# mls = mills; ln = loan; gr = group; instrs = instruments
# TR = trust, FD = fund; split after DE; AMER = america; ASSN = association; INVS = investments; 
# mktg = marketing; PPTYS = properties; natl = national; assoc = associates
# secs = securities; wks = works;  PTNR = partner,  hldg = holding; brdrs = breeders ; bk = bank
#drdg = dredgeing, gd = gold; bldrs = builders; dev = development; ctrs = centers; cntrs = centers; 
#instrs = instrumnets; bkshares = bancshares; bks = bank; eqp = equipment; brd = baord, mfrs = manufactures; dev development
#indl = industrial

crsp2.loc[crsp2['comlow2'].str.endswith(' equp'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.endswith(' equp'),'comlow2'].str.replace(" equp$"," equipment")
crsp2.loc[crsp2['comlow2'].str.endswith(' brd'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.endswith(' brd'),'comlow2'].str.replace(" brd$"," board")
crsp2.loc[crsp2['comlow2'].str.endswith(' inds'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.endswith(' inds'),'comlow2'].str.replace(" inds$"," industries")
crsp2.loc[crsp2['comlow2'].str.endswith(' indl'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.endswith(' indl'),'comlow2'].str.replace(" indl$"," industrial")
crsp2.loc[crsp2['comlow2'].str.endswith(' dev'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.endswith(' dev'),'comlow2'].str.replace(" dev$"," development")

crsp2.loc[crsp2['comlow2'].str.endswith(' sys'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' sys'),'comlow2'].str.replace(" sys$","  systems")
crsp2.loc[crsp2['comlow2'].str.endswith(' svc'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' svc'),'comlow2'].str.replace(" svc$","  services")
crsp2.loc[crsp2['comlow2'].str.endswith(' svcs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' svcs'),'comlow2'].str.replace(" svcs$","  services")
crsp2.loc[crsp2['comlow2'].str.endswith(' ins'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' ins'),'comlow2'].str.replace(" ins$"," insurance")
crsp2.loc[crsp2['comlow2'].str.endswith(' hdg'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.endswith(' hdg'),'comlow2'].str.replace(" hdg$"," holding")
crsp2.loc[crsp2['comlow2'].str.endswith(' hldg'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.endswith(' hldg'),'comlow2'].str.replace(" hldg$"," holding")
crsp2.loc[crsp2['comlow2'].str.endswith(' hldgs'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.endswith(' hldgs'),'comlow2'].str.replace(" hldgs$"," holdings")
crsp2.loc[crsp2['comlow2'].str.endswith(' prods'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' prods'),'comlow2'].str.replace(" prods$","  products")
crsp2.loc[crsp2['comlow2'].str.endswith(' bk'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' bk'),'comlow2'].str.replace(" bk$"," bank")
crsp2.loc[crsp2['comlow2'].str.endswith(' bks'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' bks'),'comlow2'].str.replace(" bks$"," bank")
crsp2.loc[crsp2['comlow2'].str.endswith(' bros'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' bros'),'comlow2'].str.replace(" bros$"," brothers")

crsp2.loc[crsp2['comlow2'].str.endswith(' sec'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' sec'),'comlow2'].str.replace(" sec$"," securities")

crsp2.loc[crsp2['comlow2'].str.endswith(' deutschland'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' deutschland'),'comlow2'].str.replace(" deutschland$","")




crsp2.loc[crsp2['comlow2'].str.endswith(' bkshares'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' bkshares'),'comlow2'].str.replace(" bkshares$"," bancshares")
crsp2.loc[crsp2['comlow2'].str.endswith(' grp'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' grp'),'comlow2'].str.replace(" grp$"," group")
crsp2.loc[crsp2['comlow2'].str.endswith(' mfrs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' mfrs'),'comlow2'].str.replace(" mfrs$"," manufacturers")
crsp2.loc[crsp2['comlow2'].str.endswith(' mfg'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' mfg'),'comlow2'].str.replace(" mfg$"," manufacturing")
crsp2.loc[crsp2['comlow2'].str.endswith(' sec'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' sec'),'comlow2'].str.replace(" sec$"," securities")
crsp2.loc[crsp2['comlow2'].str.endswith(' secs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' secs'),'comlow2'].str.replace(" secs$"," securities")
crsp2.loc[crsp2['comlow2'].str.endswith(' pptys'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' pptys'),'comlow2'].str.replace(" pptys$"," properties")
crsp2.loc[crsp2['comlow2'].str.endswith(' mng'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' mng'),'comlow2'].str.replace(" mng$"," mining")
crsp2.loc[crsp2['comlow2'].str.endswith(' mgmt'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' mgmt'),'comlow2'].str.replace(" mgmt$"," management")
crsp2.loc[crsp2['comlow2'].str.endswith(' ptnr'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' ptnr'),'comlow2'].str.replace(" ptnr$"," partner")
crsp2.loc[crsp2['comlow2'].str.endswith(' ptnrs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' ptnrs'),'comlow2'].str.replace(" ptnrs$"," partners")
crsp2.loc[crsp2['comlow2'].str.endswith(' rr'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' rr'),'comlow2'].str.replace(" rr$"," railroad")
crsp2.loc[crsp2['comlow2'].str.endswith(' pwr'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' pwr'),'comlow2'].str.replace(" pwr$"," power")
crsp2.loc[crsp2['comlow2'].str.endswith(' srvcs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' srvcs'),'comlow2'].str.replace(" srvcs$"," services")
crsp2.loc[crsp2['comlow2'].str.endswith(' incm'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' incm'),'comlow2'].str.replace(" incm$"," income")
crsp2.loc[crsp2['comlow2'].str.endswith(' finl'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' finl'),'comlow2'].str.replace(" finl$"," financial")
crsp2.loc[crsp2['comlow2'].str.endswith(' techs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' techs'),'comlow2'].str.replace(" techs$"," technologies")
crsp2.loc[crsp2['comlow2'].str.endswith(' res'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' res'),'comlow2'].str.replace(" res$"," resources")
crsp2.loc[crsp2['comlow2'].str.endswith(' gp'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' gp'),'comlow2'].str.replace(" gp$"," group")
crsp2.loc[crsp2['comlow2'].str.endswith(' expl'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' expl'),'comlow2'].str.replace(" expl$"," exploration")
crsp2.loc[crsp2['comlow2'].str.endswith(' gas lt'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' gas lt'),'comlow2'].str.replace(" gas lt$"," gas light")
crsp2.loc[crsp2['comlow2'].str.endswith(' resh'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' resh'),'comlow2'].str.replace(" resh$"," research")
crsp2.loc[crsp2['comlow2'].str.endswith(' engr'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' engr'),'comlow2'].str.replace(" engr$"," engineering")
crsp2.loc[crsp2['comlow2'].str.endswith(' utils'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' utils'),'comlow2'].str.replace(" utils$"," utilities")
crsp2.loc[crsp2['comlow2'].str.endswith(' mach'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' mach'),'comlow2'].str.replace(" mach$"," machine")
crsp2.loc[crsp2['comlow2'].str.endswith(' machs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' machs'),'comlow2'].str.replace(" machs$"," machines")
crsp2.loc[crsp2['comlow2'].str.endswith(' sec'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' sec'),'comlow2'].str.replace(" sec$"," securities")
crsp2.loc[crsp2['comlow2'].str.endswith(' rlty'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' rlty'),'comlow2'].str.replace(" rlty$"," realty")
crsp2.loc[crsp2['comlow2'].str.endswith(' mls'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' mls'),'comlow2'].str.replace(" mls$"," mills")
crsp2.loc[crsp2['comlow2'].str.endswith(' ln'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' ln'),'comlow2'].str.replace(" ln$"," loan")
crsp2.loc[crsp2['comlow2'].str.endswith(' gr'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' gr'),'comlow2'].str.replace(" gr$"," group")
crsp2.loc[crsp2['comlow2'].str.endswith(' tr'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' tr'),'comlow2'].str.replace(" tr$"," trust")
crsp2.loc[crsp2['comlow2'].str.endswith(' fd'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' fd'),'comlow2'].str.replace(" fd$"," fund")
crsp2.loc[crsp2['comlow2'].str.endswith(' fnd'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' fnd'),'comlow2'].str.replace(" fnd$"," fund")
crsp2.loc[crsp2['comlow2'].str.endswith(' fun'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' fun'),'comlow2'].str.replace(" fun$"," fund")
crsp2.loc[crsp2['comlow2'].str.endswith(' assn'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' assn'),'comlow2'].str.replace(" assn$"," association")
crsp2.loc[crsp2['comlow2'].str.endswith(' mktg'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' mktg'),'comlow2'].str.replace(" mktg$"," marketing")
crsp2.loc[crsp2['comlow2'].str.endswith(' natl'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' natl'),'comlow2'].str.replace(" natl$"," national")
crsp2.loc[crsp2['comlow2'].str.endswith(' assoc'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' assoc'),'comlow2'].str.replace(" assoc$"," association")
crsp2.loc[crsp2['comlow2'].str.endswith(' wks'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' wks'),'comlow2'].str.replace(" wks$"," works")
crsp2.loc[crsp2['comlow2'].str.endswith(' brdrs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' brdrs'),'comlow2'].str.replace(" brdrs$"," breeders")
crsp2.loc[crsp2['comlow2'].str.endswith(' drdg'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' drdg'),'comlow2'].str.replace(" drdg$"," dredging")
crsp2.loc[crsp2['comlow2'].str.endswith(' bldrs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' bldrs'),'comlow2'].str.replace(" bldrs$"," builders")
crsp2.loc[crsp2['comlow2'].str.endswith(' dev'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' dev'),'comlow2'].str.replace(" dev$"," development")
crsp2.loc[crsp2['comlow2'].str.endswith(' instrs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' instrs'),'comlow2'].str.replace(" instrs$"," instruments")
crsp2.loc[crsp2['comlow2'].str.endswith(' instr'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' instr'),'comlow2'].str.replace(" instr$"," instrument")
crsp2.loc[crsp2['comlow2'].str.endswith(' cntrs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' cntrs'),'comlow2'].str.replace(" cntrs$"," centers")
crsp2.loc[crsp2['comlow2'].str.endswith(' ctrs'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' ctrs'),'comlow2'].str.replace(" ctrs$"," centers")
crsp2.loc[crsp2['comlow2'].str.endswith(' ctls'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' ctls'),'comlow2'].str.replace(" ctls$"," controls")
crsp2.loc[crsp2['comlow2'].str.endswith(' matl'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' matl'),'comlow2'].str.replace(" matl$"," material")
crsp2.loc[crsp2['comlow2'].str.endswith(' chem'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' chem'),'comlow2'].str.replace(" chem$"," chemicals")
crsp2.loc[crsp2['comlow2'].str.endswith(' airls'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' airls'),'comlow2'].str.replace(" airls$"," airlines")
crsp2.loc[crsp2['comlow2'].str.endswith(' comms'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' comms'),'comlow2'].str.replace(" comms$"," communications")
crsp2.loc[crsp2['comlow2'].str.endswith(' prem'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' prem'),'comlow2'].str.replace(" prem$"," premier")
crsp2.loc[crsp2['comlow2'].str.endswith(' entmt'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' entmt'),'comlow2'].str.replace(" entmt$"," entertainment")
crsp2.loc[crsp2['comlow2'].str.endswith(' teleg'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' teleg'),'comlow2'].str.replace(" teleg$"," telegraph")
crsp2.loc[crsp2['comlow2'].str.endswith(" hrdwr"),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(" hrdwr"),'comlow2'].str.replace(" hrdwr"," hardware")
crsp2.loc[crsp2['comlow2'].str.endswith(" telecom"),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(" telecom"),'comlow2'].str.replace(" telecom$"," telecommunications")
crsp2['comlow2'] = crsp2['comlow2'].str.replace("tele communications", "telecommunications")
crsp2['comlow2'] = crsp2['comlow2'].str.replace("tele communication", "telecommunication")
crsp2['comlow2'] = crsp2['comlow2'].str.replace(" telecom$", " telecommunication")

crsp2['comlow2'] = crsp2['comlow2'].str.replace(" telecom ", " telecommunication ")

crsp2['comlow2'] = crsp2['comlow2'].str.replace(" comm$"," communications")


In [78]:
crsp2['comlow2'].nunique() #34829

34829

In [79]:
crsp2.loc[crsp2['comlow2'].str.contains(' sys '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains (' sys '),'comlow2'].str.replace(" sys "," systems ")
crsp2.loc[crsp2['comlow2'].str.contains(' svcs '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' svcs '),'comlow2'].str.replace(" svcs ","  services ")
crsp2.loc[crsp2['comlow2'].str.contains(' svc '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' svc '),'comlow2'].str.replace(" svc ","  service ")

crsp2.loc[crsp2['comlow2'].str.contains(' prods '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' prods '),'comlow2'].str.replace(" prods ","  products ")
crsp2.loc[crsp2['comlow2'].str.contains(' ins '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' ins '),'comlow2'].str.replace(" ins "," insurance ")
crsp2.loc[crsp2['comlow2'].str.contains(' hldgs '),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains(' hldgs '),'comlow2'].str.replace(" hldgs "," holdings ")
crsp2.loc[crsp2['comlow2'].str.contains(' hldg '),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains(' hldg '),'comlow2'].str.replace(" hldg "," holding ")

crsp2.loc[crsp2['comlow2'].str.contains(' hdg '),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains(' hdg '),'comlow2'].str.replace(" hdg "," holding ")
crsp2.loc[crsp2['comlow2'].str.contains(' bk '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' bk '),'comlow2'].str.replace(" bk "," bank ")
crsp2.loc[crsp2['comlow2'].str.contains(' bks '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' bks '),'comlow2'].str.replace(" bks "," bank ")

crsp2.loc[crsp2['comlow2'].str.contains(' invs '),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains(' invs '),'comlow2'].str.replace(" invs "," investors ")
crsp2.loc[crsp2['comlow2'].str.contains(' grp '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' grp '),'comlow2'].str.replace(" grp "," group ")
crsp2.loc[crsp2['comlow2'].str.contains(' finl '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' finl '),'comlow2'].str.replace(" finl "," financial ")
crsp2.loc[crsp2['comlow2'].str.contains(' assn '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' assn '),'comlow2'].str.replace(" assn "," association ")
crsp2.loc[crsp2['comlow2'].str.contains(' assoc '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' assoc '),'comlow2'].str.replace(" assoc "," association ")
crsp2.loc[crsp2['comlow2'].str.contains(' res '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' res '),'comlow2'].str.replace(" res "," resources ")
crsp2.loc[crsp2['comlow2'].str.contains(' mfg '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' mfg '),'comlow2'].str.replace(" mfg "," manufacturing ")
crsp2.loc[crsp2['comlow2'].str.contains(' pptys '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' pptys '),'comlow2'].str.replace(" pptys "," properties ")
crsp2.loc[crsp2['comlow2'].str.contains(' mng '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' mng '),'comlow2'].str.replace(" mng "," mining ")

crsp2.loc[crsp2['comlow2'].str.contains(' invt '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' invt '),'comlow2'].str.replace(" invt "," investment ")
crsp2.loc[crsp2['comlow2'].str.contains(' invts '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' invts '),'comlow2'].str.replace(" invts "," investments ")
crsp2.loc[crsp2['comlow2'].str.contains(' mgmt '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' mgmt '),'comlow2'].str.replace(" mgmt "," management ")
crsp2.loc[crsp2['comlow2'].str.contains(' equip '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' equip '),'comlow2'].str.replace(" equip "," equipment ")
crsp2.loc[crsp2['comlow2'].str.contains(' natl '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' natl '),'comlow2'].str.replace(" natl "," national ")
crsp2.loc[crsp2['comlow2'].str.contains(' engr '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' engr '),'comlow2'].str.replace(" engr "," engineering ")
crsp2.loc[crsp2['comlow2'].str.contains(' entmt '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' entmt '),'comlow2'].str.replace(" entmt "," entertainment ")
crsp2.loc[crsp2['comlow2'].str.contains(' bk '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' bk '),'comlow2'].str.replace(" bk "," bank ")
crsp2.loc[crsp2['comlow2'].str.contains(' mach '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' mach '),'comlow2'].str.replace(" mach "," machinery ")
crsp2.loc[crsp2['comlow2'].str.contains(' machs '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' machs '),'comlow2'].str.replace(" machs "," machines ")
crsp2.loc[crsp2['comlow2'].str.contains(' instr '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' instr '),'comlow2'].str.replace(" instr "," instrument ")
crsp2.loc[crsp2['comlow2'].str.contains(' ptnrs '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' ptnrs '),'comlow2'].str.replace(" ptnrs "," partners ")
crsp2.loc[crsp2['comlow2'].str.contains(' tel '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' tel '),'comlow2'].str.replace(" tel "," telephone ")
crsp2.loc[crsp2['comlow2'].str.contains(' teleg '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' teleg '),'comlow2'].str.replace(" teleg "," telegraph ")
crsp2.loc[crsp2['comlow2'].str.contains(' educ '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' educ '),'comlow2'].str.replace(" educ "," education ")
crsp2.loc[crsp2['comlow2'].str.contains(' fd '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' fd '),'comlow2'].str.replace(" fd "," fund ")
crsp2.loc[crsp2['comlow2'].str.contains(' univ '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' univ '),'comlow2'].str.replace(" univ "," university ")
crsp2.loc[crsp2['comlow2'].str.contains(' trgt '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' trgt '),'comlow2'].str.replace(" trgt "," target ")
crsp2.loc[crsp2['comlow2'].str.contains(' grth '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' grth '),'comlow2'].str.replace(" grth "," growth ")
crsp2.loc[crsp2['comlow2'].str.contains(' pub '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' pub '),'comlow2'].str.replace(" pub "," pubilc ")
crsp2.loc[crsp2['comlow2'].str.contains(' muni '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' muni '),'comlow2'].str.replace(" muni "," municipal ")
crsp2.loc[crsp2['comlow2'].str.contains(' pfd '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' pfd '),'comlow2'].str.replace(" pfd "," preferred ")
crsp2.loc[crsp2['comlow2'].str.contains(' tr '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' tr '),'comlow2'].str.replace(" tr "," trust ")
crsp2.loc[crsp2['comlow2'].str.contains(' airls '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' airls '),'comlow2'].str.replace(" airls "," airlines ")
crsp2.loc[crsp2['comlow2'].str.contains(' comms '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' comms '),'comlow2'].str.replace(" comms "," communications ")
crsp2.loc[crsp2['comlow2'].str.contains(' grw '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' grw '),'comlow2'].str.replace(" grw "," growth ")
crsp2.loc[crsp2['comlow2'].str.contains(' eq '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' eq '),'comlow2'].str.replace(" eq "," equity ")
crsp2.loc[crsp2['comlow2'].str.endswith(' eq'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.endswith(' eq'),'comlow2'].str.replace(" eq$"," equity")
crsp2.loc[crsp2['comlow2'].str.contains(' ctls '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' ctls '),'comlow2'].str.replace(" ctls "," controls ")
crsp2.loc[crsp2['comlow2'].str.contains(' ptnr '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' ptnr '),'comlow2'].str.replace(" ptnr "," partner ")
crsp2.loc[crsp2['comlow2'].str.contains(' ln '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' ln '),'comlow2'].str.replace(" ln "," loan ")
crsp2.loc[crsp2['comlow2'].str.contains(' incm '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' incm '),'comlow2'].str.replace(" incm "," income ")
crsp2.loc[crsp2['comlow2'].str.contains(' ind '),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' ind '),'comlow2'].str.replace(" ind "," industries ")
crsp2.loc[crsp2['comlow2'].str.contains(' ltg & elec'),'comlow2'] = crsp2.loc[crsp2['comlow2'].str.contains(' ltg & elec'),'comlow2'].str.replace(" ltg & elec$"," lighting & electricity")
crsp2['comlow2'] = crsp2['comlow2'].str.replace(" indl ", " industrial ")
crsp2.loc[crsp2['comlow2'].str.contains(' dev '),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains(' dev '),'comlow2'].str.replace(" dev "," development ")
crsp2.loc[crsp2['comlow2'].str.contains(' telecom '),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains(' telecom '),'comlow2'].str.replace(" telecom "," telecommunications ")
crsp2.loc[crsp2['comlow2'].str.contains(' intl '),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains(' intl '),'comlow2'].str.replace(" intl "," international ")
crsp2.loc[crsp2['comlow2'].str.contains(' ill '),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains(' ill '),'comlow2'].str.replace(" ill "," illinois ")
crsp2.loc[crsp2['comlow2'].str.contains(' intl$'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains(' intl$'),'comlow2'].str.replace(" intl$"," international ")
crsp2.loc[crsp2['comlow2'].str.contains('^intl'),'comlow2'] =crsp2.loc[crsp2['comlow2'].str.contains('^intl '),'comlow2'].str.replace("^intl "," international ")
crsp2['comlow2'] = crsp2['comlow2'].str.replace("aktiebolaget","")

In [80]:
## strip
crsp2['comlow2'] = crsp2['comlow2'].apply(lambda x : x.strip())

In [81]:
crsp2.nunique() #34819

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      35078
comlow       35078
comlow2      34819
last         11044
len              9
dtype: int64

In [82]:
crsp2['comlow2'] = crsp2['comlow2'].str.replace(' +'," ")

In [83]:
crsp2.nunique() #comlow2 34806

PERMNO       30624
COMNAM       36656
PERMCO       27195
crsp_year       43
COMNAM2      35078
comlow       35078
comlow2      34806
last         11044
len              9
dtype: int64

In [84]:
crsp2['comlow3'] = crsp2['comlow2'].replace('\.',' ', regex=True)
crsp2['comlow3'] = crsp2['comlow3'].replace('-',' ', regex=True)
crsp2['comlow3'] = crsp2['comlow3'].replace(',',' ', regex=True)
crsp2['comlow3'] = crsp2['comlow3'].replace('&'," ", regex=True)
crsp2['comlow3'] = crsp2['comlow3'].replace(' and '," ", regex=True)
crsp2['comlow3'] = crsp2['comlow3'].apply(lambda x: x.strip())

In [85]:
crsp2.comlow3.nunique() #34806

34806

In [86]:
regex = re.compile('(?<![a-zA-Z0-9]{2})(?<=[a-zA-Z0-9]{1}) +(?=[a-zA-Z0-9] |.$)')
crsp2['comlow3'] = crsp2['comlow3'].str.replace(regex, "")

In [87]:
#remove space between at t to get att
crsp2['comlow3'] = crsp2['comlow3'].replace(r'(\bat\b\s*\bt\b)', 'att',regex=True)

In [88]:
crsp2['comlow3'] = crsp2['comlow3'].apply(lambda x: x.strip())
crsp2['comlen'] = crsp2['comlow3'].apply(lambda x: len(x))

In [89]:
crsp2[crsp2['comlen']== 1]["comlow3"].unique()

array(['c', 'm'], dtype=object)

In [90]:
crsp2.loc[crsp2['comlow3']=="c"][['COMNAM','comlow2','comlow']] ##way to join  (cor, as)

Unnamed: 0,COMNAM,comlow2,comlow
951003,C COR INC,c,c
951008,C COR INC,c,c
951020,C COR INC,c,c
951032,C COR INC,c,c


In [91]:
crsp2[crsp2['COMNAM'].str.contains(" COR ")]['comlow3'].unique()

array(['c cor net', 'c', 'hi cor resources'], dtype=object)

In [92]:
# array(['C COR NET CORP', 'C COR INC', 'HI COR RESOURCES LTD'],
#       dtype=object)

In [93]:
crsp2.loc[crsp2['COMNAM']=="C COR INC","comlow3"] = "c cor"

In [94]:
crsp2['last2'] = crsp2['comlow3'].apply(lambda x : x.split()[-1])

lst = crsp2['last2'].value_counts()

In [95]:
lst[lst>1000]

trust              20095
group              10657
industries         10380
fund               10298
systems             7465
bancorp             7166
financial           6096
holdings            5380
technologies        4887
services            3817
resources           3816
energy              3689
bank                2855
communications      2821
technology          2657
products            2366
ii                  2325
pharmaceuticals     2291
bancshares          1976
funds               1787
partners            1765
holding             1675
capital             1659
enterprises         1618
gas                 1591
stores              1589
petroleum           1408
properties          1386
electronics         1334
oil                 1329
foods               1274
national            1261
manufacturing       1138
insurance           1129
medical             1059
software            1046
Name: last2, dtype: int64

In [96]:
crsp2['comlow4'] = crsp2['comlow3']

In [97]:
crsp2[crsp2['COMNAM'].str.contains(" USF ")]

Unnamed: 0,PERMNO,COMNAM,PERMCO,crsp_year,COMNAM2,comlow,comlow2,last,len,comlow3,comlen,last2,comlow4


In [98]:
# join tele communications

In [99]:
crsp2.loc[crsp2['COMNAM']=="COMMUNICATIONS SATELLITE CORP", "comlow4"] = "comsat"
crsp2.loc[crsp2['COMNAM']=="GENERAL REFRACTORIES CO", "comlow4"] = "grefco"
crsp2.loc[crsp2['COMNAM']=="AMERICAN COLLOID CO", "comlow4"] = "amcol"
crsp2.loc[crsp2['COMNAM']=="ALUMINUM COMPANY AMER", "comlow4"] = "alcoa"
crsp2.loc[crsp2['comlow4'].str.contains("minnesota min"), "comlow4"] = "3m"
crsp2['comlow4'] = crsp2['comlow4'].str.replace(" comm$"," communications")
crsp2.loc[crsp2['COMNAM'].str.contains("TAMPA ELECTRIC CO"), "comlow4"]= "teco"
crsp2.loc[crsp2['COMNAM']=="AMERICA ONLINE INC DEL", "comlow4"] = "aol"
crsp2.loc[crsp2['COMNAM']=="ATLANTIC RICHFIELD CO", "comlow4"] = "arco"
crsp2.loc[crsp2['COMNAM']=="ADVANCED TECHNOLOGY MATERIAL INC", "comlow4"] = "atmi"
crsp2.loc[crsp2['COMNAM']=="BELL CANADA ENTERPRISES", "comlow4"] = "bce"
# crsp2.loc[crsp2['comlow4'].str.contains("8x8"), "comlow4"] = "8times8"
crsp2.loc[crsp2['comlow4'].str.contains("federal national mortgage"), "comlow4"] = "fannie mae"
# crsp2['comlow4'] = crsp2['comlow4'].str.replace("united states", "us")
crsp2.loc[crsp2['COMNAM']=="B T GROUP PLC", "comlow4"] = "british telecommunications group"
crsp2.loc[crsp2['COMNAM']=="C A INC", "comlow4"] = "computer associates"
crsp2.loc[crsp2['COMNAM']=="CRUTCHER RESOURCES CORP", "comlow4"] = "crc"
crsp2.loc[crsp2['COMNAM']=="ECOLAB INC", "comlow4"] = "economics laboratory"
crsp2.loc[crsp2['COMNAM']=="EDUCATIONAL COMPUTER CORP DE", "comlow4"] = "ecc"
crsp2.loc[crsp2['COMNAM']=="ELECTROMAGNETIC SCIENCES INC", "comlow4"] = "ems"
crsp2.loc[crsp2['COMNAM']=="N E C CORP", "comlow4"] = "nippon electric"
crsp2.loc[crsp2['COMNAM'].str.contains("NETAPP"), "comlow4"] = "network appliance"
crsp2.loc[crsp2['COMNAM']=="P G & E CORP", "comlow4"] = "pacific gas electric"
crsp2.loc[crsp2['COMNAM']=="R B & W CORP", "comlow4"] = "russell burdsall ward"
crsp2.loc[crsp2['COMNAM']=="P M F G INC", "comlow4"] = "peerless manufacturing"
crsp2.loc[crsp2['COMNAM']=="NNA/S", "comlow4"] = "novo nordisk"
crsp2.loc[crsp2['COMNAM']=="S G S THOMSON MICROELECTRONICS", "comlow4"] = "stmicroelectronics"
crsp2.loc[crsp2['COMNAM']=="SOUTHWEST SECURITIES GROUP INC", "comlow4"] = "sws group"
crsp2.loc[crsp2['COMNAM']=="T D K ELECTRONICS LTD", "comlow4"] = "tokyo denki kagaku"
crsp2.loc[crsp2['COMNAM']=="TESCO AMERICAN INC", "comlow4"] = "transnational energy systems"
crsp2.loc[crsp2['COMNAM']=="U A L CORP", "comlow4"] = "united airlines"
crsp2.loc[crsp2['COMNAM']=="U S G CORP", "comlow4"] = "united states gypsum"
crsp2.loc[crsp2['COMNAM']=="WABTEC CORP", "comlow4"] = "westinghouse air brake technologies"


crsp2.loc[crsp2['COMNAM'].str.contains("NNA/S"), "comlow4"] = "novo nordisk"
crsp2.loc[crsp2['COMNAM'].str.contains("FEDEX CORP"), "comlow4"] = "federal express"
crsp2.loc[crsp2['COMNAM'].str.contains("NMS Communications Corporation"), "comlow4"] = "natural microsystems"
crsp2['comlow4'] = crsp2['comlow4'].str.replace("gatx","general american transporation")

crsp2['comlow4'] = crsp2['comlow4'].str.replace("nortel","northern telecommunications")


crsp2.loc[crsp2['comlow4'].str.contains("rhone poulenc"), "comlow4"] = "rhone poulenc"
crsp2.loc[crsp2['COMNAM'].str.contains("IT&T Industries, Inc."), "comlow4"] = "international telephone telegraph"
crsp2.loc[crsp2['comlow4'].str.contains("alcatel"), "comlow4"] = "alcatel"



In [100]:
#check at end of comp name
def cleanlow(column, col):

    compend = ['AB', 'AKTIEBOLAG', 'AKTIEBOLAGET', 'AKTIENGESELLSCHAFT','AS',
            'CO', 'COMPANIES','COMPANY','COR','CORP', 'COS',
               'DE', 'DEL', 'FSB', 'IN','INC', 'INCORPORATED','INTERNATIONAL','INTL',
             'LIMITEDLLC','LP','LTD','NEW','NV','PLC','SA','SAB DE CV','SHS','SPA','THE','AND',
               'ASA','OY','OYJ']

    compend2 = []
    for i in compend:
        compend2.append(" " +i.lower())
        

    for each in compend2:
        if (len(crsp2[crsp2[column].str.endswith(each)]) >0 ):
            
                print(each,len(crsp2[crsp2[column].str.endswith(each)]) )
                
                y = each+"$"
                crsp2.loc[crsp2[column].str.endswith(each),col] = crsp2.loc[crsp2[column].str.
                                                                                  endswith(each),column].str.replace(y,"")

                crsp2[column] = crsp2[column].apply(lambda x : x.strip())

In [101]:
crsp2[crsp2['COMNAM'].str.contains('NORTEL')]

Unnamed: 0,PERMNO,COMNAM,PERMCO,crsp_year,COMNAM2,comlow,comlow2,last,len,comlow3,comlen,last2,comlow4
2293138,58640,NORTEL NETWORKS CORP,21301,1999,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks
2293147,58640,NORTEL NETWORKS CORP,21301,2000,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks
2293151,58640,NORTEL NETWORKS CORP NEW,21301,2000,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks
2293159,58640,NORTEL NETWORKS CORP NEW,21301,2001,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks
2293171,58640,NORTEL NETWORKS CORP NEW,21301,2002,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks
2293183,58640,NORTEL NETWORKS CORP NEW,21301,2003,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks
2293195,58640,NORTEL NETWORKS CORP NEW,21301,2004,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks
2293207,58640,NORTEL NETWORKS CORP NEW,21301,2005,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks
2293219,58640,NORTEL NETWORKS CORP NEW,21301,2006,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks
2293231,58640,NORTEL NETWORKS CORP NEW,21301,2007,NORTEL NETWORKS,nortel networks,nortel networks,networks,2,nortel networks,15,networks,northern telecommunications networks


In [102]:
crsp2['comlow4'] = crsp2['comlow4'].apply(lambda x: x.strip())

In [103]:
crsp2.columns

Index(['PERMNO', 'COMNAM', 'PERMCO', 'crsp_year', 'COMNAM2', 'comlow',
       'comlow2', 'last', 'len', 'comlow3', 'comlen', 'last2', 'comlow4'],
      dtype='object')

In [104]:
# del crsp2['last'], crsp2['last2'], crsp2['len'], crsp2['comlen']

In [105]:
crsp2['comlow4'].nunique() #34791

34791

In [106]:
cleanlow("comlow4","comlow4")

 as 4
 company 28
 cor 4
 de 26
 fsb 8
 international 35
 and 1


In [107]:
crsp2['comlow4'] = crsp2['comlow4'].str.replace("united states","us")

crsp2['comlow4'] = crsp2['comlow4'].str.replace(" entnmt"," entertainment")
crsp2['comlow4'] = crsp2['comlow4'].str.replace(" frag"," fragrances")


In [108]:
crsp2.loc[crsp2['COMNAM']=="C COR INC","comlow3"] = "c cor"

In [110]:
lst = [  "technology", "investment", "systems",
       "information","response",  "capital", "mills", "llc"]

crsp2.loc[crsp2['comlow4'].isin(lst),"comlow4"] = "nan"

In [115]:
crsp2 = crsp2[~(crsp2.comlow4=="nan")]

In [116]:
crsp2.to_csv("crsp_clean_26Apr20.csv", index = False)