In [1]:
import numpy as np
import pandas as pd

import pickle

In [2]:
# Load the pickled dictionary of CIK codes.
# NOTE(review): pickle.load can execute arbitrary code - only open a trusted CIK_year.txt.
with open('CIK_year.txt', 'rb') as cik_file:
    CIK = pickle.load(cik_file)

# Normalise every CIK code stored in the dictionary to a string, one list at a time.
for year in CIK:
    CIK[year] = list(map(str, CIK[year]))

# Load the csv of data, keeping the Instrument column as text.
data = pd.read_csv('data.csv', dtype={'Instrument': 'str'})


In [3]:
# Confirm that Instrument entries were read as strings (row label 15 is used as a sample).
print(type(data['Instrument'].loc[15]))

# Dimensions of the raw data.
data.shape

<class 'str'>


(345740, 29)

In [4]:
# Count the distinct Instrument values twice - via a Python set and via pandas - as a cross-check.
unique_CIKs = len(set(data['Instrument'].tolist()))
print(unique_CIKs)
len(data['Instrument'].unique())

15221


15221

In [5]:
# Pool the CIK codes of every dictionary entry into one collection;
# building it as a set removes the duplicated codes along the way.
CIK_list = list({cik for ciks in CIK.values() for cik in ciks})

print('There are', len(CIK_list), 'CIK codes in our list.')

# Type of the stored codes.
type(CIK_list[0])


There are 2116 CIK codes in our list.


str

In [6]:
# Remove the non-quantitative columns; the result is stored in a new frame, data_cl.
data_cl = data.drop(
    ['NAICS Industry Group Name', 'Delisted Quote Flag', 'Company Common Name',
     'CIK Number', 'Instrument Is Active Flag', 'Bankrupt'],
    axis='columns',
)
# Report the resulting dimensions.
print(data_cl.shape)

data_cl.head()

(345740, 23)


Unnamed: 0,Instrument,Total Debt,Accounts Payable (CF),Inventories (CF),Retained Earnings (Accumulated Deficit),Net Debt Incl. Pref.Stock & Min.Interest,Total Current Assets,Total Liabilities,Total Equity,Tangible Book Value - Banks,...,Cash and Short Term Investments,Market Value for Company,Tangible Book Value Per Share,Net Sales,Operating Income,Tangible Book Value - Utility,"Total Assets, Reported",Depreciation And Amortization,Total Current Liabilities,Year
0,910655,1866000.0,-235000.0,-9289000.0,49741000.0,-8580000.0,82689000.0,23250000.0,94489000.0,87124000.0,...,10446000.0,195540000.0,8.43993,360326000.0,40075000.0,87124000.0,117739000.0,413000.0,19209000.0,2000
1,1116521,10000000.0,-47000000.0,81000000.0,,-184000000.0,3043000000.0,2422000000.0,1817000000.0,1556000000.0,...,194000000.0,,5.59129,8268000000.0,369000000.0,1556000000.0,4239000000.0,,1597000000.0,2000
2,895651,377000.0,,289000.0,-18251000.0,-1483000.0,9479000.0,13186000.0,18959000.0,3622000.0,...,1860000.0,,0.4067,54971000.0,-1056000.0,3622000.0,32145000.0,,6303000.0,2000
3,1398702,,,,,,,,,,...,,,,,,,,,,2000
4,1434621,,,,,,,,,,...,,,,,,,,,,2000


In [8]:
# Keep only the rows whose Instrument (CIK code) appears in CIK_list.
data_cl = data_cl[data_cl.Instrument.isin(CIK_list)]

print('Data df shape:', data_cl.shape, '\n')

# The expected count comes from the CIK list itself (already de-duplicated),
# not from the filtered frame - otherwise the check below compares a number with itself.
print('If every thing is okay, then the number of unique CIK in semi-clean data,', len(CIK_list))

print()
print('Let\'s check:')
# Actual number of distinct CIK codes that survived the filter.
print(len(data_cl.Instrument.unique()))


data_cl.head()

Data df shape: (59540, 23) 

If every thing is okay, then the number of unique CIK in semi-clean data, 2116

Let's check:
2116


Unnamed: 0,Instrument,Total Debt,Accounts Payable (CF),Inventories (CF),Retained Earnings (Accumulated Deficit),Net Debt Incl. Pref.Stock & Min.Interest,Total Current Assets,Total Liabilities,Total Equity,Tangible Book Value - Banks,...,Cash and Short Term Investments,Market Value for Company,Tangible Book Value Per Share,Net Sales,Operating Income,Tangible Book Value - Utility,"Total Assets, Reported",Depreciation And Amortization,Total Current Liabilities,Year
0,910655,1866000.0,-235000.0,-9289000.0,49741000.0,-8580000.0,82689000.0,23250000.0,94489000.0,87124000.0,...,10446000.0,195540000.0,8.43993,360326000.0,40075000.0,87124000.0,117739000.0,413000.0,19209000.0,2000
1,1116521,10000000.0,-47000000.0,81000000.0,,-184000000.0,3043000000.0,2422000000.0,1817000000.0,1556000000.0,...,194000000.0,,5.59129,8268000000.0,369000000.0,1556000000.0,4239000000.0,,1597000000.0,2000
3,1398702,,,,,,,,,,...,,,,,,,,,,2000
4,1434621,,,,,,,,,,...,,,,,,,,,,2000
5,1367311,,,,,,,,,,...,,,,,,,,,,2000


In [None]:
# Summary statistics of `data`, the full frame as loaded from data.csv.
data.describe()


In [None]:
# Summary statistics of `data_cl`, the cleaned working frame.
data_cl.describe()

In [None]:
# Eyeball the cleaned rows ordered by company (Instrument) and then by Year.
data_cl.sort_values(['Instrument', 'Year'])


In [None]:
# Compare the Python types used in the dataframe with those used in the CIK
# dictionary, since the two are matched against each other later on.
#
# The first row is sampled by position (.iloc[0]) rather than by index label:
# rows were filtered out of data_cl, so a fixed label such as 15 is not
# guaranteed to still exist and would raise a KeyError.
print('Datatype of the Instrument column in our datafram:')
print(type(data_cl.Instrument.iloc[0]))
print('Datatype of the Year column in our datafram:')
print(type(data_cl.Year.iloc[0]))
print()

# check the type of CIK entries in the CIK dictionary (first non-empty entry only)
for key, value in CIK.items():
    if not value:
        # an empty list has no element to inspect; try the next entry
        continue
    print('Datatype of the CIK in our dictionary:')
    print(type(value[0]))
    print('Datatype of the Year in our dictionary:')
    print(type(key))
    break
    

In [None]:
# Iterate over the CIK dictionary and, for each year (key), collect the rows
# of data_cl that belong to that year and to one of the companies listed for
# it. The collected rows form the final dataframe: datum.
#
# NOTE(review): the original header mentioned "the corresponding five years"
# for each company, but the selection only ever matched the key's own year;
# that behaviour is kept here - confirm whether a multi-year window is wanted.

# Compare years as strings on both sides, so the match works whether the Year
# column / dictionary keys were read as integers or as strings.
year_as_str = data_cl['Year'].astype(str)

# Collect one slice per year and concatenate once at the end: concatenating
# inside the loop copies the growing frame on every step, and starting from an
# empty frame turns every column into object dtype.
yearly_slices = []
for key, value in CIK.items():
    rows_of_year = data_cl[year_as_str == str(key)]
    yearly_slices.append(rows_of_year[rows_of_year['Instrument'].isin(value)])

if yearly_slices:
    datum = pd.concat(yearly_slices)
else:
    # empty dictionary: keep the columns (and dtypes) but no rows
    datum = data_cl.iloc[0:0]

print(datum.shape)
datum.head()

In [None]:
# Sanity check: dimensions of datum, followed by its rows ordered by
# company (Instrument) and then by Year.

print(datum.shape)

datum.sort_values(['Instrument', 'Year'])


In [None]:
# Discard rows that are exact copies of another row.
datum.drop_duplicates(inplace=True)

# Look for suspicious repeats: among the rows without any missing value,
# count the entries per (Instrument, Year) pair.
check_dup = datum.dropna().groupby(['Instrument', 'Year']).count()

# Show, column by column, how often each per-pair count occurs.
for col in check_dup:
    print(check_dup[col].value_counts())
    print()

print(datum.shape)

So, the result of the duplicate check above is pretty bad!

In [None]:
# Write datum to a csv file and read it straight back, so that pandas
# re-infers the column dtypes on read - simpler than converting every
# column by hand.

datum.to_csv('datum.csv', index=False)
# Instrument holds CIK codes, which this notebook treats as strings everywhere
# else (see the dtype used when data.csv is read); force str here too so the
# codes are not re-parsed as integers and stay comparable with the CIK lists.
datum = pd.read_csv('datum.csv', dtype={'Instrument': 'str'})

datum.head()

In [None]:
# First check that the data actually contains every CIK of the dictionary
# for every year it is required in.

# (Instrument, Year) pairs present in the cleaned data; Year is normalised to
# a string so the lookup works whether it was read as an integer or as text.
available_pairs = set(zip(data_cl['Instrument'], data_cl['Year'].astype(str)))

# for each year (key) and each company (cik) in that year's list,
# record the pairs that have no row in the data
missing_pairs = [
    (str(key), cik)
    for key, value in CIK.items()
    for cik in value
    if (cik, str(key)) not in available_pairs
]

print('Number of (year, CIK) pairs missing from the data:', len(missing_pairs))

# show a small sample of the missing pairs, if any
missing_pairs[:10]
    