# Data Cleaning

In [32]:
# Libraries used by this notebook.
import pandas as pd
import numpy as np
import pyreadr  # NOTE(review): not referenced in the cells visible here — confirm it is needed further down
import warnings
# Installs a blanket "ignore" filter: from this point on no warning of any
# category is shown, which also hides pandas deprecation and
# chained-assignment warnings raised by later cells.
warnings.filterwarnings('ignore')


In [33]:
# Read the combined monthly equity file (latin1-encoded CSV).
equity_df = pd.read_csv('../Data/monthly_equity_combined.csv', encoding='latin1')

# Label the first column 'TradeDate' ...
first_column = equity_df.columns[0]
equity_df = equity_df.rename(columns={first_column: 'TradeDate'})

# ... then discard whichever column is last after the rename.
last_column = equity_df.columns[-1]
equity_df = equity_df.drop(columns=last_column)

equity_df


Unnamed: 0,TradeDate,SecurityId,Symbol,ISIN,SecurityName,SecurityTypeId,SecurityType,IsStock,Market,CompanyId,...,LogReturnAdjGeneric,OffShareTurnover,OffTurnover,NonOffShareTurnover,NonOffTurnover,SharesIssued,DivFactor,CumDivFactor,LastQAccount,LastYAccount
0,1980-01-02 00:00:00,6000,NET,NO0003069908,Nettbuss Sør,1,Ordinary Shares,1,OSE,2214.0,...,,,,,,4000.0,,0.897183,,
1,1980-01-02 00:00:00,6006,AFK,NO0003572802,Arendals Fossekompani,1,Ordinary Shares,1,OSE,1007.0,...,,,,,,0.0,,0.203478,,
2,1980-01-02 00:00:00,6007,AKE,NO0003514002,Aker RGI A,2,A Shares,1,OSE,1939.0,...,,,,,,0.0,,0.611575,,
3,1980-01-02 00:00:00,6019,AWS,NO0003083107,Awilco ser. A,2,A Shares,1,OSE,2218.0,...,,,,,,0.0,,0.382215,,
4,1980-01-02 00:00:00,6026,BEL,NO0003094104,Belships,1,Ordinary Shares,1,OSE,2221.0,...,,,,,,0.0,,0.586987,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162127,2020-11-27 00:00:00,1305295,BWE,BMG0702P1086,BW Energy Limited,1,Ordinary Shares,1,OSE,12748.0,...,0.392772,5355947.0,1.061510e+08,2654797.0,5.541827e+07,234304300.0,,1.000000,,
162128,2020-11-27 00:00:00,1305313,NOL,BMG6682J1036,Northern Ocean Ltd.,1,Ordinary Shares,1,OSE,12750.0,...,0.566395,9154182.0,6.328005e+07,100844.0,6.092200e+05,63802378.0,,1.000000,,
162129,2020-11-27 00:00:00,1305435,PEXIP,NO0010840507,Pexip Holding,1,Ordinary Shares,1,OSE,12767.0,...,-0.105278,13306870.0,8.377522e+08,1590196.0,1.018904e+08,101563487.0,,1.000000,,
162130,2020-11-27 00:00:00,1305713,LINK,NO0010894231,Link Mobility Group Holding,1,Ordinary Shares,1,OSE,12811.0,...,-0.019803,3439046.0,1.826526e+08,4001558.0,2.146335e+08,270911039.0,,1.000000,,


In [34]:
# ISIN codes: upper-case first, then trim surrounding whitespace.
isin_clean = equity_df['ISIN'].str.upper().str.strip()

# Trade dates: parse (unparseable values become NaT) and roll each date
# forward to the end of its month; dates already on a month end are unchanged.
trade_date_parsed = pd.to_datetime(equity_df['TradeDate'], errors='coerce')
trade_date_month_end = trade_date_parsed + pd.offsets.MonthEnd(0)

equity_df = equity_df.assign(ISIN=isin_clean, TradeDate=trade_date_month_end)

equity_df

Unnamed: 0,TradeDate,SecurityId,Symbol,ISIN,SecurityName,SecurityTypeId,SecurityType,IsStock,Market,CompanyId,...,LogReturnAdjGeneric,OffShareTurnover,OffTurnover,NonOffShareTurnover,NonOffTurnover,SharesIssued,DivFactor,CumDivFactor,LastQAccount,LastYAccount
0,1980-01-31,6000,NET,NO0003069908,Nettbuss Sør,1,Ordinary Shares,1,OSE,2214.0,...,,,,,,4000.0,,0.897183,,
1,1980-01-31,6006,AFK,NO0003572802,Arendals Fossekompani,1,Ordinary Shares,1,OSE,1007.0,...,,,,,,0.0,,0.203478,,
2,1980-01-31,6007,AKE,NO0003514002,Aker RGI A,2,A Shares,1,OSE,1939.0,...,,,,,,0.0,,0.611575,,
3,1980-01-31,6019,AWS,NO0003083107,Awilco ser. A,2,A Shares,1,OSE,2218.0,...,,,,,,0.0,,0.382215,,
4,1980-01-31,6026,BEL,NO0003094104,Belships,1,Ordinary Shares,1,OSE,2221.0,...,,,,,,0.0,,0.586987,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162127,2020-11-30,1305295,BWE,BMG0702P1086,BW Energy Limited,1,Ordinary Shares,1,OSE,12748.0,...,0.392772,5355947.0,1.061510e+08,2654797.0,5.541827e+07,234304300.0,,1.000000,,
162128,2020-11-30,1305313,NOL,BMG6682J1036,Northern Ocean Ltd.,1,Ordinary Shares,1,OSE,12750.0,...,0.566395,9154182.0,6.328005e+07,100844.0,6.092200e+05,63802378.0,,1.000000,,
162129,2020-11-30,1305435,PEXIP,NO0010840507,Pexip Holding,1,Ordinary Shares,1,OSE,12767.0,...,-0.105278,13306870.0,8.377522e+08,1590196.0,1.018904e+08,101563487.0,,1.000000,,
162130,2020-11-30,1305713,LINK,NO0010894231,Link Mobility Group Holding,1,Ordinary Shares,1,OSE,12811.0,...,-0.019803,3439046.0,1.826526e+08,4001558.0,2.146335e+08,270911039.0,,1.000000,,


In [35]:
security_types = equity_df['SecurityType']

# distinct security types present in the data
print(security_types.unique())

# number of rows per security type
print(security_types.value_counts())

['Ordinary Shares' 'A Shares' 'B Shares' 'New Shares' 'Other'
 'Converted Shares' 'Free Shares' 'Primary Capital Certificates'
 'Warrant - Tegningsrett' 'Converted F Shares' 'Converted B Shares'
 'New B Shares' 'Preference Shares' 'Converted A Shares'
 'Converted Primary Capital Certificates' 'Warrant - European Call'
 'Warrant - European Put' 'Warrant - Index Warrant'
 'Exchange traded funds' 'Warrant - American Call'
 'Warrant - Exchange tradable notes' 'Warrant - Bull ETN'
 'Warrant - Bear ETN']
SecurityType
Ordinary Shares                           78878
Warrant - European Call                   25158
Other                                     19236
Warrant - European Put                     9208
Primary Capital Certificates               6157
Warrant - Bull ETN                         5185
Warrant - Bear ETN                         5174
B Shares                                   4296
A Shares                                   3570
Warrant - Exchange tradable notes          1823
Exc

In [36]:
# Security types treated as equity for the rest of the cleaning.
keep_types = [
    'Ordinary Shares',
    'A Shares',
    'B Shares',
    'Free Shares',
    'Primary Capital Certificates',
    'Converted Shares',
    'Preference Shares',
]

# Retain only the rows whose SecurityType is one of the types above.
is_relevant_type = equity_df['SecurityType'].isin(keep_types)
equity_df = equity_df.loc[is_relevant_type]

# Confirm which types remain after filtering.
print(equity_df['SecurityType'].unique())

['Ordinary Shares' 'A Shares' 'B Shares' 'Converted Shares' 'Free Shares'
 'Primary Capital Certificates' 'Preference Shares']


In [59]:
# Latest TradeDate observed for every ISIN, as a two-column frame (ISIN, TradeDate).
latest_trade_date = equity_df.groupby('ISIN')['TradeDate'].max()
last_dates = latest_trade_date.reset_index()

# Attach that date to every row; the right-hand 'TradeDate' column receives the
# '_Last' suffix and therefore appears as 'TradeDate_Last'.
# NOTE(review): default inner join on 'ISIN' — rows with a missing ISIN (if any)
# presumably do not survive this merge; confirm that is intended.
equity_df_last = equity_df.merge(last_dates, on='ISIN', suffixes=('', '_Last'))

# How many ISINs end on each date, in chronological order.
last_date_per_isin = equity_df_last.groupby('ISIN')['TradeDate_Last'].first()
last_date_per_isin.value_counts().sort_index()

TradeDate_Last
1985-03-31      1
1985-09-30      1
1986-03-31      2
1986-04-30      2
1986-05-31      3
             ... 
2020-03-31      1
2020-05-31      1
2020-07-31      2
2020-09-30      1
2020-11-30    219
Name: count, Length: 300, dtype: int64

In [52]:
# number of distinct ISINs after filtering (missing values are not counted)
equity_df['ISIN'].nunique(dropna=True)

877

In [65]:
def _price_on_last_date(group):
    """Return the 'Last' price recorded on the group's final observation date.

    `group` holds the rows of one ISIN. Rows whose TradeDate equals
    TradeDate_Last are selected; if several rows share that date the first one
    in row order is used, and NaN is returned when no row matches.
    """
    on_last_date = group.loc[group['TradeDate'] == group['TradeDate_Last'], 'Last']
    return np.nan if on_last_date.empty else on_last_date.iloc[0]


# price at the last observation date for each ISIN
last_prices = (
    equity_df_last.groupby('ISIN')
    .apply(_price_on_last_date)
    .reset_index(name='Last_Price')
)

# keep only the ISINs whose last observation date is earlier than the last
# date in the dataset
dataset_end = equity_df_last['TradeDate'].max()
isins_at_dataset_end = equity_df_last.loc[
    equity_df_last['TradeDate_Last'] == dataset_end, 'ISIN'
]
last_prices = last_prices[~last_prices['ISIN'].isin(isins_at_dataset_end)]

# count how many of those ISINs have a last price under 5 NOK, or no price at all
num_under_5 = (last_prices['Last_Price'] < 5).sum()
num_under_5_missing = last_prices['Last_Price'].isna().sum()

# A bare expression in the middle of a cell is never displayed (only the final
# expression is), so the combined count is printed explicitly.
print(
    f"ISINs with last price under 5 NOK or missing: "
    f"{num_under_5 + num_under_5_missing} of {len(last_prices)}"
)

last_prices



Unnamed: 0,ISIN,Last_Price
0,ANN7425Q1095,75.50
2,BE0003806230,33.00
4,BMG0539N1020,
7,BMG0992J1018,
8,BMG1224A1080,
...,...,...
872,US45665B1061,32.40
873,US58446U2024,
874,US8938171068,191.50
875,USU872831040,0.23


In [68]:
# Spot-check: show every row recorded for a single ISIN.
equity_df.loc[equity_df['ISIN'].eq('BE0003806230')]

Unnamed: 0,TradeDate,SecurityId,Symbol,ISIN,SecurityName,SecurityTypeId,SecurityType,IsStock,Market,CompanyId,...,LogReturnAdjGeneric,OffShareTurnover,OffTurnover,NonOffShareTurnover,NonOffTurnover,SharesIssued,DivFactor,CumDivFactor,LastQAccount,LastYAccount
39618,2000-07-31,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,,,,,,5403713.0,,1.0,,
39849,2000-07-31,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,,9800.0,2127400.0,,,5403713.0,,1.0,,
40081,2000-08-31,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,-0.189621,20800.0,3744350.0,512.0,92594.0,5403713.0,,1.0,,
40314,2000-09-30,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,-0.098061,42864.0,7334050.0,240.0,41051.0,5403713.0,,1.0,,
40543,2000-10-31,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,-0.09531,9100.0,1453150.0,149.0,23745.0,5403713.0,,1.0,,
40771,2000-11-30,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,-0.068993,24850.0,3612425.0,4194.0,644150.0,5403713.0,,1.0,,
41001,2000-12-31,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,-0.154151,68562.0,7070400.0,35386.0,3190833.0,5403713.0,,1.0,,
41228,2001-01-31,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,0.080043,34068.0,4036660.0,374.0,44561.0,5403713.0,,1.0,,9793.0
41454,2001-02-28,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,-0.039221,23850.0,3046550.0,37829.0,3522584.0,5403713.0,,1.0,,9793.0
41687,2001-03-31,46405,ZENT,BE0003806230,Zenitel,1,Ordinary Shares,1,OSE,6429.0,...,-0.174353,13500.0,1537200.0,4643.0,565895.0,5441122.0,,1.0,,9793.0


In [48]:
# Identify the columns in which every single value is missing.
column_is_all_missing = equity_df_last.isna().all()
empty_columns = equity_df_last.columns[column_is_all_missing]

print(f"Empty columns: {empty_columns.tolist()}")


Empty columns: ['ReturnLast', 'ReturnAdjLast', 'LogReturnLast', 'LogReturnAdjLast']


In [41]:
# Number of distinct security types recorded under each ticker symbol.
symbol_counts = equity_df.groupby('Symbol')['SecurityType'].nunique()

# Symbols that appear with more than one security type.
has_multiple_types = symbol_counts.gt(1)
symbols_multiple_types = symbol_counts.index[has_multiple_types].tolist()

print(f"Symbols with multiple security types: {symbols_multiple_types}")

Symbols with multiple security types: ['ADE', 'ELE', 'SOR']


In [43]:
# Rows belonging to the symbols flagged as having several security types.
is_flagged_symbol = equity_df['Symbol'].isin(symbols_multiple_types)
equity_df_multiple_types = equity_df.loc[is_flagged_symbol]

# One line per distinct (Symbol, ISIN, SecurityType) combination.
identifier_columns = ['Symbol', 'ISIN', 'SecurityType']
print(equity_df_multiple_types[identifier_columns].drop_duplicates())

       Symbol          ISIN                  SecurityType
2102      ELE  NO0003538530                      B Shares
4854      SOR  NO0003001000               Ordinary Shares
21120     ADE  NO0003031809                      A Shares
33663     SOR  NO0006001502  Primary Capital Certificates
59748     ELE  NO0003055808               Ordinary Shares
144689    ADE  NO0010844038                      B Shares
