In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import statsmodels.api as sm


  _empty_series = pd.Series()


In [4]:
# Read the firm-characteristics file
characteristic = pd.read_csv('datashare.CSV')

# Index the rows by date: parse the YYYYMMDD values of the DATE column into a
# DatetimeIndex named DATE. The DATE column itself stays in the frame.
characteristic.index = pd.DatetimeIndex(
    pd.to_datetime(characteristic['DATE'], format='%Y%m%d'),
    name='DATE',
)

In [33]:
# Label every row with the calendar month (monthly Period) of its index date
characteristic['monthly'] = characteristic.index.to_period(freq='M')

#Function to select the top N firms based on a specific column
def top_n_firms(df, n, column='mvel1'):
    """Return the `n` rows of `df` that have the largest values in `column`.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to rank; it is not modified.
    n : int
        Number of rows to keep. If `df` has fewer than `n` rows, all of them
        are returned.
    column : str, default 'mvel1'
        Name of the column used for ranking.

    Returns
    -------
    pandas.DataFrame
        The selected rows, ordered from largest to smallest `column` value.
    """
    ranked_desc = df.sort_values(by=column, ascending=False)
    return ranked_desc.head(n)

#Creating dataframe with the top 500 biggest firms by month.
# group_keys=False keeps `apply` from prepending `monthly` as an extra index level,
# so the result stays indexed by DATE and no reset_index clean-up is required.
top_500_firms_monthly = characteristic.groupby('monthly', group_keys=False).apply(top_n_firms, n=500)

# Drop only the raw DATE column, which duplicates the DatetimeIndex.
# `monthly` must stay: the per-month firm count further down groups on it, and
# dropping it here makes that cell raise KeyError on a fresh top-to-bottom run.
top_500_firms_monthly = top_500_firms_monthly.drop(columns=["DATE"])

# Total mvel1 of the selected firms on each date
monthly_sum_mvel1 = top_500_firms_monthly.groupby(level='DATE')['mvel1'].sum()

# Create a new column 'weight': each firm's mvel1 as a share of its date's total.
# transform('sum') broadcasts the per-date total back onto every row, so the
# division is row-aligned without a manual index lookup.
top_500_firms_monthly['weight'] = (
    top_500_firms_monthly['mvel1']
    / top_500_firms_monthly.groupby(level='DATE')['mvel1'].transform('sum')
)

In [74]:
# Show the selected panel; the rendered table below elides the middle rows and columns
top_500_firms_monthly

Unnamed: 0_level_0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,monthly,weight
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1957-01-31,12079,1.220639e+07,0.970338,0.941557,0.003975,11.982824,0.023975,0.046433,0.011494,-0.011848,...,0.015315,1.014458e-08,0.025862,0.012795,0.249423,0.020482,1.512433e-06,37.0,1957-01,0.060141
1957-01-31,11850,1.153415e+07,1.091870,1.192180,-0.087734,11.982824,0.020358,0.046433,0.042129,0.006240,...,0.010303,5.288339e-09,0.035477,0.010849,0.227785,0.028696,1.056100e-06,29.0,1957-01,0.056829
1957-01-31,10401,1.077829e+07,0.356425,0.127039,-0.007586,11.982824,0.006234,0.046433,0.025092,-0.009652,...,0.004239,1.746008e-09,0.014053,0.003858,0.374861,0.061228,7.925874e-07,48.0,1957-01,0.053105
1957-01-31,11703,8.769161e+06,1.125589,1.266951,0.004104,11.982824,0.019420,0.046433,0.059066,-0.121581,...,0.013444,1.142168e-08,0.042781,0.015885,0.327313,0.051603,9.674371e-07,28.0,1957-01,0.043206
1957-01-31,12060,5.225362e+06,1.067303,1.139136,-0.068510,11.787028,0.024554,0.046433,0.025316,-0.020016,...,0.011841,1.153844e-08,0.016878,0.008290,0.229225,0.023876,1.101224e-06,99.0,1957-01,0.025746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-31,14983,1.932681e+07,2.182903,4.765065,-0.396635,18.109666,0.101608,0.501727,-0.005058,-0.187382,...,0.054925,9.525978e-11,0.075135,0.042451,0.490677,9.937865,5.189924e-09,59.0,2021-12,0.000430
2021-12-31,23819,1.932555e+07,1.881373,3.539564,-0.393557,17.893346,0.061014,1.447941,-0.136054,0.117819,...,0.029418,8.670976e-11,0.030507,0.025184,0.256030,3.027328,9.638035e-09,13.0,2021-12,0.000430
2021-12-31,14882,1.925462e+07,2.821339,7.985819,-0.739113,17.443011,0.077664,0.491455,-0.177142,0.018706,...,0.045002,8.306820e-11,0.040338,0.026980,0.333120,5.190913,8.065813e-09,79.0,2021-12,0.000429
2021-12-31,13379,1.921323e+07,1.031372,1.063729,0.404756,17.665754,0.053352,0.406290,-0.265866,0.359901,...,0.034447,6.288796e-11,0.029189,0.043879,0.613794,16.451724,6.483882e-09,73.0,2021-12,0.000428


In [46]:
# Number of distinct permno values in each month of the selection
unique_permno_per_month = (
    top_500_firms_monthly
    .groupby('monthly')['permno']
    .agg('nunique')
)

In [53]:
# Print the distinct per-month firm counts as an array.
# NOTE(review): the output recorded below is a full Series repr (one row per month),
# not the array this line prints -- it looks stale; re-run the cell to refresh it.
print(pd.unique(unique_permno_per_month))

monthly
1957-01    500
1957-02    500
1957-03    500
1957-04    500
1957-05    500
          ... 
2021-08    500
2021-09    500
2021-10    500
2021-11    500
2021-12    500
Freq: M, Name: permno, Length: 780, dtype: int64


In [75]:
# Distinct permno values that appear anywhere in the selection, in order of first appearance
print(pd.unique(top_500_firms_monthly["permno"]))

[12079 11850 10401 ... 22265 88439 13405]


In [76]:
# Write the distinct permno values to output.txt, formatted as integers
np.savetxt(
    fname='output.txt',
    X=top_500_firms_monthly["permno"].unique(),
    fmt='%d',
)

In [77]:
# Read the returns file (the preview further down shows PERMNO, date and RET columns)
returns = pd.read_csv(filepath_or_buffer='Permno_date_return.CSV')

In [78]:
# Preview the first five rows of the returns data
returns.head(n=5)

Unnamed: 0,PERMNO,date,RET
0,10006,1957-01-31,0.064378
1,10006,1957-02-28,0.002016
2,10006,1957-03-29,0.018405
3,10006,1957-04-30,-0.008032
4,10006,1957-05-31,0.004049
