# Predicting Fallen Angels

In [1]:
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

**Importing Data and Finding Fallen Angels**

In [2]:
ratings = pd.read_csv('ratingdata.csv')

In [3]:
ratings.head()

Unnamed: 0,gvkey,splticrm,spsdrm,spsticrm,datadate,gsector,spcindcd,tic
0,1003,,,,2004/06/30,25.0,449.0,ANTQ
1,1003,,,,2004/07/31,25.0,449.0,ANTQ
2,1003,,,,2004/08/31,25.0,449.0,ANTQ
3,1003,,,,2004/09/30,25.0,449.0,ANTQ
4,1003,,,,2004/10/31,25.0,449.0,ANTQ


In [4]:
ratings.splticrm.isna().sum()

2174476

In [5]:
len(ratings)

2804574

In [6]:
ratings['year'] = ratings.datadate.apply(lambda x : int(x[:4]))

In [7]:
def junk_or_inv(x):
    rating_list = ['AAA', 'AA+', 'AA', 'AA-', 'A+', 'A', 'A-', 'BBB+', 'BBB']
    lower_ratings = ['BBB-', 'BB+', 'BB', 'BB-']
    
    if x in rating_list:
        return 1
    if x in lower_ratings:
        return 0
    else:
        return np.nan

In [8]:
ratings['bondgrade'] = ratings.splticrm.apply(junk_or_inv)

In [9]:
comp_type = ratings.groupby('gvkey')['bondgrade'].mean()

In [10]:
#comp_type[comp_type > 0].dropna().sort_values().to_csv('gvkey_scraping_data.csv')

In [11]:
comp_type

gvkey
1003           NaN
1004      0.349693
1009           NaN
1010           NaN
1011           NaN
            ...   
306398         NaN
308992         NaN
311524         NaN
311798         NaN
316056    0.000000
Name: bondgrade, Length: 25022, dtype: float64

In [12]:
comp_type.reset_index()

Unnamed: 0,gvkey,bondgrade
0,1003,
1,1004,0.349693
2,1009,
3,1010,
4,1011,
...,...,...
25017,306398,
25018,308992,
25019,311524,
25020,311798,


In [13]:
comp_type_year = ratings.groupby(['gvkey', 'year'])['bondgrade'].mean()

In [14]:
comp_type_year

gvkey   year
1003    2004    NaN
        2005    NaN
        2006    NaN
        2007    NaN
        2008    NaN
               ... 
316056  2013    0.0
        2014    0.0
        2015    0.0
        2016    0.0
        2017    0.0
Name: bondgrade, Length: 258898, dtype: float64

In [15]:
comp_type_year.reset_index()

Unnamed: 0,gvkey,year,bondgrade
0,1003,2004,
1,1003,2005,
2,1003,2006,
3,1003,2007,
4,1003,2008,
...,...,...,...
258893,316056,2013,0.0
258894,316056,2014,0.0
258895,316056,2015,0.0
258896,316056,2016,0.0


In [16]:
ratings_new = pd.merge(ratings[['gvkey', 'year', 'datadate', 'gsector', 'splticrm']], comp_type_year, on = ['gvkey', 'year'])

In [17]:
ratings_new

Unnamed: 0,gvkey,year,datadate,gsector,splticrm,bondgrade
0,1003,2004,2004/06/30,25.0,,
1,1003,2004,2004/07/31,25.0,,
2,1003,2004,2004/08/31,25.0,,
3,1003,2004,2004/09/30,25.0,,
4,1003,2004,2004/10/31,25.0,,
...,...,...,...,...,...,...
2804569,316056,2016,2016/10/31,20.0,BBB-,0.0
2804570,316056,2016,2016/11/30,20.0,BBB-,0.0
2804571,316056,2016,2016/12/31,20.0,BBB-,0.0
2804572,316056,2017,2017/01/31,20.0,BBB-,0.0


In [18]:
ratings_new = ratings_new.dropna(subset = ['bondgrade'])
ratings_new = ratings_new[ratings_new['bondgrade'] != 0]

In [19]:
ratings_new

Unnamed: 0,gvkey,year,datadate,gsector,splticrm,bondgrade
102,1004,1990,1990/01/31,20.0,BBB,1.0
103,1004,1990,1990/02/28,20.0,BBB,1.0
104,1004,1990,1990/03/31,20.0,BBB,1.0
105,1004,1990,1990/04/30,20.0,BBB,1.0
106,1004,1990,1990/05/31,20.0,BBB,1.0
...,...,...,...,...,...,...
2803847,287882,2016,2016/10/31,10.0,BBB,1.0
2803848,287882,2016,2016/11/30,10.0,BBB,1.0
2803849,287882,2016,2016/12/31,10.0,BBB,1.0
2803850,287882,2017,2017/01/31,10.0,BBB,1.0


In [20]:
ratings_new['bondgradeshift'] = ratings_new.bondgrade.shift(-1)
ratings_new['gvkeyshift'] = ratings_new.gvkey.shift(-1)

In [21]:
def fallen_angel(x):
    if (x.bondgrade == 1) and (x.bondgradeshift < 1) and (x.gvkey == x.gvkeyshift):
        return True
    else:
        return False

In [22]:
ratings_new['fallen_angel'] = ratings_new.apply(fallen_angel, axis = 1)

In [23]:
ratings_new = ratings_new.drop(columns = ['bondgradeshift', 'gvkeyshift'])

In [24]:
ratings_new

Unnamed: 0,gvkey,year,datadate,gsector,splticrm,bondgrade,fallen_angel
102,1004,1990,1990/01/31,20.0,BBB,1.0,False
103,1004,1990,1990/02/28,20.0,BBB,1.0,False
104,1004,1990,1990/03/31,20.0,BBB,1.0,False
105,1004,1990,1990/04/30,20.0,BBB,1.0,False
106,1004,1990,1990/05/31,20.0,BBB,1.0,False
...,...,...,...,...,...,...,...
2803847,287882,2016,2016/10/31,10.0,BBB,1.0,False
2803848,287882,2016,2016/11/30,10.0,BBB,1.0,False
2803849,287882,2016,2016/12/31,10.0,BBB,1.0,False
2803850,287882,2017,2017/01/31,10.0,BBB,1.0,False


In [25]:
ratings_new.fallen_angel.value_counts()

False    315934
True        729
Name: fallen_angel, dtype: int64

In [26]:
fallen_angels = ratings_new.groupby(['gvkey', 'year'])['fallen_angel'].max()

In [27]:
fallen_angels = fallen_angels.reset_index()
fallen_angels

Unnamed: 0,gvkey,year,fallen_angel
0,1004,1990,False
1,1004,1991,False
2,1004,1992,False
3,1004,1993,False
4,1004,1994,True
...,...,...,...
27908,287882,2013,False
27909,287882,2014,False
27910,287882,2015,False
27911,287882,2016,False


In [28]:
data = pd.read_csv('Assignment1Data6.csv')

In [29]:
data.head()

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,ajex,curcd,...,re,revt,wcap,xacc,xad,xrd,exchg,costat,prcc_f,ggroup
0,1000,1969/12/31,1969.0,INDL,C,D,STD,AE.2,1.0,USD,...,4.795,37.392,3.334,0.742,,,12.0,I,,
1,1000,1970/12/31,1970.0,INDL,C,D,STD,AE.2,1.0,USD,...,5.554,45.335,1.756,0.763,,,12.0,I,10.0,
2,1000,1971/12/31,1971.0,INDL,C,D,STD,AE.2,1.0,USD,...,3.08,47.033,11.181,1.195,,,12.0,I,5.75,
3,1000,1972/12/31,1972.0,INDL,C,D,STD,AE.2,1.0,USD,...,2.881,34.362,7.336,1.172,0.081,,12.0,I,5.125,
4,1000,1973/12/31,1973.0,INDL,C,D,STD,AE.2,1.0,USD,...,4.744,37.75,8.327,0.826,0.222,,12.0,I,1.75,


In [30]:
data_full = pd.merge(data, fallen_angels, left_on = ['gvkey', 'fyear'], right_on = ['gvkey', 'year'])

In [31]:
data_full.head()

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,ajex,curcd,...,wcap,xacc,xad,xrd,exchg,costat,prcc_f,ggroup,year,fallen_angel
0,1004,1991/05/31,1990.0,INDL,C,D,STD,AIR,1.5,USD,...,189.172,23.589,,,11.0,A,14.125,2010.0,1990,False
1,1004,1992/05/31,1991.0,INDL,C,D,STD,AIR,1.5,USD,...,197.246,18.889,,,11.0,A,12.875,2010.0,1991,False
2,1004,1993/05/31,1992.0,INDL,C,D,STD,AIR,1.5,USD,...,193.399,11.693,,,11.0,A,13.5,2010.0,1992,False
3,1004,1994/05/31,1993.0,INDL,C,D,STD,AIR,1.5,USD,...,240.009,13.312,,,11.0,A,14.375,2010.0,1993,False
4,1004,1995/05/31,1994.0,INDL,C,D,STD,AIR,1.5,USD,...,248.492,15.977,,,11.0,A,15.25,2010.0,1994,True


In [32]:
data_new = data_full[data_full['fyear'] >= 2000]

In [33]:
data_new.fallen_angel.value_counts()

False    19939
True       611
Name: fallen_angel, dtype: int64

In [34]:
data_new.fyear.value_counts()

2001.0    1325
2002.0    1288
2000.0    1284
2003.0    1280
2005.0    1227
2004.0    1220
2006.0    1197
2007.0    1149
2008.0    1123
2009.0    1092
2014.0    1092
2013.0    1064
2015.0    1054
2010.0    1054
2012.0    1050
2011.0    1045
2016.0    1025
2017.0     981
Name: fyear, dtype: int64

In [35]:
data_new.head()

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,ajex,curcd,...,wcap,xacc,xad,xrd,exchg,costat,prcc_f,ggroup,year,fallen_angel
8,1004,2001/05/31,2000.0,INDL,C,D,STD,AIR,1.0,USD,...,360.464,35.706,,,11.0,A,14.0,2010.0,2000,False
9,1004,2002/05/31,2001.0,INDL,C,D,STD,AIR,1.0,USD,...,286.192,54.563,,,11.0,A,11.44,2010.0,2001,False
10,1036,2000/12/31,2000.0,INDL,C,D,STD,UDI.,1.0,USD,...,397.145,204.08,,,11.0,I,12.1875,2010.0,2000,False
18,1048,2000/12/31,2000.0,INDL,C,D,STD,4267A,1.0,USD,...,172.0,,,,0.0,I,,1010.0,2000,False
19,1048,2001/12/31,2001.0,INDL,C,D,STD,4267A,1.0,USD,...,218.0,9.0,,,0.0,I,,1010.0,2001,True


In [36]:
data_new.isnull().sum()
df = data_new.copy()

In [37]:
## dropping unneeded cols here
df = df.drop(columns = ['indfmt', 'consol', 'popsrc', 'datafmt', 'costat'])
df = df.drop_duplicates(subset = ['gvkey', 'fyear'])

In [38]:
## cleaning 

# dropping csho = 0 or NaN
df = df[df['csho'] != 0]
# dropping revt = 0 or NaN
df = df[df['revt'] != 0]
df = df.dropna(subset=['revt', 'csho','tic','prcc_f'], how = 'any')
# dropping when csho < .1 bc having less 100k shares outstanding skews our findings drastically
df = df[df['csho'] > .1]



df['xad'] = df['xad'].fillna(0)
df['xrd'] = df['xrd'].fillna(0)


df = df.reset_index(drop=True)

In [39]:
df['ajex'] = df['ajex'].replace(0, 1)
df['ajex'] = df['ajex'].fillna(1)
df['csho_adj'] = df['csho'] / df['ajex']

In [40]:
new_df = pd.DataFrame(columns=df.columns)
for i in range(2000, 2021):
    df2 = df[df['fyear'] == i]
    df2['act'] = df2['act'].fillna(df2['act'].median())
    df2['lct'] = df2['lct'].fillna(df2['lct'].median())
    df2['bkvlps'] = df2['bkvlps'].fillna(df2['bkvlps'].median())
    df2['capx'] = df2['capx'].fillna(df2['capx'].median())
    df2['ceq'] = df2['ceq'].fillna(df2['ceq'].median())
    df2['ch'] = df2['ch'].fillna(df2['ch'].median())
    df2['cogs'] = df2['cogs'].fillna(df2['cogs'].median())
    df2['dltt'] = df2['dltt'].fillna(df2['dltt'].median())
    df2['dt'] = df2['dt'].fillna(df2['dt'].median())
    df2['dv'] = df2['dv'].fillna(df2['dv'].median())
    df2['ebit'] = df2['ebit'].fillna(df2['ebit'].median())
    df2['ebitda'] = df2['ebitda'].fillna(df2['ebitda'].median())
    df2['invt'] = df2['invt'].fillna(df2['invt'].median())
    df2['ni'] = df2['ni'].fillna(df2['ni'].median())
    df2['opeps'] = df2['opeps'].fillna(df2['opeps'].median())
    df2['re'] = df2['re'].fillna(df2['re'].median())
    df2['revt'] = df2['revt'].fillna(df2['revt'].median())
    df2['wcap'] = df2['wcap'].fillna(df2['wcap'].median())
    df2['xacc'] = df2['xacc'].fillna(df2['xacc'].median())
    
    new_df = new_df.append(df2)
    
    new_df['pm'] = new_df['ni'] / new_df['revt']
    new_df['atr'] = new_df['revt'] / new_df['at']
    new_df['fl'] = new_df['at'] / new_df['ceq']
    new_df['cr'] = new_df['act'] / new_df['lct']
    new_df['de'] = new_df['dt'] / new_df['ceq']
    new_df['roa'] = new_df['ni'] / new_df['at']

In [41]:
df = new_df.copy()

df = df[df['at'] != 0]
df = df[df['lct'] != 0]
df = df[df['act'] != 0]

df['actps'] = df['act'] / df['csho']
df['atps'] = df['at'] / df['csho']
df['lctps'] = df['lct'] / df['csho']
df['capxps'] = df['capx'] / df['csho']
df['ceqps'] = df['ceq'] / df['csho']
df['chps'] = df['ch'] / df['csho']
df['cogsps'] = df['cogs'] / df['csho']
df['dlttps'] = df['dltt'] / df['csho']
df['dtps'] = df['dt'] / df['csho']
df['dvps'] = df['dv'] / df['csho']
df['ebitps'] = df['ebit'] / df['csho']
df['ebitdaps'] = df['ebitda'] / df['csho']
df['invtps'] = df['invt'] / df['csho']
df['nips'] = df['ni'] / df['csho']
df['reps'] = df['re'] / df['csho']
df['revtps'] = df['revt'] / df['csho']
df['wcapps'] = df['wcap'] / df['csho']
df['xaccps'] = df['xacc'] / df['csho']
df['xadps'] = df['xad'] / df['csho']
df['xrdps'] =df['xrd'] / df['csho']
df['pmps'] = df['pm'] / df['csho']
df['atrps'] = df['atr'] / df['csho']
df['flps'] = df['fl'] / df['csho']
df['crps'] = df['cr'] / df['csho']
df['deps'] = df['de'] / df['csho']
df['roaps'] = df['roa'] / df['csho']

new_df = df.copy()

In [42]:
new_df.columns

Index(['gvkey', 'datadate', 'fyear', 'tic', 'ajex', 'curcd', 'fyr', 'act',
       'at', 'bkvlps', 'capx', 'ceq', 'ch', 'cogs', 'csho', 'dltt', 'dt', 'dv',
       'ebit', 'ebitda', 'invt', 'lct', 'ni', 'opeps', 're', 'revt', 'wcap',
       'xacc', 'xad', 'xrd', 'exchg', 'prcc_f', 'ggroup', 'year',
       'fallen_angel', 'csho_adj', 'pm', 'atr', 'fl', 'cr', 'de', 'roa',
       'actps', 'atps', 'lctps', 'capxps', 'ceqps', 'chps', 'cogsps', 'dlttps',
       'dtps', 'dvps', 'ebitps', 'ebitdaps', 'invtps', 'nips', 'reps',
       'revtps', 'wcapps', 'xaccps', 'xadps', 'xrdps', 'pmps', 'atrps', 'flps',
       'crps', 'deps', 'roaps'],
      dtype='object')

In [43]:
len(new_df)

13201

In [44]:
new_df.head()

Unnamed: 0,gvkey,datadate,fyear,tic,ajex,curcd,fyr,act,at,bkvlps,...,wcapps,xaccps,xadps,xrdps,pmps,atrps,flps,crps,deps,roaps
0,1004,2001/05/31,2000.0,AIR,1.0,USD,5.0,485.856,701.854,12.6299,...,13.381743,1.325537,0.0,0.0,0.000787,0.046243,0.076586,0.143843,0.019685,0.00098
2,1036,2000/12/31,2000.0,UDI.,1.0,USD,12.0,942.624,2325.377,23.5597,...,10.150671,5.216102,0.0,0.0,0.000611,0.026008,0.064478,0.044168,0.019636,0.000622
3,1075,2000/12/31,2000.0,PNW,1.0,USD,12.0,793.913,7149.151,28.0898,...,-4.684763,0.506384,0.0,0.0,0.000966,0.006085,0.035372,0.007856,0.006024,0.000499
16,1078,2000/12/31,2000.0,ABT,1.0,USD,12.0,7376.241,15283.254,5.5442,...,1.991483,1.261769,0.0,0.873921,0.000131,0.000582,0.001153,0.00111,0.0001,0.000118
49,1209,2000/09/30,2000.0,APD,1.0,USD,9.0,1805.0,8270.5,13.1702,...,2.008225,1.667452,0.0,0.580714,0.000106,0.003086,0.013684,0.006129,0.004625,7e-05


In [45]:
new_df.columns

Index(['gvkey', 'datadate', 'fyear', 'tic', 'ajex', 'curcd', 'fyr', 'act',
       'at', 'bkvlps', 'capx', 'ceq', 'ch', 'cogs', 'csho', 'dltt', 'dt', 'dv',
       'ebit', 'ebitda', 'invt', 'lct', 'ni', 'opeps', 're', 'revt', 'wcap',
       'xacc', 'xad', 'xrd', 'exchg', 'prcc_f', 'ggroup', 'year',
       'fallen_angel', 'csho_adj', 'pm', 'atr', 'fl', 'cr', 'de', 'roa',
       'actps', 'atps', 'lctps', 'capxps', 'ceqps', 'chps', 'cogsps', 'dlttps',
       'dtps', 'dvps', 'ebitps', 'ebitdaps', 'invtps', 'nips', 'reps',
       'revtps', 'wcapps', 'xaccps', 'xadps', 'xrdps', 'pmps', 'atrps', 'flps',
       'crps', 'deps', 'roaps'],
      dtype='object')

**Importing Sentiment Data from Filings**

In [46]:
import glob, os

In [47]:
path = os.getcwd()
csv_files = glob.glob(os.path.join(path + '/filing_sentiments/', "*.csv"))

sentiment_data = pd.DataFrame()

for f in csv_files:
    csv = pd.read_csv(f)
    sentiment_data = sentiment_data.append(csv)
    

In [48]:
sentiment_data

Unnamed: 0.1,Unnamed: 0,gvkey,date,report_type,sentiment_score,tot_pos,tot_neu,tot_neg,raw_text
0,0,10005,2000-12-31,10-K,0.491296,47,414,56,10-K 1 sr10k.htm STANDARD REGISTER FORM 10-K ...
1,0,10005,2001-12-30,10-K,0.530717,40,231,22,10-K 1 sr10k02.htm THE STANDARD REGISTER COMP...
2,0,10005,2002-12-29,10-K,0.473976,129,853,190,10-K 1 sr10k2002.htm UNITED STATES SECURITIES...
3,0,10005,2003-12-28,10-K,0.468210,148,897,229,10-K 1 sr10k20032.htm FORM 10-K UNITED STATES...
4,0,10005,2005-01-02,10-K,0.481028,165,887,213,10-K 1 sr10k2004.htm FORM 10-K UNITED STATES ...
...,...,...,...,...,...,...,...,...,...
12,0,9906,2011-12-31,10-K,0.449349,72,823,181,10-K 1 form10k.htm SOUTHWESTERN PUBLIC SERVIC...
13,0,9906,2012-12-31,10-K,0.444085,61,839,182,10-K 1 form10k.htm SOUTHWESTERN PUBLIC SERVIC...
14,0,9906,2013-12-31,10-K,0.440252,49,602,144,10-K 1 sps1231201310-k.htm 10-K SPS 12.31.201...
15,0,9906,2014-12-31,10-K,0.455610,72,595,144,10-K 1 sps1231201410-k.htm 10-K SPS 12.31.201...


In [49]:
sentiment_data = sentiment_data[pd.to_numeric(sentiment_data['gvkey'], errors='coerce').notnull()]

In [50]:
filing_data = sentiment_data[['gvkey', 'date', 'sentiment_score', 'tot_pos', 'tot_neu', 'tot_neg']]

In [51]:
filing_data['sum'] = filing_data['tot_neg'] + filing_data['tot_pos'] + filing_data['tot_neu']
filing_data = filing_data[filing_data['sum'] != 0]

filing_data['tot_neg'] = filing_data.tot_neg.replace(0, 1)

In [52]:
filing_data['pos_neg'] = filing_data['tot_pos'] / filing_data['tot_neg']
filing_data['pos_rat'] = filing_data['tot_pos'] / (filing_data['tot_neg'] + filing_data['tot_pos'] + filing_data['tot_neu'])

In [53]:
filing_data['year'] = filing_data['date'].apply(lambda x : int(x[:4]) + 1)
filing_data['year_2'] = filing_data['year'] + 1
filing_data['year_3'] = filing_data['year'] + 2

In [54]:
filing_data.head()

Unnamed: 0,gvkey,date,sentiment_score,tot_pos,tot_neu,tot_neg,sum,pos_neg,pos_rat,year,year_2,year_3
0,10005,2000-12-31,0.491296,47,414,56,517,0.839286,0.090909,2001,2002,2003
1,10005,2001-12-30,0.530717,40,231,22,293,1.818182,0.136519,2002,2003,2004
2,10005,2002-12-29,0.473976,129,853,190,1172,0.678947,0.110068,2003,2004,2005
3,10005,2003-12-28,0.46821,148,897,229,1274,0.646288,0.11617,2004,2005,2006
4,10005,2005-01-02,0.481028,165,887,213,1265,0.774648,0.130435,2006,2007,2008


In [55]:
all_data1 = pd.merge(new_df, filing_data[['gvkey', 'year', 'sentiment_score', 'pos_neg', 'pos_rat']], left_on = ['gvkey', 'fyear'], right_on = ['gvkey', 'year'], suffixes = ['', '_1'])
all_data2 = pd.merge(all_data1, filing_data[['gvkey', 'year_2', 'sentiment_score', 'pos_neg', 'pos_rat']], left_on = ['gvkey', 'fyear'], right_on = ['gvkey', 'year_2'], suffixes = ['', '_2'])
all_data = pd.merge(all_data2, filing_data[['gvkey', 'year_3', 'sentiment_score', 'pos_neg', 'pos_rat']], left_on = ['gvkey', 'fyear'], right_on = ['gvkey', 'year_3'], suffixes = ['', '_3'])

In [56]:
all_data = all_data.drop(['year', 'year_1', 'year_2', 'year_3'], axis = 1)
all_data['fallen_angel'] = all_data.fallen_angel.apply(lambda x : int(x))

all_data.head()

Unnamed: 0,gvkey,datadate,fyear,tic,ajex,curcd,fyr,act,at,bkvlps,...,roaps,sentiment_score,pos_neg,pos_rat,sentiment_score_2,pos_neg_2,pos_rat_2,sentiment_score_3,pos_neg_3,pos_rat_3
0,1380,2003/12/31,2003.0,HES,3.0,USD,12.0,3186.0,13983.0,51.9095,...,0.000512,0.487981,0.642857,0.043269,0.471088,0.346154,0.030612,0.49031,0.736842,0.054264
1,1487,2003/12/31,2003.0,AIG,0.05,USD,12.0,2038.046,678346.0,27.3163,...,5e-06,0.480617,0.582938,0.054185,0.482171,0.584337,0.050129,0.491009,0.743802,0.052204
2,1663,2003/12/31,2003.0,BUD.2,1.0,USD,12.0,1630.3,14689.5,3.335,...,0.000174,0.529963,2.777778,0.093633,0.544393,4.8,0.11215,0.53617,3.833333,0.097872
3,1976,2003/12/31,2003.0,BHI,1.0,USD,12.0,2523.9,6302.2,10.0916,...,6.2e-05,0.460949,0.43956,0.061256,0.485487,0.737288,0.081461,0.484199,0.717172,0.080135
4,2002,2003/12/31,2003.0,BPOP,0.2,USD,12.0,2038.046,36434.715,19.3205,...,9.7e-05,0.475057,0.241379,0.015873,0.478856,0.26087,0.014925,0.477961,0.238095,0.013774


In [57]:
all_data.columns

Index(['gvkey', 'datadate', 'fyear', 'tic', 'ajex', 'curcd', 'fyr', 'act',
       'at', 'bkvlps', 'capx', 'ceq', 'ch', 'cogs', 'csho', 'dltt', 'dt', 'dv',
       'ebit', 'ebitda', 'invt', 'lct', 'ni', 'opeps', 're', 'revt', 'wcap',
       'xacc', 'xad', 'xrd', 'exchg', 'prcc_f', 'ggroup', 'fallen_angel',
       'csho_adj', 'pm', 'atr', 'fl', 'cr', 'de', 'roa', 'actps', 'atps',
       'lctps', 'capxps', 'ceqps', 'chps', 'cogsps', 'dlttps', 'dtps', 'dvps',
       'ebitps', 'ebitdaps', 'invtps', 'nips', 'reps', 'revtps', 'wcapps',
       'xaccps', 'xadps', 'xrdps', 'pmps', 'atrps', 'flps', 'crps', 'deps',
       'roaps', 'sentiment_score', 'pos_neg', 'pos_rat', 'sentiment_score_2',
       'pos_neg_2', 'pos_rat_2', 'sentiment_score_3', 'pos_neg_3',
       'pos_rat_3'],
      dtype='object')

In [58]:
len(all_data)

4979

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

**Training Data From 2000-2010 with Fallen Angel Oversampling, Testing Data 2010-**

In [60]:
training_data = all_data[all_data['fyear'] <= 2010]
training_angels = training_data[training_data['fallen_angel'] == 1]
training_data = training_data[training_data['fallen_angel'] == 0]
training_data = training_data.sample(n = len(training_angels), random_state = 43).append(training_angels) #43

# for i in range(20):
#     training_data = training_data.append(training_angels)

testing_data = all_data[all_data['fyear'] > 2010]

In [61]:
training_data.fallen_angel.value_counts()

0    127
1    127
Name: fallen_angel, dtype: int64

**Fitting Models to Financial Data**

In [62]:
fin_model_cols_ps = ['actps', 'atps', 'bkvlps', 'capxps', 'ceqps', 'chps', 'cogsps', 'dlttps', 'dtps', 'dvps',
                   'ebitps', 'ebitdaps', 'invtps', 'lctps', 'nips', 'opeps', 'reps', 'revtps', 'wcapps',
                   'xaccps', 'xadps', 'xrdps', 'csho_adj', 'pmps', 'atrps',
                   'flps', 'crps', 'deps', 'roaps']


In [63]:
fin_model_cols = ['act', 'at', 'bkvlps', 'capx', 'ceq', 'ch', 'cogs', 'dltt', 'dt', 'dv',
                   'ebit', 'ebitda', 'invt', 'lct', 'ni', 'opeps', 're', 'revt', 'wcap',
                   'xacc', 'xad', 'xrd', 'csho_adj', 'pm', 'atr',
                   'fl', 'cr', 'de', 'roa']

lr_model_1 = LogisticRegression().fit(training_data[fin_model_cols_ps], training_data['fallen_angel'])

In [64]:
lr_model_1.score(testing_data[fin_model_cols_ps], testing_data['fallen_angel'])

0.7783350050150452

In [65]:
recall_score(testing_data['fallen_angel'], lr_model_1.predict(testing_data[fin_model_cols_ps]))

0.64

In [66]:
confusion_matrix(testing_data['fallen_angel'], lr_model_1.predict(testing_data[fin_model_cols_ps]))

array([[1520,  424],
       [  18,   32]], dtype=int64)

In [67]:
rf_model_1 = RandomForestClassifier().fit(training_data[fin_model_cols_ps], training_data['fallen_angel'])

In [68]:
rf_model_1.score(testing_data[fin_model_cols_ps], testing_data['fallen_angel'])

0.7382146439317954

In [69]:
recall_score(testing_data['fallen_angel'], np.where(rf_model_1.predict_proba(testing_data[fin_model_cols_ps])[:,1] > .45, 1, 0))

0.78

In [70]:
confusion_matrix(testing_data['fallen_angel'], np.where(rf_model_1.predict_proba(testing_data[fin_model_cols_ps])[:,1] > .45, 1, 0))

array([[1276,  668],
       [  11,   39]], dtype=int64)

In [71]:
fin_results = rf_model_1.predict_proba(testing_data[fin_model_cols_ps])[:, 0]

**Fitting Models to Textual Data**

In [72]:
text_model_cols = ['sentiment_score', 'pos_neg', 'pos_rat', 'sentiment_score_2',
                   'pos_neg_2', 'pos_rat_2', 'sentiment_score_3', 'pos_neg_3',
                   'pos_rat_3']

lr_model_2 = LogisticRegression().fit(training_data[text_model_cols], training_data['fallen_angel'])

In [73]:
lr_model_2.score(testing_data[text_model_cols], testing_data['fallen_angel'])

0.6850551654964895

In [74]:
recall_score(testing_data['fallen_angel'], lr_model_2.predict(testing_data[text_model_cols]))

0.32

In [75]:
confusion_matrix(testing_data['fallen_angel'], lr_model_2.predict(testing_data[text_model_cols]))

array([[1350,  594],
       [  34,   16]], dtype=int64)

In [76]:
rf_model_2 = RandomForestClassifier().fit(training_data[text_model_cols], training_data['fallen_angel'])

In [77]:
rf_model_2.score(testing_data[text_model_cols], testing_data['fallen_angel'])

0.5897693079237714

In [78]:
recall_score(testing_data['fallen_angel'], rf_model_2.predict(testing_data[text_model_cols]))

0.3

In [79]:
confusion_matrix(testing_data['fallen_angel'], np.where(rf_model_2.predict_proba(testing_data[text_model_cols])[:, 1] > .3, 1, 0))

array([[ 388, 1556],
       [   9,   41]], dtype=int64)

In [80]:
text_results = rf_model_2.predict_proba(testing_data[text_model_cols])[:, 0]

**Fitting Models to All Data**

In [81]:
fintext_model_cols = fin_model_cols_ps + text_model_cols

lr_model_3 = LogisticRegression().fit(training_data[fintext_model_cols], training_data['fallen_angel'])

In [82]:
lr_model_3.score(testing_data[fintext_model_cols], testing_data['fallen_angel'])

0.7477432296890673

In [83]:
recall_score(testing_data['fallen_angel'], lr_model_3.predict(testing_data[fintext_model_cols]))

0.66

In [84]:
confusion_matrix(testing_data['fallen_angel'], lr_model_3.predict(testing_data[fintext_model_cols]))

array([[1458,  486],
       [  17,   33]], dtype=int64)

In [85]:
rf_model_3 = RandomForestClassifier().fit(training_data[fintext_model_cols], training_data['fallen_angel'])

In [86]:
rf_model_3.score(testing_data[fintext_model_cols], testing_data['fallen_angel'])

0.7392176529588766

In [87]:
rf_model_3_output = np.where(rf_model_3.predict_proba(testing_data[fintext_model_cols])[:,1] > .35, 1, 0)
recall_score(testing_data['fallen_angel'], rf_model_3_output)

0.9

In [88]:
confusion_matrix(testing_data['fallen_angel'], rf_model_3_output)

array([[ 877, 1067],
       [   5,   45]], dtype=int64)

In [89]:
fintext_results = rf_model_3.predict_proba(testing_data[fintext_model_cols])[:, 0]

In [90]:
testing_data.columns

Index(['gvkey', 'datadate', 'fyear', 'tic', 'ajex', 'curcd', 'fyr', 'act',
       'at', 'bkvlps', 'capx', 'ceq', 'ch', 'cogs', 'csho', 'dltt', 'dt', 'dv',
       'ebit', 'ebitda', 'invt', 'lct', 'ni', 'opeps', 're', 'revt', 'wcap',
       'xacc', 'xad', 'xrd', 'exchg', 'prcc_f', 'ggroup', 'fallen_angel',
       'csho_adj', 'pm', 'atr', 'fl', 'cr', 'de', 'roa', 'actps', 'atps',
       'lctps', 'capxps', 'ceqps', 'chps', 'cogsps', 'dlttps', 'dtps', 'dvps',
       'ebitps', 'ebitdaps', 'invtps', 'nips', 'reps', 'revtps', 'wcapps',
       'xaccps', 'xadps', 'xrdps', 'pmps', 'atrps', 'flps', 'crps', 'deps',
       'roaps', 'sentiment_score', 'pos_neg', 'pos_rat', 'sentiment_score_2',
       'pos_neg_2', 'pos_rat_2', 'sentiment_score_3', 'pos_neg_3',
       'pos_rat_3'],
      dtype='object')

**Creating Portfolios from the Models**

In [91]:
port_data = pd.read_csv('MonthlyStockPrice.csv')
port_data = port_data.drop(columns='iid')
port_data['prcc_o'] = port_data.groupby('gvkey')['prccm'].shift(1)
port_data = port_data.rename(columns={'prccm':'prcc_f'})
port_data['datadate'] = pd.to_datetime(port_data['datadate'],format='%m/%d/%Y')
port_data['fyear'] = port_data['datadate'].dt.year
port_data = port_data[(port_data['fyear'] >= 2011) & (port_data['fyear'] <= 2016)]

FileNotFoundError: [Errno 2] No such file or directory: 'MonthlyStockPrice.csv'

In [None]:
port_data

In [None]:
testing_data['fin_results'] = fin_results
testing_data['text_results'] = text_results
testing_data['fintext_results'] = fintext_results
 

In [None]:
port_data_final = pd.merge(testing_data[['gvkey', 'fyear', 'fin_results', 'text_results', 'fintext_results']], port_data, on = ['gvkey', 'fyear'])

In [None]:
port_data_final

In [None]:
port_data_final.gvkey.value_counts()

**Fama-French Data**

In [None]:
ff = pd.read_csv('FFannual.csv')
ff = ff[(ff['Year'] >= 2011) & (ff['Year'] <= 2016)]
ff

**Portfolio from Financial Data**

In [None]:
port_fin_sorted = pd.DataFrame()
for i in range(int(port_data_final.fyear.min()), int(port_data_final.fyear.max() + 1)):
    yearly_sorted = port_data_final[port_data_final['fyear'] == i].sort_values(by = 'fin_results', ascending = False)
    port_fin_sorted = port_fin_sorted.append(yearly_sorted)

port_fin_sorted = port_fin_sorted[['gvkey','datadate' ,'fyear', 'fin_results', 'prcc_f', 'prcc_o']]

In [None]:
port_fin_sorted.drop_duplicates(subset = ['gvkey', 'datadate'], inplace = True)
port_fin_sorted = pd.merge(port_fin_sorted, ff,left_on=port_fin_sorted.fyear, right_on=ff.Year)
port_fin_sorted = port_fin_sorted[['gvkey','datadate','fyear','fin_results','prcc_f','prcc_o','Mkt-RF','SMB','HML','RF']]
port_fin_sorted['pct_change'] = (port_fin_sorted['prcc_f'] -  port_fin_sorted['prcc_o'])/port_fin_sorted['prcc_o']
port_fin_sorted = port_fin_sorted.dropna(how='any')

In [None]:
fin_port_returns = []

for i in range(2011,2016):
    yearly = port_fin_sorted[port_fin_sorted['fyear'] == i].groupby('gvkey')['pct_change'].mean()
    top_gvkeys = port_fin_sorted[port_fin_sorted['fyear'] == i]['gvkey'].unique()[:5]
    bot_gvkeys = port_fin_sorted[port_fin_sorted['fyear'] == i]['gvkey'].unique()[-5:]
    cur_year_top = []
    cur_year_bot = []  
    
    for x in range(len(yearly)):
        for y in range(len(top_gvkeys)):
            if top_gvkeys[y] == yearly.index[x]:
                top = top_gvkeys[y]
                cur_year_top.append(port_fin_sorted[(port_fin_sorted['gvkey'] == top) & (port_fin_sorted['fyear']==i)]['pct_change'].mean())
                cur_year_top_ret = np.mean(cur_year_top)
            if bot_gvkeys[y] == yearly.index[x]:
                bot = bot_gvkeys[y]
                cur_year_bot.append(port_fin_sorted[(port_fin_sorted['gvkey'] == bot) & (port_fin_sorted['fyear']==i)]['pct_change'].mean())
                cur_year_bot_ret = np.mean(cur_year_bot)
    
    fin_port_returns.append(cur_year_top_ret-cur_year_bot_ret)

  
print('2011 returns: ',fin_port_returns[0],'\n','2012 returns: ', fin_port_returns[1],'\n','2013 returns: ', fin_port_returns[2], '\n','2014 returns: ', fin_port_returns[3], '\n','2015 returns: ', fin_port_returns[4], '\n')

**Portfolio from the Text Data**

In [None]:
port_text_sorted = pd.DataFrame()
for i in range(int(port_data_final.fyear.min()), int(port_data_final.fyear.max() + 1)):
    yearly_sorted = port_data_final[port_data_final['fyear'] == i].sort_values(by = 'text_results', ascending = False)
    port_text_sorted = port_text_sorted.append(yearly_sorted)

port_text_sorted = port_text_sorted[['gvkey','datadate' ,'fyear', 'text_results', 'prcc_f', 'prcc_o']]

In [None]:
port_text_sorted.drop_duplicates(subset = ['gvkey', 'datadate'], inplace = True)
port_text_sorted = pd.merge(port_text_sorted, ff, left_on=port_text_sorted.fyear, right_on=ff.Year)
port_text_sorted = port_text_sorted[['gvkey','datadate','fyear','text_results','prcc_f','prcc_o','Mkt-RF','SMB','HML','RF']]
port_text_sorted['pct_change'] = (port_text_sorted['prcc_f'] -  port_text_sorted['prcc_o'])/port_text_sorted['prcc_o']
port_text_sorted = port_text_sorted.dropna(how='any')

In [None]:
text_port_returns = []

for i in range(2011,2016):
    yearly = port_text_sorted[port_text_sorted['fyear'] == i].groupby('gvkey')['pct_change'].mean()
    top_gvkeys = port_text_sorted[port_text_sorted['fyear'] == i]['gvkey'].unique()[:5]
    bot_gvkeys = port_text_sorted[port_text_sorted['fyear'] == i]['gvkey'].unique()[-5:]
    cur_year_top = []
    cur_year_bot = []  
    
    for x in range(len(yearly)):
        for y in range(len(top_gvkeys)):
            if top_gvkeys[y] == yearly.index[x]:
                top = top_gvkeys[y]
                cur_year_top.append(port_text_sorted[(port_text_sorted['gvkey'] == top) & (port_text_sorted['fyear']==i)]['pct_change'].mean())
                cur_year_top_ret = np.mean(cur_year_top)
            if bot_gvkeys[y] == yearly.index[x]:
                bot = bot_gvkeys[y]
                cur_year_bot.append(port_text_sorted[(port_text_sorted['gvkey'] == bot) & (port_text_sorted['fyear']==i)]['pct_change'].mean())
                cur_year_bot_ret = np.mean(cur_year_bot)
    
    text_port_returns.append(cur_year_top_ret-cur_year_bot_ret)

  
print('2011 returns: ',text_port_returns[0],'\n','2012 returns: ', text_port_returns[1],'\n','2013 returns: ', text_port_returns[2], '\n','2014 returns: ', text_port_returns[3], '\n','2015 returns: ', text_port_returns[4], '\n')

**Portfolio from the Financial & Text Data**

In [None]:
port_fintext_sorted = pd.DataFrame()
for i in range(int(port_data_final.fyear.min()), int(port_data_final.fyear.max() + 1)):
    yearly_sorted = port_data_final[port_data_final['fyear'] == i].sort_values(by = 'fintext_results', ascending = False)
    port_fintext_sorted = port_fintext_sorted.append(yearly_sorted)

port_fintext_sorted = port_fintext_sorted[['gvkey','datadate', 'fyear', 'fintext_results', 'prcc_f', 'prcc_o']]

In [None]:
port_fintext_sorted.drop_duplicates(subset = ['gvkey', 'datadate'], inplace = True)
port_fintext_sorted = pd.merge(port_fintext_sorted, ff,left_on=port_fintext_sorted.fyear, right_on=ff.Year)
port_fintext_sorted = port_fintext_sorted[['gvkey','datadate','fyear','fintext_results','prcc_f','prcc_o','Mkt-RF','SMB','HML','RF']]
port_fintext_sorted['pct_change'] = (port_fintext_sorted['prcc_f'] -  port_fintext_sorted['prcc_o'])/port_fintext_sorted['prcc_o']
port_fintext_sorted = port_fintext_sorted.dropna(how='any')

In [None]:
fintext_port_returns = []

for i in range(2011,2016):
    yearly = port_fintext_sorted[port_fintext_sorted['fyear'] == i].groupby('gvkey')['pct_change'].mean()
    top_gvkeys = port_fintext_sorted[port_fintext_sorted['fyear'] == i]['gvkey'].unique()[:5]
    bot_gvkeys = port_fintext_sorted[port_fintext_sorted['fyear'] == i]['gvkey'].unique()[-5:]
    cur_year_top = []
    cur_year_bot = []  
    
    for x in range(len(yearly)):
        for y in range(len(top_gvkeys)):
            if top_gvkeys[y] == yearly.index[x]:
                top = top_gvkeys[y]
                cur_year_top.append(port_fintext_sorted[(port_fintext_sorted['gvkey'] == top) & (port_fintext_sorted['fyear']==i)]['pct_change'].mean())
                cur_year_top_ret = np.mean(cur_year_top)
            if bot_gvkeys[y] == yearly.index[x]:
                bot = bot_gvkeys[y]
                cur_year_bot.append(port_fintext_sorted[(port_fintext_sorted['gvkey'] == bot) & (port_fintext_sorted['fyear']==i)]['pct_change'].mean())
                cur_year_bot_ret = np.mean(cur_year_bot)
    
    fintext_port_returns.append(cur_year_top_ret-cur_year_bot_ret)

   
print('2011 returns: ',fintext_port_returns[0],'\n','2012 returns: ', fintext_port_returns[1],'\n','2013 returns: ', fintext_port_returns[2], '\n','2014 returns: ', fintext_port_returns[3], '\n','2015 returns: ', fintext_port_returns[4], '\n')