# Conditioning on Analyst Surprise

In [165]:
import pandas as pd
import numpy as np
# import stats as sts 
import matplotlib.pyplot as plt

In [166]:
%matplotlib inline

In [167]:
#read analyst actuals file
column_names1 = ['TICKER','ANNDATS','quarternum','earn_value']
df_actual_ern=pd.read_csv("../AFE_data/df_actual_ern.csv", header = None, names = column_names1)
#df_actual_ern = pd.read_csv('/Users/Blair/Downloads/AFE_data/df_actual_ern.csv')
#read analyst forecast file
column_names2 = ['TICKER','ANNDATS','quarternum','analyst','forecast_value']
df_aforcast=pd.read_csv("../AFE_data/df_aforecast.csv", header = None, names = column_names2)
#df_aforecast = pd.read_csv('/Users/Blair/Downloads/AFE_data/df_aforecast.csv')

In [168]:
df_aforcast.head()

Unnamed: 0,TICKER,ANNDATS,quarternum,analyst,forecast_value
0,AA,20110331,0,73367,0.57
1,AA,20110630,1,130416,1.08
2,AA,20111231,3,146094,0.51
3,AA,20111231,3,146094,0.21
4,AA,20120331,0,48907,-0.03


In [169]:
#Set Year
df_actual_ern['year'] = df_actual_ern['ANNDATS'].apply(lambda x: str(x)[:4])

df_aforcast['year'] = df_aforcast['ANNDATS'].apply(lambda x: str(x)[:4])

In [170]:
#Merge two datasets
merged = pd.merge(df_aforcast,df_actual_ern,how = 'inner', on = ['TICKER','quarternum','year'])
merged

Unnamed: 0,TICKER,ANNDATS_x,quarternum,analyst,forecast_value,year,ANNDATS_y,earn_value
0,AA,20110331,0,73367,0.570,2011,20110110,0.63
1,AA,20110630,1,130416,1.080,2011,20110411,0.84
2,AA,20111231,3,146094,0.510,2011,20111011,0.42
3,AA,20111231,3,146094,0.210,2011,20111011,0.42
4,AA,20120331,0,48907,-0.030,2012,20120109,-0.09
5,AA,20120630,1,48907,0.600,2012,20120410,0.30
6,AA,20120930,2,73867,0.030,2012,20120709,0.18
7,AA,20120930,2,49831,0.000,2012,20120709,0.18
8,AA,20121231,3,49831,0.540,2012,20121009,0.09
9,AA,20130331,0,73867,0.180,2013,20130108,0.18


## Surprise Matrics

In [171]:
#Surprise Matrics

merged.index = merged[['TICKER', 'year','quarternum']]

df=merged.reset_index()
merged['meadian']=df.groupby('index').forecast_value.median()
merged['mean']=df.groupby('index').forecast_value.mean()
merged['std']=df.groupby('index').forecast_value.std()
merged['delta_mm']=merged['meadian']-merged['mean']

In [172]:
merged.head()

Unnamed: 0,TICKER,ANNDATS_x,quarternum,analyst,forecast_value,year,ANNDATS_y,earn_value,meadian,mean,std,delta_mm
"(AA, 2011, 0)",AA,20110331,0,73367,0.57,2011,20110110,0.63,0.57,0.57,,0.0
"(AA, 2011, 1)",AA,20110630,1,130416,1.08,2011,20110411,0.84,1.08,1.08,,0.0
"(AA, 2011, 3)",AA,20111231,3,146094,0.51,2011,20111011,0.42,0.36,0.36,0.212132,0.0
"(AA, 2011, 3)",AA,20111231,3,146094,0.21,2011,20111011,0.42,0.36,0.36,0.212132,0.0
"(AA, 2012, 0)",AA,20120331,0,48907,-0.03,2012,20120109,-0.09,-0.03,-0.03,,0.0


In [173]:
#Median forecast - actual
merged['surprise_median_fa']=(merged['meadian']-merged['earn_value'])/merged['std']
#Mean forecast - actual
merged['surprise_mean_fa']=(merged['mean']-merged['earn_value'])/merged['std']

## Distribution metrics

In [174]:
#the skewness of the distribution:median_mean
merged['skew']=merged['delta_mm']/merged['delta_mm'].std()
#the kurtosis of the distribution
df=merged.reset_index()
merged['kurtosis']=df.groupby('index').forecast_value.median()

In [175]:
merged.head()

Unnamed: 0,TICKER,ANNDATS_x,quarternum,analyst,forecast_value,year,ANNDATS_y,earn_value,meadian,mean,std,delta_mm,surprise_median_fa,surprise_mean_fa,skew,kurtosis
"(AA, 2011, 0)",AA,20110331,0,73367,0.57,2011,20110110,0.63,0.57,0.57,,0.0,,,0.0,0.57
"(AA, 2011, 1)",AA,20110630,1,130416,1.08,2011,20110411,0.84,1.08,1.08,,0.0,,,0.0,1.08
"(AA, 2011, 3)",AA,20111231,3,146094,0.51,2011,20111011,0.42,0.36,0.36,0.212132,0.0,-0.282843,-0.282843,0.0,0.36
"(AA, 2011, 3)",AA,20111231,3,146094,0.21,2011,20111011,0.42,0.36,0.36,0.212132,0.0,-0.282843,-0.282843,0.0,0.36
"(AA, 2012, 0)",AA,20120331,0,48907,-0.03,2012,20120109,-0.09,-0.03,-0.03,,0.0,,,0.0,-0.03


In [176]:
merged['kurtosis'].describe()

count    11481.000000
mean         0.761401
std          1.359862
min         -5.703750
25%          0.282500
50%          0.595000
75%          1.045000
max         28.350000
Name: kurtosis, dtype: float64

In [177]:
# Windows specific causes issues on linux

# merged['kurtosis'].hist(bins=100)
# plt.axvline(3, color='r')
# plt.text(15, 8000, 'leptokurtic')
# plt.text(-5, 14000,'platykurtic')

In [178]:
merged['skew'].describe()

count    11481.000000
mean         0.003659
std          1.000000
min        -18.607740
25%         -0.018385
50%          0.000000
75%          0.025126
max         35.376313
Name: skew, dtype: float64

### How do your pre-post earnings price time series from the previous section look as a function of conditioning on different values of the surprise metrics (and distribution metrics) above?

In [179]:
#This part need to combine with previous work

# Train Predictive Models

## Stylized surprise metrics dataset

### You will use as input the different metrics of the analyst distribution and its surprise metrics that you calculated for every earnings event in the previous section. As output variable you will use a categorical variable y which is 1 every time the 10-day post-earnings stock return is in the top 10% or bottom 10% for the given quarter.

In [180]:
#read daily price file
df_prc_dist=pd.read_csv("../AFE_data/df_prc_dist.csv")
#set year
df_prc_dist['year'] = df_prc_dist['ANNDATS'].apply(lambda x: str(x)[:4])


In [181]:
#Build the new dataset to calculate the 10 days return
df_prc_dist['delta_return']=df_prc_dist['10']-df_prc_dist['0']
df_prc_dist1=df_prc_dist[['delta_return','TICKER','quarternum','year']]
df_prc_dist1.head()

Unnamed: 0,delta_return,TICKER,quarternum,year
0,-0.0181,ABBV,1,2013
1,-0.0018,ABBV,2,2013
2,-0.0258,ABBV,3,2013
3,0.0305,ABBV,0,2014
4,0.0635,ABBV,1,2014


In [182]:
#Build stylized surprise metrics dataset
suprised_df = pd.DataFrame()
suprised_df['surprise_median_fa']=merged['surprise_median_fa']
suprised_df['surprise_mean_fa']=merged['surprise_mean_fa']
suprised_df['TICKER']=merged['TICKER']
suprised_df['year']=merged['year']
suprised_df['quarternum']=merged['quarternum']
suprised_df['Skew']=merged['skew']
suprised_df['Kurtosis']=merged['kurtosis']
suprised_df1 = pd.merge(suprised_df,df_prc_dist1,how = 'inner', on = ['TICKER','quarternum','year'])
suprised_df1.index = suprised_df1[['TICKER', 'year','quarternum']]


In [183]:
suprised_df1_q0=suprised_df1.loc[suprised_df1['quarternum'] == 0]
suprised_df1_q1=suprised_df1.loc[suprised_df1['quarternum'] == 1]
suprised_df1_q2=suprised_df1.loc[suprised_df1['quarternum'] == 2]
suprised_df1_q3=suprised_df1.loc[suprised_df1['quarternum'] == 3]

In [184]:
suprised_df1_q0=suprised_df1_q0.sort_values(["delta_return"],ascending=True)
bottom10_q0=suprised_df1_q0["delta_return"].quantile(0.1)
top10_q0=suprised_df1_q0["delta_return"].quantile(0.9)


In [185]:
print(bottom10_q0)
print(top10_q0)

-0.045799999999999993
0.09916000000000001


In [186]:
suprised_df1_q0['Y'] = np.where((suprised_df1_q0['delta_return']>=70.37)|(suprised_df1_q0['delta_return']<=15.73), '1', '0')

In [187]:
suprised_df1_q1=suprised_df1_q1.sort_values(["delta_return"],ascending=True)
bottom10_q1=suprised_df1_q1["delta_return"].quantile(0.1)
top10_q1=suprised_df1_q1["delta_return"].quantile(0.9)


In [188]:
print(bottom10_q1)
print(top10_q1)

-0.04349999999999998
0.06806000000000004


In [189]:
suprised_df1_q1['Y'] = np.where((suprised_df1_q1['delta_return']>=80.315)|(suprised_df1_q1['delta_return']<=20.875), '1', '0')


In [190]:
suprised_df1_q2=suprised_df1_q2.sort_values(["delta_return"],ascending=True)
bottom10_q2=suprised_df1_q2["delta_return"].quantile(0.1)
top10_q2=suprised_df1_q2["delta_return"].quantile(0.9)

In [191]:
print(bottom10_q2)
print(top10_q2)

-0.12009999999999998
0.05489999999999995


In [192]:
suprised_df1_q2['Y'] = np.where((suprised_df1_q2['delta_return']>=106.16)|(suprised_df1_q2['delta_return']<=19.86), '1', '0')

In [193]:
suprised_df1_q3=suprised_df1_q3.sort_values(["delta_return"],ascending=True)
bottom10_q3=suprised_df1_q3["delta_return"].quantile(0.1)
top10_q3=suprised_df1_q3["delta_return"].quantile(0.9)

In [194]:
print(bottom10_q3)
print(top10_q3)

-0.04590000000000005
0.08258000000000018


In [195]:
suprised_df1_q3['Y'] = np.where((suprised_df1_q3['delta_return']>=98.18)|(suprised_df1_q3['delta_return']<=16.14), '1', '0')


In [196]:
suprised_df1_new = pd.concat([suprised_df1_q0, suprised_df1_q1, suprised_df1_q2, suprised_df1_q3], axis=0)
suprised_df1_new.head()

Unnamed: 0,surprise_median_fa,surprise_mean_fa,TICKER,year,quarternum,Skew,Kurtosis,delta_return,Y
"(MSFT, 2015, 0)",-4.062651,-4.007161,MSFT,2015,0,-0.012011,0.545,-0.0989,1
"(MSFT, 2015, 0)",-4.062651,-4.007161,MSFT,2015,0,-0.012011,0.545,-0.0989,1
"(MSFT, 2015, 0)",-4.062651,-4.007161,MSFT,2015,0,-0.012011,0.545,-0.0989,1
"(MSFT, 2015, 0)",-4.062651,-4.007161,MSFT,2015,0,-0.012011,0.545,-0.0989,1
"(MSFT, 2015, 0)",-4.062651,-4.007161,MSFT,2015,0,-0.012011,0.545,-0.0989,1


## Full distribution dataset. 

○	Your input features are 
■	The relative proportions of number analysts in each of the 10 quantiles of the forecast distribution. The sum of these proportions should be 1
■	The total number of analysts in this distribution
■	The standard deviation of this distribution
■	The full range of the distribution, e.g the max_forecast-min_forecast
■	The actual earning announcement value
○	The output feature y is again 1 every time the 10-day post-earnings stock return is in the top 10% or bottom 10% of the given quarter.


In [197]:
superised_df2=merged

In [198]:
merged

Unnamed: 0,TICKER,ANNDATS_x,quarternum,analyst,forecast_value,year,ANNDATS_y,earn_value,meadian,mean,std,delta_mm,surprise_median_fa,surprise_mean_fa,skew,kurtosis
"(AA, 2011, 0)",AA,20110331,0,73367,0.570,2011,20110110,0.63,0.570,0.570000,,0.000000,,,0.000000,0.570
"(AA, 2011, 1)",AA,20110630,1,130416,1.080,2011,20110411,0.84,1.080,1.080000,,0.000000,,,0.000000,1.080
"(AA, 2011, 3)",AA,20111231,3,146094,0.510,2011,20111011,0.42,0.360,0.360000,0.212132,0.000000,-0.282843,-0.282843,0.000000,0.360
"(AA, 2011, 3)",AA,20111231,3,146094,0.210,2011,20111011,0.42,0.360,0.360000,0.212132,0.000000,-0.282843,-0.282843,0.000000,0.360
"(AA, 2012, 0)",AA,20120331,0,48907,-0.030,2012,20120109,-0.09,-0.030,-0.030000,,0.000000,,,0.000000,-0.030
"(AA, 2012, 1)",AA,20120630,1,48907,0.600,2012,20120410,0.30,0.600,0.600000,,0.000000,,,0.000000,0.600
"(AA, 2012, 2)",AA,20120930,2,73867,0.030,2012,20120709,0.18,0.015,0.015000,0.021213,0.000000,-7.778175,-7.778175,0.000000,0.015
"(AA, 2012, 2)",AA,20120930,2,49831,0.000,2012,20120709,0.18,0.015,0.015000,0.021213,0.000000,-7.778175,-7.778175,0.000000,0.015
"(AA, 2012, 3)",AA,20121231,3,49831,0.540,2012,20121009,0.09,0.540,0.540000,,0.000000,,,0.000000,0.540
"(AA, 2013, 0)",AA,20130331,0,73867,0.180,2013,20130108,0.18,0.180,0.180000,,0.000000,,,0.000000,0.180


In [201]:
# del superised_df2['skew']
# del superised_df2['kurtosis']
# del superised_df2['surprise_median_fa']
# del superised_df2['surprise_mean_fa']
# del superised_df2['ANNDATS_y']
# del superised_df2['delta_mm']
# del superised_df2['meadian']
# del superised_df2['mean']
# #del superised_df2['std']
# del superised_df2['ANNDATS_x']

superised_df2 = superised_df2[['TICKER', 'quarternum', 'analyst', 'forecast_value', 'year',
       'earn_value', 'std']]


In [202]:
# The total number of analysts in this distribution
superised_df2['Analyst_Counts'] = superised_df2.groupby(superised_df2.index)['analyst'].transform('count')

In [203]:
# The total number of analysts in this distribution
superised_df2.head()

Unnamed: 0,TICKER,quarternum,analyst,forecast_value,year,earn_value,std,Analyst_Counts
"(AA, 2011, 0)",AA,0,73367,0.57,2011,0.63,,1
"(AA, 2011, 1)",AA,1,130416,1.08,2011,0.84,,1
"(AA, 2011, 3)",AA,3,146094,0.51,2011,0.42,0.212132,2
"(AA, 2011, 3)",AA,3,146094,0.21,2011,0.42,0.212132,2
"(AA, 2012, 0)",AA,0,48907,-0.03,2012,-0.09,,1


In [204]:
#The relative proportions of number analysts in each of the 10 quantiles of the forecast distribution. The sum of these proportions should be 1

#superised_df2["forecast_value"].quantile(0.1)


In [209]:
#The standard deviation of this distribution
superised_df2['std']=superised_df2.groupby(superised_df2.index).forecast_value.std()

In [210]:
#The full range of the distribution, e.g the max_forecast-min_forecast 

superised_df2_mm=superised_df2.reset_index()
superised_df2['Max']=superised_df2_mm.groupby('index').forecast_value.max()
superised_df2['Min']=superised_df2_mm.groupby('index').forecast_value.min()
superised_df2['the max_forecast-min_forecast']=(superised_df2['Max']-superised_df2['Min'])
#del superised_df2['Max']
#del superised_df2['Min']

In [227]:
unique_earnings_events = superised_df2[superised_df2.Analyst_Counts>1.][['TICKER','year','quarternum']].drop_duplicates().as_matrix()

In [377]:
df_outputs = []
j = 0.
for earnings_event in unique_earnings_events:
    print(j,len(unique_earnings_events))
    j+=1.
    
    filter_df = superised_df2[(superised_df2.TICKER==earnings_event[0]) & 
                 (superised_df2.year==earnings_event[1])& 
                 (superised_df2.quarternum==earnings_event[2])]

#     qs, bins = pd.qcut(filter_df.forecast_value,10,retbins=True)
    
    col = 'forecast_value'
    output = {}
    step=10

#     step=10
    for i in range(0,100,step):
        bins = [filter_df.forecast_value.quantile(i/100.),filter_df.forecast_value.quantile((i+step)/100.)]
        ct = filter_df[(filter_df.forecast_value>=bins[0]) & (filter_df.forecast_value<=bins[1])].shape[0]    
        output['quantile_forecast_value_count_' + str(i)] = float(ct)
        
        temp = pd.DataFrame([output],index=filter_df.index.unique())
        div = float(temp.sum(axis=1))
        
        temp = temp/div
        
        df_outputs.append(temp)


0.0 1988
1.0 1988
2.0 1988
3.0 1988
4.0 1988
5.0 1988
6.0 1988
7.0 1988
8.0 1988
9.0 1988
10.0 1988
11.0 1988
12.0 1988
13.0 1988
14.0 1988
15.0 1988
16.0 1988
17.0 1988
18.0 1988
19.0 1988
20.0 1988
21.0 1988
22.0 1988
23.0 1988
24.0 1988
25.0 1988
26.0 1988
27.0 1988
28.0 1988
29.0 1988
30.0 1988
31.0 1988
32.0 1988
33.0 1988
34.0 1988
35.0 1988
36.0 1988
37.0 1988
38.0 1988
39.0 1988
40.0 1988
41.0 1988
42.0 1988
43.0 1988
44.0 1988
45.0 1988
46.0 1988
47.0 1988
48.0 1988
49.0 1988
50.0 1988
51.0 1988
52.0 1988
53.0 1988
54.0 1988
55.0 1988
56.0 1988
57.0 1988
58.0 1988
59.0 1988
60.0 1988
61.0 1988
62.0 1988
63.0 1988
64.0 1988
65.0 1988
66.0 1988
67.0 1988
68.0 1988
69.0 1988
70.0 1988
71.0 1988
72.0 1988
73.0 1988
74.0 1988
75.0 1988
76.0 1988
77.0 1988
78.0 1988
79.0 1988
80.0 1988
81.0 1988
82.0 1988
83.0 1988
84.0 1988
85.0 1988
86.0 1988
87.0 1988
88.0 1988
89.0 1988
90.0 1988
91.0 1988
92.0 1988
93.0 1988
94.0 1988
95.0 1988
96.0 1988
97.0 1988
98.0 1988
99.0 1988
100.0 1988

756.0 1988
757.0 1988
758.0 1988
759.0 1988
760.0 1988
761.0 1988
762.0 1988
763.0 1988
764.0 1988
765.0 1988
766.0 1988
767.0 1988
768.0 1988
769.0 1988
770.0 1988
771.0 1988
772.0 1988
773.0 1988
774.0 1988
775.0 1988
776.0 1988
777.0 1988
778.0 1988
779.0 1988
780.0 1988
781.0 1988
782.0 1988
783.0 1988
784.0 1988
785.0 1988
786.0 1988
787.0 1988
788.0 1988
789.0 1988
790.0 1988
791.0 1988
792.0 1988
793.0 1988
794.0 1988
795.0 1988
796.0 1988
797.0 1988
798.0 1988
799.0 1988
800.0 1988
801.0 1988
802.0 1988
803.0 1988
804.0 1988
805.0 1988
806.0 1988
807.0 1988
808.0 1988
809.0 1988
810.0 1988
811.0 1988
812.0 1988
813.0 1988
814.0 1988
815.0 1988
816.0 1988
817.0 1988
818.0 1988
819.0 1988
820.0 1988
821.0 1988
822.0 1988
823.0 1988
824.0 1988
825.0 1988
826.0 1988
827.0 1988
828.0 1988
829.0 1988
830.0 1988
831.0 1988
832.0 1988
833.0 1988
834.0 1988
835.0 1988
836.0 1988
837.0 1988
838.0 1988
839.0 1988
840.0 1988
841.0 1988
842.0 1988
843.0 1988
844.0 1988
845.0 1988
846.0 1988

1464.0 1988
1465.0 1988
1466.0 1988
1467.0 1988
1468.0 1988
1469.0 1988
1470.0 1988
1471.0 1988
1472.0 1988
1473.0 1988
1474.0 1988
1475.0 1988
1476.0 1988
1477.0 1988
1478.0 1988
1479.0 1988
1480.0 1988
1481.0 1988
1482.0 1988
1483.0 1988
1484.0 1988
1485.0 1988
1486.0 1988
1487.0 1988
1488.0 1988
1489.0 1988
1490.0 1988
1491.0 1988
1492.0 1988
1493.0 1988
1494.0 1988
1495.0 1988
1496.0 1988
1497.0 1988
1498.0 1988
1499.0 1988
1500.0 1988
1501.0 1988
1502.0 1988
1503.0 1988
1504.0 1988
1505.0 1988
1506.0 1988
1507.0 1988
1508.0 1988
1509.0 1988
1510.0 1988
1511.0 1988
1512.0 1988
1513.0 1988
1514.0 1988
1515.0 1988
1516.0 1988
1517.0 1988
1518.0 1988
1519.0 1988
1520.0 1988
1521.0 1988
1522.0 1988
1523.0 1988
1524.0 1988
1525.0 1988
1526.0 1988
1527.0 1988
1528.0 1988
1529.0 1988
1530.0 1988
1531.0 1988
1532.0 1988
1533.0 1988
1534.0 1988
1535.0 1988
1536.0 1988
1537.0 1988
1538.0 1988
1539.0 1988
1540.0 1988
1541.0 1988
1542.0 1988
1543.0 1988
1544.0 1988
1545.0 1988
1546.0 1988
1547

In [379]:
df_outputs = pd.concat(df_outputs)
df_outputs = df_outputs.fillna(0.)

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [383]:
superised_df2  = superised_df2.join(df_outputs)