In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [3]:
df = pd.read_csv("us2022q2a.csv")
df_additional = pd.read_csv("usfirms2022.csv", usecols=["Ticker", "Sector NAICS\nlevel 1", "Name"])
df = df.join(df_additional.set_index("Ticker"), on="firm")

In [4]:

# IV: Operating profit growth, book to market value, short debts, EPSP
df.dropna(thresh=8, inplace=True) #8 are the columns that the dataset keeps even if the organisation was not operational + 1
df['qdate'] = pd.PeriodIndex(df['q'], freq="Q")
df.set_index(['firm'], inplace=True)

In [5]:
#Operating profit growth
df['ebit'] = df["revenue"] - df["cogs"] - df["sgae"] - df["otheropexp"]
df['lebit'] = df.groupby(['firm'])['ebit'].shift(4)
lebit_tmp = df["lebit"].replace(0, np.nan)
df["operatingprofitgrowth"] = df["ebit"] / lebit_tmp

#book to maret value
df["bookvalue"] = df["totalassets"] - df["totalliabilities"]
df["marketvalue"] = df["originalprice"] * df["sharesoutstanding"]
bookvalue_tmp = df["bookvalue"].replace(0, np.nan)
df["booktomarketratio"] = df["marketvalue"] / bookvalue_tmp

#short debts
totalassets_tmp = df["totalassets"].replace(0, np.nan)
df["shortfinancialleverage"] = df["shortdebt"] / totalassets_tmp

#EPSP
df['netincome'] = df["revenue"] - df["cogs"] - df["sgae"] - df["otheropexp"] - df["incometax"] - df["finexp"] + df["extraincome"]
df["epsp"] = (df["netincome"] / df["sharesoutstanding"]) / df["originalprice"]

#F1
df['ladjprice'] = df.groupby(['firm'])['adjprice'].shift(4)
df["ccstockreturns"] = np.log(df["adjprice"]) - np.log(df["ladjprice"])
df["f1"] = df["ccstockreturns"].groupby("firm").shift(-1)

variables_interest = ["operatingprofitgrowth", "booktomarketratio", "shortfinancialleverage", "epsp", "f1"]
independent_variables = ["operatingprofitgrowth", "booktomarketratio", "shortfinancialleverage", "epsp"]
dependent_variables = ["f1"]

In [6]:
df = df.groupby("firm").tail(2).groupby("firm").head(1)
df

Unnamed: 0_level_0,q,revenue,cogs,sgae,otheropexp,extraincome,finexp,incometax,totalassets,totalliabilities,...,operatingprofitgrowth,bookvalue,marketvalue,booktomarketratio,shortfinancialleverage,netincome,epsp,ladjprice,ccstockreturns,f1
firm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,2022q1,1674000.0,764000.0,5.340000e+05,0.0,-37000.00000,20000.000,36000.0,1.032700e+07,5.173000e+06,...,1.146341,5154000.00,3.971400e+07,7.705472,0.000000,2.830000e+05,0.007126,126.034506,0.045405,-0.213296
AA,2022q1,3293000.0,2181000.0,2.130000e+05,125000.0,-70000.00000,25000.000,210000.0,1.598800e+07,9.731000e+06,...,2.345455,6257000.00,1.669183e+07,2.667706,0.000063,4.690000e+05,0.028098,32.262641,1.022496,0.217886
AAIC,2022q1,8470.0,4773.0,0.000000e+00,0.0,-4111.00000,0.000,2287.0,9.208830e+05,7.027860e+05,...,0.983768,218097.00,1.215069e+05,0.557123,,-2.701000e+03,-0.022229,4.040000,-0.152090,-0.222528
AAL,2022q1,8899000.0,0.0,1.062200e+07,0.0,92000.00000,455000.000,-451000.0,6.740100e+07,7.634100e+07,...,1.310266,-8940000.00,1.184717e+07,-1.325187,0.035341,-1.635000e+06,-0.138008,23.900000,-0.269713,-0.514447
AAME,2022q1,51608.0,0.0,4.781200e+04,0.0,0.00000,0.000,954.0,3.750310e+05,2.486080e+05,...,-6.939671,126423.00,6.378494e+04,0.504536,,2.842000e+03,0.044556,3.640795,-0.156671,-0.475675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZVIA,2022q1,38034.0,23413.0,2.327500e+04,8901.0,6669.00000,0.000,12.0,1.164800e+05,2.261900e+04,...,,93861.00,3.047556e+05,3.246882,0.005220,-1.089800e+04,-0.035760,,,
ZVO,2022q1,61633.0,39829.0,2.903600e+04,0.0,-127.00000,0.000,78.0,1.487510e+05,1.374840e+05,...,0.774553,11267.00,2.792841e+04,2.478779,0.000000,-7.437000e+03,-0.266288,4.060000,-1.599512,-0.982014
ZWS,2022q1,239600.0,137700.0,5.690000e+04,1100.0,1100.00000,4800.000,10000.0,1.118600e+06,9.249000e+05,...,0.558524,193700.00,4.452699e+06,22.987604,0.005006,3.020000e+04,0.006782,23.259723,0.416636,0.093173
ZY,2022q1,4791.0,12455.0,5.608200e+04,-130.0,-532.00000,7994.000,-26.0,6.181890e+05,2.741250e+05,...,0.762699,344064.00,2.980191e+05,0.866174,0.093572,-7.211600e+04,-0.241984,,,-3.482115


1. Realiza un análisis exploratorio de las variables:

1.1 Calcula matriz de varianza y covarianza, así como matriz de correlación de las variables independientes y la dependiente. Explicar qué es la varianza, covarianza y correlación. Interpreta la matriz de correlación. Tiene que utilizar álgebra matricial y corroborar resultados con funciones de Python. 

In [7]:
#Linear algebra version
df = df.reset_index()
variables = pd.DataFrame(data=df[variables_interest])
variables.replace(np.nan, 0, inplace=True)
x = np.matrix(variables)
xs = (1/variables.count()["f1"]) * x.T * x

m = variables.mean()
m = np.matrix(m).T
ms = m * m.T
ms

cov = pd.DataFrame(data= xs - ms)
cov.columns = variables_interest
cov.index = variables_interest
cov


Unnamed: 0,operatingprofitgrowth,booktomarketratio,shortfinancialleverage,epsp,f1
operatingprofitgrowth,402.672329,-7.469732,-0.034091,0.006796,-0.566642
booktomarketratio,-7.469732,15021.561077,0.140447,0.044877,0.288722
shortfinancialleverage,-0.034091,0.140447,0.011593,-0.000923,-0.005615
epsp,0.006796,0.044877,-0.000923,0.014566,0.037084
f1,-0.566642,0.288722,-0.005615,0.037084,0.465175


In [8]:
#Inbuilt version
variables.cov()

Unnamed: 0,operatingprofitgrowth,booktomarketratio,shortfinancialleverage,epsp,f1
operatingprofitgrowth,402.784028,-7.471804,-0.0341,0.006798,-0.566799
booktomarketratio,-7.471804,15025.727946,0.140486,0.04489,0.288802
shortfinancialleverage,-0.0341,0.140486,0.011597,-0.000923,-0.005617
epsp,0.006798,0.04489,-0.000923,0.01457,0.037094
f1,-0.566799,0.288802,-0.005617,0.037094,0.465304


In [9]:
#Linear algebra version
x = np.matrix(variables.std())
x.T * x
corr = cov / (x.T * x)
corr

Unnamed: 0,operatingprofitgrowth,booktomarketratio,shortfinancialleverage,epsp,f1
operatingprofitgrowth,0.999723,-0.003036,-0.015774,0.002805,-0.041391
booktomarketratio,-0.003036,0.999723,0.01064,0.003033,0.003453
shortfinancialleverage,-0.015774,0.01064,0.999723,-0.070971,-0.076442
epsp,0.002805,0.003033,-0.070971,0.999723,0.450387
f1,-0.041391,0.003453,-0.076442,0.450387,0.999723


In [10]:
#Inbuilt version
variables.corr()

Unnamed: 0,operatingprofitgrowth,booktomarketratio,shortfinancialleverage,epsp,f1
operatingprofitgrowth,1.0,-0.003037,-0.015778,0.002806,-0.041402
booktomarketratio,-0.003037,1.0,0.010643,0.003034,0.003454
shortfinancialleverage,-0.015778,0.010643,1.0,-0.070991,-0.076463
epsp,0.002806,0.003034,-0.070991,1.0,0.450512
f1,-0.041402,0.003454,-0.076463,0.450512,1.0


Varianza: Mide que tanto varían los datos del promedio. Se define matemáticamente por:
$$\small{\sigma^2 = \frac{\displaystyle\sum_{i=1}^{N} (x_i - \bar{x})^2}{N}} $$

Covarianza: Medida de cuanto coincide la desviación del promedio de una variable de la desviación del promedio de otra variable. Se define como:

$$\small{COV(X, Y) = \frac{\displaystyle\sum_{i=1}^{N} (x_i - \bar{x})(y_i - \bar{y})}{N}} $$

Correlación: Es la covarianza de dos variables normalizada (en una medida de [-1, 1]) por medio de la desviación estándar. Se define matematicamente como:
$$\small{\rho(X, Y) = \frac{COV(X, Y)}{SD(X)SD(Y)}} $$

1.2 Corre pruebas estadísticas para detectar outliers y leverage points. Tiene que utilizar álgebra matricial para las pruebas y explicar claramente cómo funcionan las pruebas. Puede utilizar funciones de Python para corroborar resultados.

In [11]:
variables = pd.DataFrame(data=df[independent_variables])
variables.replace(np.nan, 0, inplace=True)
ones_vector = np.ones((variables.count()["operatingprofitgrowth"], 1))

In [12]:
#Linear algebra version leverages

x = np.matrix(variables)
x = np.c_[ones_vector, x]
h = x * np.linalg.inv(x.T * x) * x.T

leverages = pd.DataFrame(data={"Leverage": np.diag(h)})
leverages = pd.concat([variables, leverages], axis=1, join="inner")

threshold_leverage = 3 * ((len(independent_variables) + 1) / variables.count()["operatingprofitgrowth"])
leverages[leverages['Leverage'] > threshold_leverage]

Unnamed: 0,operatingprofitgrowth,booktomarketratio,shortfinancialleverage,epsp,Leverage
41,0.580348,0.167253,0.020691,-1.149488,0.024636
57,1.115374,22.794987,0.824053,0.009715,0.015523
81,1.387921,0.571651,0.003806,-1.018040,0.019357
91,0.000000,0.766084,0.673480,-0.095160,0.010180
93,0.959993,10.281528,0.426055,-0.417307,0.006555
...,...,...,...,...,...
3413,4.182505,0.599549,0.306272,-0.851169,0.014577
3528,-30.111425,0.874405,0.014813,-1.354837,0.034986
3530,121.333333,0.999248,0.013630,0.261558,0.011773
3566,366.200000,2.027569,0.077280,-0.037273,0.092310


In [13]:
#Inbuilt version leverages
df.replace(np.nan, 0, inplace=True)
model = sm.OLS(df[dependent_variables], sm.add_constant(df[independent_variables])).fit()
influence = model.get_influence()
leverage = influence.hat_matrix_diag
leverages_sm = pd.DataFrame(data={"Leverage": leverage})
leverages_sm = pd.concat([variables, leverages_sm], axis=1, join="inner")
leverages_sm[leverages_sm['Leverage'] > threshold_leverage]


Unnamed: 0,operatingprofitgrowth,booktomarketratio,shortfinancialleverage,epsp,Leverage
41,0.580348,0.167253,0.020691,-1.149488,0.024636
57,1.115374,22.794987,0.824053,0.009715,0.015523
81,1.387921,0.571651,0.003806,-1.018040,0.019357
91,0.000000,0.766084,0.673480,-0.095160,0.010180
93,0.959993,10.281528,0.426055,-0.417307,0.006555
...,...,...,...,...,...
3413,4.182505,0.599549,0.306272,-0.851169,0.014577
3528,-30.111425,0.874405,0.014813,-1.354837,0.034986
3530,121.333333,0.999248,0.013630,0.261558,0.011773
3566,366.200000,2.027569,0.077280,-0.037273,0.092310


In [14]:
threshold_std_residual = 3
predicted_values = model.predict(sm.add_constant(df[independent_variables]))
errors = np.matrix(df[dependent_variables]).T - np.matrix(predicted_values)

squared_errors = np.square(errors)
mse = squared_errors.sum() / (variables.count()["operatingprofitgrowth"]-(len(independent_variables) + 1))
se = np.sqrt(mse * (1 - leverage))
influence_sum = errors / se
std_residuals = pd.DataFrame(influence_sum.T)
std_residuals.columns = ["student_resid"]
std_residuals = pd.concat([df.f1, std_residuals], axis=1)
std_residuals[std_residuals["student_resid"] > threshold_std_residual]

Unnamed: 0,f1,student_resid
368,-1.9825,4.245228
390,-0.666268,3.118474
631,-0.50588,14.721748
1262,-2.594291,4.265826
1599,-0.315254,5.417611
2676,1.591808,3.01276


In [15]:
#Standarized residuals
influence_sum_inbuilt = influence.summary_frame()
f1_res = pd.concat([df.f1, influence_sum_inbuilt], axis=1)
std_residuals_inbuilt = f1_res[["f1", "student_resid"]]
std_residuals_inbuilt[std_residuals_inbuilt["student_resid"] > threshold_std_residual]

Unnamed: 0,f1,student_resid
368,-1.9825,4.2553
390,-0.666268,3.122259
631,-0.50588,15.183718
1262,-2.594291,4.276052
1599,-0.315254,5.43907
2676,1.591808,3.016145


In [None]:
#¿Cómo funcionan las pruebas?
#TODO

2. Hace un análisis de multicolinealidad explicando la prueba e implicaciones en el modelo. 

In [16]:
vif_info = pd.DataFrame()
dependent_variables_df = df[dependent_variables]
independent_variables_df = df[independent_variables]
temporal_df = pd.concat([dependent_variables_df, independent_variables_df], axis=1)
vif_info["feature"] = temporal_df.columns
vif_info["value"] = [vif(temporal_df.values, i) for i in range(len(temporal_df.columns))]
vif_info

Unnamed: 0,feature,value
0,f1,1.330047
1,operatingprofitgrowth,1.004535
2,booktomarketratio,1.000229
3,shortfinancialleverage,1.047508
4,epsp,1.283162


Debido a que un valor conservador de VIF para problemas de multicolinearidad es de 2.5 y ninguno de nuestros valores se acerca a este valor podemos decir que no hay problemas de este tipo

3. Propone e implementa soluciones a los problemas de los puntos anteriores para que el modelo sea el más adecuado. 

In [None]:
#TODO

4. Estima e interpreta un modelo de regresión múltiple después de atender los problemas anteriores. Tiene que utilizar álgebra matricial para estimar coeficientes y errores estándar del modelo de regresión, y utilizar funciones de Python para corroborar resultados. 

In [17]:
variables = pd.DataFrame(data=df[independent_variables])
variables.replace(np.nan, 0, inplace=True)
x = np.matrix(variables)
x = np.c_[ones_vector, x]
y = np.matrix(df[dependent_variables])
bs = np.linalg.inv(x.T * x) * x.T * y
bs = pd.DataFrame(bs)
variables = ["Const"] + independent_variables
variables
bs.index = variables 
bs

Unnamed: 0,0
Const,-0.384317
operatingprofitgrowth,-0.001474
booktomarketratio,1.4e-05
shortfinancialleverage,-0.287654
epsp,2.528343


In [52]:
error = y - x * np.matrix(bs)
varcovarerror = error * error.T
error_variance = np.diag(varcovarerror)
np.matrix(error_variance)
# np.linalg.inv(x.T * x).shape
# sde = np.matrix(error_variance) * np.linalg.inv(x.T * x)
# sde

matrix([[ 1.00000000e+00,  1.14634146e+00,  7.70547210e+00,
          0.00000000e+00,  7.12594998e-03],
        [ 1.00000000e+00,  2.34545455e+00,  2.66770576e+00,
          6.25469102e-05,  2.80975699e-02],
        [ 1.00000000e+00,  9.83767962e-01,  5.57123116e-01,
          0.00000000e+00, -2.22291939e-02],
        ...,
        [ 1.00000000e+00,  5.58524173e-01,  2.29876042e+01,
          5.00625782e-03,  6.78240330e-03],
        [ 1.00000000e+00,  7.62699469e-01,  8.66173547e-01,
          9.35717070e-02, -2.41984462e-01],
        [ 1.00000000e+00,  1.12924957e+00,  1.33528421e+00,
          2.83018402e-03, -9.75748982e-02]])

In [None]:
model = sm.OLS(df[dependent_variables], sm.add_constant(df[independent_variables])).fit()
model.summary()

0,1,2,3
Dep. Variable:,f1,R-squared:,0.207
Model:,OLS,Adj. R-squared:,0.206
Method:,Least Squares,F-statistic:,234.8
Date:,"Wed, 12 Oct 2022",Prob (F-statistic):,2.37e-179
Time:,12:48:07,Log-Likelihood:,-3319.0
No. Observations:,3606,AIC:,6648.0
Df Residuals:,3601,BIC:,6679.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.3843,0.011,-36.050,0.000,-0.405,-0.363
operatingprofitgrowth,-0.0015,0.001,-2.922,0.004,-0.002,-0.000
booktomarketratio,1.362e-05,8.26e-05,0.165,0.869,-0.000,0.000
shortfinancialleverage,-0.2877,0.094,-3.052,0.002,-0.472,-0.103
epsp,2.5283,0.084,30.070,0.000,2.363,2.693

0,1,2,3
Omnibus:,745.202,Durbin-Watson:,1.986
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16469.976
Skew:,-0.404,Prob(JB):,0.0
Kurtosis:,13.439,Cond. No.,1150.0
