# PART 5: Regression Analysis

## OLS Regression, ANOVA and Hypothesis Testing

### OLS Regression with statsmodels (Intro)

In [None]:
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [None]:
# Predictor: the integers 1 through 9.
x = np.arange(1, 10)
x

In [None]:
# Response: nine observations, one per x value.
y = np.array([ 2,  6,  7,  7, 11, 10, 15, 15, 18])
y

In [None]:
# Scatter plot of the raw data, to eyeball the relationship before fitting.
plt.figure(figsize = (12, 8))
plt.scatter(x, y)
plt.xlabel("X", fontsize = 17)
plt.ylabel("Y", fontsize = 17)
plt.show()

In [None]:
# 2x2 Pearson correlation matrix; the off-diagonal entry is corr(x, y).
np.corrcoef(x,y)

In [None]:
# Degree-1 least-squares fit. Coefficients are returned highest power first,
# i.e. [slope, intercept].
np.polyfit(x = x, y = y, deg = 1)

In [None]:
# Add a column of ones so the model also estimates an intercept. From here on
# x is 2-D: column 0 is the constant, column 1 holds the original values.
x = sm.add_constant(x)
x

In [None]:
# sm.OLS takes the response first, then the design matrix.
model = sm.OLS(y,x)

In [None]:
model

In [None]:
# Fitting returns a results object that holds the estimates and diagnostics.
results = model.fit()

In [None]:
results

In [None]:
# Estimated coefficients in design-matrix column order: [intercept, slope]
# (the reverse of the order np.polyfit returns).
results.params

In [None]:
# Fitted values for the observed x.
pred = results.predict(x)
pred

In [None]:
# Data, fitted line, and one dashed vertical segment per observation that
# spans the gap between the observed and the fitted value (the residual).
# x[:, 1] is used throughout because column 0 of x is the constant.
plt.figure(figsize=(12, 8))
plt.scatter(x=x[:, 1], y=y, alpha=0.5, label="Data")
plt.plot(x[:, 1], pred, c="r", alpha=0.5, label="Regression Line")
for x_i, y_i, fit_i in zip(x[:, 1], y, pred):
    plt.vlines(
        x_i,
        ymin=min(fit_i, y_i),
        ymax=max(fit_i, y_i),
        linestyles="dashed",
    )
plt.title("OLS Regression", fontsize=20)
plt.xlabel("X", fontsize=17)
plt.ylabel("Y", fontsize=17)
plt.legend(fontsize=17)
plt.show()

In [None]:
y

In [None]:
pred

In [None]:
errors = y-pred
errors

In [None]:
results.resid

In [None]:
sq_err = errors**2
sq_err

In [None]:
sse = np.sum(sq_err)
sse

In [None]:
results.ssr

In [None]:
mse = sse / (9-2)
mse

In [None]:
results.mse_resid

### OLS Regression with statsmodels - ANOVA

In [None]:
# Reset x to the plain 1-D predictor; earlier it was replaced by the 2-D
# design matrix returned by sm.add_constant.
x = np.arange(1, 10)
x

In [None]:
y

In [None]:
# Mean of the response: the baseline that TSS and RSS are measured against.
y.mean()

In [None]:
# Fitted values and results object are reused from the fit above.
pred

In [None]:
results

In [None]:
# Total variation: a dashed vertical segment from every observation to the
# horizontal line at the mean of y.
y_bar = y.mean()
plt.figure(figsize=(12, 8))
plt.hlines(y=y_bar, xmin=1, xmax=9, label="y mean")
plt.scatter(x=x, y=y, alpha=0.5, label="Data")
for x_i, y_i in zip(x, y):
    plt.vlines(
        x_i,
        ymin=min(y_bar, y_i),
        ymax=max(y_bar, y_i),
        linestyles="dashed",
    )
plt.title("Total Sum of Squares (TSS)", fontsize=20)
plt.xlabel("X", fontsize=17)
plt.ylabel("Y", fontsize=17)
plt.legend(fontsize=17)
plt.show()

In [None]:
# Total sum of squares: squared deviations of y from its mean.
tss = np.sum((y - y.mean())**2)
tss

In [None]:
# statsmodels' value for the same quantity (TSS centered about the mean).
results.centered_tss

In [None]:
# Explained variation: a dashed vertical segment from every fitted value to
# the horizontal line at the mean of y.
y_bar = y.mean()
plt.figure(figsize=(12, 8))
plt.hlines(y=y_bar, xmin=1, xmax=9, label="y mean")
plt.scatter(x=x, y=y, alpha=0.5, label="Data")
plt.plot(x, pred, c="r", alpha=0.5, label="Regression Line")
for x_i, fit_i in zip(x, pred):
    plt.vlines(
        x_i,
        ymin=min(y_bar, fit_i),
        ymax=max(y_bar, fit_i),
        linestyles="dashed",
    )
plt.title("Regression Sum of Squares (RSS)", fontsize=20)
plt.xlabel("X", fontsize=17)
plt.ylabel("Y", fontsize=17)
plt.legend(fontsize=17)
plt.show()

In [None]:
rss = np.sum((pred - y.mean())**2)
rss

In [None]:
results.mse_model

In [None]:
# Unexplained variation: a dashed vertical segment between every observation
# and its fitted value on the regression line.
plt.figure(figsize=(12, 8))
plt.plot(x, pred, c="r", alpha=0.5, label="Regression Line")
plt.scatter(x=x, y=y, alpha=0.5, label="Data")
for x_i, y_i, fit_i in zip(x, y, pred):
    plt.vlines(
        x_i,
        ymin=min(fit_i, y_i),
        ymax=max(fit_i, y_i),
        linestyles="dashed",
    )
plt.title("Sum of Squared Errors (SSE)", fontsize=20)
plt.xlabel("x", fontsize=17)
plt.ylabel("y", fontsize=17)
plt.legend(fontsize=17)
plt.show()

In [None]:
# SSE computed directly from the residuals ...
np.sum((y - pred)**2)

In [None]:
# ... via the decomposition TSS = RSS + SSE ...
tss - rss

In [None]:
# ... and as reported by statsmodels (`ssr` = sum of squared residuals).
results.ssr

### Coefficient of Determination - R^2

In [None]:
tss = results.centered_tss
tss

In [None]:
rss = results.mse_model
rss

In [None]:
sse = results.ssr
sse

In [None]:
r_squared = rss/tss
r_squared

In [None]:
results.rsquared

In [None]:
np.corrcoef(x, y)[0,1]**2

### OLS Regression with statsmodels and DataFrames

In [None]:
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols

In [None]:
# Load the data with release_date parsed as datetimes and used as the index.
df = pd.read_csv("bud_vs_rev.csv", parse_dates = ["release_date"], index_col = "release_date")

In [None]:
# Keep only the rows whose release_date falls in 2016.
df = df.loc["2016"]

In [None]:
df

In [None]:
# Formula interface: regress revenue on budget. Columns are looked up by name
# in df, and the formula adds the intercept term itself.
model = ols("revenue ~ budget", data = df)

In [None]:
results = model.fit()

In [None]:
# Estimated intercept and budget coefficient.
results.params

In [None]:
results.rsquared

In [None]:
tss = results.centered_tss
tss

In [None]:
rss = results.mse_model
rss

In [None]:
sse = results.ssr
sse

In [None]:
r_squared = results.rsquared
r_squared

In [None]:
rss/tss

### Confidence Intervals for Regression Coefficients - Bootstrapping

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Print arrays with 2 decimals and without scientific notation.
np.set_printoptions(precision=2, suppress= True)

In [None]:
# Load the data with release_date parsed as datetimes and used as the index.
df = pd.read_csv("bud_vs_rev.csv", parse_dates = ["release_date"], index_col = "release_date")

In [None]:
# Keep only the rows whose release_date falls in 2016.
df = df.loc["2016"]

In [None]:
# Number of bootstrap resamples.
sims = 10000

In [None]:
# Last two columns as a plain NumPy array.
# NOTE(review): the code below treats column 0 as x and column 1 as y, so this
# assumes the CSV ends with budget, revenue in that order -- confirm.
array = df.iloc[:, -2:].values
array

In [None]:
# Number of observations; every resample has this many rows.
size = array.shape[0]
size

In [None]:
# Fix the seed so the resampling is reproducible.
np.random.seed(123)
# Draw size * sims row positions from 0..size-1 (high is exclusive) with
# replacement, then reshape so each of the `sims` rows holds the `size` row
# positions of one bootstrap sample.
indxs = np.random.randint(low = 0, high = size, size = size * sims).reshape(sims, size)

In [None]:
indxs.shape

In [None]:
indxs

In [None]:
# First bootstrap sample: rows of `array` picked by the first index row.
array[indxs[0], :]

In [None]:
# Same number of rows and columns as the original data.
array[indxs[0], :].shape

In [None]:
coeff = np.empty(sims * 2).reshape(sims, 2)
for i in range(sims):
    bootstr = array[indxs[i], :]
    coeff[i] = np.polyfit(x = bootstr[:, 0], y = bootstr[:, 1], deg = 1)

In [None]:
coeff

In [None]:
coeff.shape

In [None]:
plt.figure(figsize = (12 ,8))
plt.hist(coeff[:, 0], bins = 100)
plt.title("Slope Coefficient", fontsize = 15)
plt.plot()

In [None]:
np.percentile(coeff[:, 0], [2.5, 97.5])

In [None]:
plt.figure(figsize = (12 ,8))
plt.hist(coeff[:, 1], bins = 100)
plt.title("Intercept", fontsize = 15)
plt.plot()

In [None]:
np.percentile(coeff[:, 1], [2.5, 97.5])

In [None]:
x = np.array([df.budget.min(), df.budget.max()])
x

In [None]:
df.iloc[:,-2:].plot(kind = "scatter", x = "budget", y = "revenue", figsize = (8, 8))
for i in range(sims):
    y = np.polyval(coeff[i], x)
    plt.plot(x,y, color = "red")
plt.show()

In [None]:
import seaborn as sns

In [None]:
# Same picture with seaborn: regression of revenue on budget with a
# bootstrapped band. n_boot matches the number of resamples used above, and
# ci = 100 asks for the 100% interval of the bootstrap fits.
sns.set(font_scale=1.5)
sns.lmplot(data = df, x = "budget", y = "revenue", height = 8, n_boot = 10000, ci = 100)
plt.show()

### Hypothesis Testing of Regression Coefficients

In [None]:
import pandas as pd
from statsmodels.formula.api import ols
# Show floats with 10 decimals so very small p-values are not displayed in
# scientific notation.
pd.options.display.float_format = '{:.10f}'.format

In [None]:
# Load the data with release_date parsed as datetimes and used as the index.
df = pd.read_csv("bud_vs_rev.csv", parse_dates = ["release_date"], index_col = "release_date")

In [None]:
# Keep only the rows whose release_date falls in 2016.
df = df.loc["2016"]

In [None]:
df

In [None]:
# Regress revenue on budget (the formula adds the intercept).
model = ols("revenue ~ budget", data = df)

In [None]:
results = model.fit()

In [None]:
results.params

In [None]:
results.rsquared

In [None]:
# t-statistic of each coefficient.
results.tvalues

In [None]:
# Two-sided p-values belonging to those t-statistics.
results.pvalues

### Testing with statsmodels – interpreting the Summary Table

In [None]:
import pandas as pd
from statsmodels.formula.api import ols

In [None]:
# Load the data with release_date parsed as datetimes and used as the index.
df = pd.read_csv("bud_vs_rev.csv", parse_dates = ["release_date"], index_col = "release_date")

In [None]:
# Keep only the rows whose release_date falls in 2016.
df = df.loc["2016"]

In [None]:
df

In [None]:
# Regress revenue on budget (the formula adds the intercept).
model = ols("revenue ~ budget", data=df)

In [None]:
results = model.fit()

In [None]:
# Summary object as the cell's displayed value ...
results.summary()

In [None]:
# ... and the same summary printed as plain text.
print(results.summary())

## Case Study: The Market Model (Single Factor Model)

In [None]:
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols

In [None]:
df = yf.download(["MSFT", "^SP500TR"], start = "2015-12-31", end = "2018-12-31")

In [None]:
df

In [None]:
df = df["Adj Close"]

In [None]:
df

In [None]:
# Give the index column a plain name that can be used in the formula below.
df = df.rename(columns = {"^SP500TR":"SPX"})

In [None]:
df

In [None]:
# Period-over-period percentage returns; dropna removes the rows left without
# a value (the first row has no previous price).
ret = df.pct_change().dropna()

In [None]:
ret

In [None]:
# Stock returns against index returns.
ret.plot(x = "SPX", y = "MSFT", figsize = (12, 8), kind = "scatter")
plt.show()

In [None]:
# Market model: regress MSFT returns on SPX returns. The SPX coefficient is
# the sensitivity of the stock's return to the index return.
model = ols("MSFT ~ SPX", data=ret)

In [None]:
results = model.fit()

In [None]:
print(results.summary())