# PART 5: Regression Analysis

## Issues in Linear Regression Analysis and Logistic Regression

## Linear Regression - not that easy (Anscombe Dataset)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
anscombe = pd.read_csv("anscombe.csv", index_col = "dataset", usecols = ["dataset", "x", "y"])

In [None]:
anscombe

In [None]:
anscombe.loc["I"].corr()

In [None]:
sns.lmplot(x = "x", y = "y", data = anscombe.loc["I"])
plt.plot()

In [None]:
anscombe.loc["II"].corr()

In [None]:
sns.lmplot(x = "x", y = "y", data = anscombe.loc["II"])
plt.plot()

In [None]:
anscombe.loc["III"].corr()

In [None]:
sns.lmplot(x = "x", y = "y", data = anscombe.loc["III"])
plt.plot()

In [None]:
anscombe.loc["IV"].corr()

In [None]:
sns.lmplot(x = "x", y = "y", data = anscombe.loc["IV"])
plt.plot()

In [None]:
# Fit a first-degree polynomial (straight line) to each of the four datasets
# and print the coefficient array returned by np.polyfit.
for label in ("I", "II", "III", "IV"):
    subset = anscombe.loc[label]
    print(np.polyfit(x = subset.x, y = subset.y, deg = 1))

### Detecting and handling Outliers (Part 1)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols

In [None]:
df = pd.read_csv("movies_prep.csv")

In [None]:
df

In [None]:
df.describe()

In [None]:
df.plot(figsize = (15, 10), subplots = True)
plt.show()

In [None]:
plt.figure(figsize = (12, 8))
df.budget.hist(bins = 100)
plt.show()

In [None]:
plt.figure(figsize = (12, 8))
df.boxplot(column = "budget")
plt.show()

In [None]:
df.plot(kind = "scatter", x = "budget", y = "revenue", figsize = (12, 8), fontsize = 15)
plt.xlabel("Budget (in MUSD)", fontsize = 13)
plt.ylabel("Revenue (in MUSD)", fontsize = 13)
plt.show()

In [None]:
df2 = df.copy()

In [None]:
bud_cap = np.percentile(df2.budget, 98)
bud_cap

In [None]:
df2.loc[df2.budget > bud_cap, "budget"] =  bud_cap

In [None]:
rev_cap = np.percentile(df2.revenue, 98)
rev_cap

In [None]:
df2.loc[df2.revenue > rev_cap, "revenue"] =  rev_cap

In [None]:
vote_floor = np.percentile(df2.vote_average, 1)
vote_floor

In [None]:
df2.loc[df2.vote_average < vote_floor, "vote_average"] =  vote_floor

In [None]:
vote_cap = np.percentile(df2.vote_average, 99)
vote_cap

In [None]:
df2.loc[df2.vote_average > vote_cap, "vote_average"] =  vote_cap

In [None]:
df.nlargest(10, "popularity")

In [None]:
pop_cap = np.percentile(df2.popularity, 98)
pop_cap

In [None]:
df2.loc[df2.popularity > pop_cap, "popularity"] =  pop_cap

In [None]:
df2.describe()

In [None]:
# Budget vs. revenue on the capped copy.
df2.plot.scatter(x = "budget", y = "revenue", figsize = (12, 8), fontsize = 15)
plt.xlabel("Budget (in MUSD)", fontsize = 13)
plt.ylabel("Revenue (in MUSD)", fontsize = 13)
plt.show()

### Detecting and handling Outliers (Part 2)

In [None]:
df

In [None]:
df2

In [None]:
model = ols("revenue ~ budget + popularity + vote_average + belongs_to_collection", data = df)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

In [None]:
model = ols("revenue ~ budget + popularity + vote_average + belongs_to_collection", data = df2)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

### Non-Linear Relationships - Transforming Variables

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

__Example 1__

In [None]:
# Integers 1 through 20 as the explanatory variable.
x = np.arange(1, 21, 1)
x

In [None]:
y = 2 * 1/x + 5
y

In [None]:
df = pd.DataFrame(data = {"x" : x, "y": y})

In [None]:
df

In [None]:
sns.lmplot(x = "x", y = "y", data = df)
plt.plot()

In [None]:
df["invx"] = 1 / df.x

In [None]:
df

In [None]:
sns.lmplot(x = "invx", y = "y", data = df)
plt.plot()

__Example 2__

In [None]:
# Integers 1 through 20 as the explanatory variable.
x = np.arange(1, 21, 1)
x

In [None]:
y =  x**2
y

In [None]:
df = pd.DataFrame(data = {"x" : x, "y": y})

In [None]:
df

In [None]:
sns.lmplot(x = "x", y = "y", data = df)
plt.plot()

In [None]:
df["logx"] = np.log(df.x)
df["logy"] = np.log(df.y)

In [None]:
df

In [None]:
sns.lmplot(x = "logx", y = "logy", data = df)
plt.plot()

### Detecting and Handling Multicollinearity

In [None]:
import pandas as pd
from statsmodels.formula.api import ols

In [None]:
df = pd.read_csv("movies_prep.csv")

In [None]:
df

In [None]:
df["budget_EUR"] = df.budget / 1.1

In [None]:
df

In [None]:
df.budget.corr(df.budget_EUR)

In [None]:
model = ols("revenue ~ budget + popularity + belongs_to_collection + vote_average", data = df)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

In [None]:
df_dumm = pd.get_dummies(df, columns = ["belongs_to_collection"])

In [None]:
df_dumm

In [None]:
model = ols("revenue ~ budget + popularity + belongs_to_collection_False + vote_average", data = df_dumm)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

### Detecting and Correcting Heteroskedasticity

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

In [None]:
df = pd.read_csv("movies_prep.csv")

In [None]:
df

In [None]:
plt.figure(figsize = (15, 10))  
sns.regplot(data = df, x = "budget", y = "revenue")
plt.show()

In [None]:
model = ols("revenue ~ budget" , data = df)

In [None]:
results = model.fit()

In [None]:
df["resid"] = results.resid

In [None]:
df

In [None]:
df.plot(kind = "scatter", x = "budget", y = "resid", figsize = (12, 8))
plt.show()

In [None]:
from statsmodels.stats.diagnostic import het_breuschpagan

In [None]:
bp_test = het_breuschpagan(results.resid,  results.model.exog)

In [None]:
bp_test

In [None]:
labels = ["LM Statistic", "LM-Test p-value", "F-Statistic", "F-Test p-value"]

__H0: Variance of Residuals does not depend on independent variables (Homoskedasticity)__ <br>
__Ha: Variance of Residuals depends on independent variables (Heteroskedasticity)__

In [None]:
print(dict(zip(labels, bp_test)))

In [None]:
model = ols("revenue ~ budget + popularity + belongs_to_collection + vote_average", data = df)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

In [None]:
results = model.fit(cov_type = "HC1")

In [None]:
print(results.summary())

### Serial Correlation (Autocorrelation)

In [None]:
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols

In [None]:
# Download price data for the tickers MSFT and ^SP500TR.
# auto_adjust=False keeps the separate "Adj Close" column that the next cell selects;
# recent yfinance releases default to auto_adjust=True, which drops that column
# and makes df["Adj Close"] raise a KeyError.
df = yf.download(
    ["MSFT", "^SP500TR"],
    start = "2015-12-31",
    end = "2018-12-31",
    auto_adjust = False,
)

In [None]:
# Keep only the "Adj Close" part of the download (presumably one column per ticker).
# Note: this re-binds df, so the cell cannot be re-run without re-downloading.
df = df["Adj Close"]

In [None]:
# Rename the ^SP500TR column to SPX, the name used in the formulas below.
df = df.rename(columns = {"^SP500TR":"SPX"})

In [None]:
df

In [None]:
# Percentage change from one row to the next; rows containing missing values are dropped.
ret = df.pct_change().dropna()

In [None]:
ret

__Regression of Returns (Correct)__

In [None]:
sns.lmplot(x = "SPX", y = "MSFT", data = ret)
plt.show()

In [None]:
model = ols("MSFT ~ SPX", data=ret)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

In [None]:
results.resid

1. Residuals are __normally distributed__ with mean/expected value == 0

In [None]:
# Histogram of the regression residuals.
plt.figure(figsize = (12, 8))
results.resid.plot.hist(bins = 100)
plt.title("Distribution of Residuals", fontsize = 15)
plt.show()

2. __Residuals are independent__ from each other -> the residual for one observation is not correlated with that of another observation.

In [None]:
plt.figure(figsize = (12, 8))
results.resid.plot()
plt.show()

In [None]:
lag1 = results.resid.shift()
lag1

In [None]:
results.resid.corr(lag1)

__Regression of Stock Prices (Incorrect)__

In [None]:
df

In [None]:
sns.lmplot(x = "SPX", y = "MSFT", data = df)
plt.show()

In [None]:
model = ols("MSFT ~ SPX", data=df)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

1. Residuals are __normally distributed__ with mean/expected value == 0

In [None]:
# Histogram of the residuals from the price-level regression.
plt.figure(figsize = (12, 8))
results.resid.plot.hist(bins = 100)
plt.title("Distribution of Residuals", fontsize = 15)
plt.show()

2. __Residuals are independent__ from each other -> the residual for one observation is not correlated with that of another observation.

In [None]:
plt.figure(figsize = (12, 8))
results.resid.plot()
plt.show()

In [None]:
lag1 = results.resid.shift()
lag1

In [None]:
results.resid.corr(lag1)

In [None]:
ret.plot(subplots = True, figsize = (12, 8))
plt.show()

In [None]:
df.plot(subplots = True, figsize = (12, 8))
plt.show()

### Logistic Regression with statsmodels (Part 1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
titanic = pd.read_csv("titanic.csv")

In [None]:
titanic

In [None]:
titanic.info()

In [None]:
sns.set(font_scale=1.5)
sns.lmplot(data = titanic, x = "fare", y = "survived", aspect= 1, height=8, logistic= False)
plt.show()

In [None]:
sns.set(font_scale=1.5)
sns.lmplot(data = titanic, x = "fare", y = "survived", aspect= 1, height=8, logistic= True)
plt.show()

In [None]:
from statsmodels.formula.api import logit

In [None]:
model = logit("survived ~ fare", data = titanic)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

### Logistic Regression with statsmodels (Part 2)

In [None]:
import pandas as pd
from statsmodels.formula.api import logit

In [None]:
titanic = pd.read_csv("titanic.csv")

In [None]:
titanic

In [None]:
titanic.age.fillna(titanic.age.mean(), inplace = True)

In [None]:
titanic.info()

In [None]:
titanic = pd.get_dummies(titanic, columns = ["sex", "pclass"], drop_first=False)

In [None]:
titanic

In [None]:
titanic["Rel"] = titanic.sibsp + titanic.parch 

In [None]:
titanic

In [None]:
model = logit("survived ~ fare + age + sex_male + pclass_1 + pclass_3 + Rel", data = titanic)

In [None]:
results = model.fit()

In [None]:
print(results.summary())