# PART 5: Regression Analysis

## Multiple Regression

### The Movie Dataset - Preparing the Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
movie = pd.read_csv("movies_metadata.csv", low_memory= False)

In [None]:
movie.head()

In [None]:
movie.info()

In [None]:
movie = movie.set_index(pd.to_datetime(movie.release_date, errors = "coerce")).drop(columns = ["release_date"])

In [None]:
movie.sort_index(inplace = True)

In [None]:
df = movie.loc["2010":"2016", ["title", "budget", "revenue", "runtime", "vote_average", "popularity"]].copy()

In [None]:
df

In [None]:
df.info()

In [None]:
df.budget = pd.to_numeric(df.budget, errors = "coerce")

In [None]:
df.popularity = pd.to_numeric(df.popularity, errors = "coerce")

In [None]:
df = df[(df.revenue > 0) & (df.budget > 0)]

In [None]:
df

In [None]:
df.set_index("title", inplace = True)

In [None]:
df.info()

In [None]:
df.loc[:, ["budget", "revenue"]] = df.loc[:, ["budget", "revenue"]] / 1000000

In [None]:
df

In [None]:
df.describe()

In [None]:
df.corr()

In [None]:
sns.pairplot(df, kind = "reg")
plt.plot()

### Multiple Regression Analysis with statsmodels

In [None]:
df

In [None]:
df.info()

In [None]:
from statsmodels.formula.api import ols

In [None]:
model = ols("revenue ~ budget + runtime + vote_average + popularity", data = df)

In [None]:
results = model.fit()

In [None]:
results.params

In [None]:
results.predict()

In [None]:
df.insert(loc = 2, column = "predict", value = results.predict())

In [None]:
df

In [None]:
df.revenue - df.predict

In [None]:
results.resid

In [None]:
df = df.sort_values("revenue").reset_index()

In [None]:
df

In [None]:
df.loc[:, ["revenue", "predict"]].plot(figsize = (15, 10))
plt.legend(fontsize = 15)
plt.show()

In [None]:
results.ssr

In [None]:
results.mse_resid

In [None]:
results.mse_resid**0.5

### Coefficient of Determination: the Adjusted R squared

In [None]:
# R squared of the currently fitted model (the four-predictor fit from above).
results.rsquared

In [None]:
# Adjusted R squared of the same fit, i.e. R squared corrected for the number
# of regressors in the model.
results.rsquared_adj

### Regression Coefficients, Hypothesis Testing & Model Specification

In [None]:
pd.options.display.float_format = '{:.4f}'.format

In [None]:
df

In [None]:
results.params

In [None]:
results.tvalues

In [None]:
results.pvalues

In [None]:
print(results.summary())

In [None]:
sns.regplot("runtime", "revenue", data = df)
plt.plot()

In [None]:
sns.regplot("budget", "runtime", data = df)
plt.plot()

In [None]:
model = ols("revenue ~  budget + vote_average + popularity", data = df)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

### The F-Test

In [None]:
# F statistic of the currently fitted model.
results.fvalue

In [None]:
# p-value belonging to that F statistic.
results.f_pvalue

In [None]:
# Both values also appear in the header of the full summary table.
print(results.summary())

### Creating and working with Dummy Variables (Part 1)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
movie = pd.read_csv("movies_metadata.csv", low_memory= False)

In [None]:
movie = movie.set_index(pd.to_datetime(movie.release_date, errors = "coerce")).drop(columns = ["release_date"])

In [None]:
movie.sort_index(inplace = True)

In [None]:
df = movie.loc["2010":"2016", ["title", "budget", "revenue", "vote_average", "popularity",
                              "belongs_to_collection", "original_language"]].copy()

In [None]:
df.budget = pd.to_numeric(df.budget, errors = "coerce")

In [None]:
df.popularity = pd.to_numeric(df.popularity, errors = "coerce")

In [None]:
df = df[(df.revenue > 0) & (df.budget > 0)]

In [None]:
df.loc[:, ["budget", "revenue"]] = df.loc[:, ["budget", "revenue"]] / 1000000

In [None]:
df.sort_values("revenue", ascending = False)

In [None]:
df.belongs_to_collection.notnull()

In [None]:
df.belongs_to_collection = df.belongs_to_collection.notnull()

In [None]:
df.set_index("title", inplace = True)

In [None]:
df

In [None]:
plt.figure(figsize = (12,8))
sns.set(font_scale=1.5)
sns.regplot("belongs_to_collection", "revenue", data = df)
plt.plot()

In [None]:
from statsmodels.formula.api import ols

In [None]:
model = ols("revenue ~ budget + popularity + vote_average + belongs_to_collection", data = df)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

### Creating and working with Dummy Variables (Part 2)

In [None]:
df

In [None]:
df.original_language.nunique()

In [None]:
df.original_language.value_counts()

In [None]:
df_dumm = pd.get_dummies(df, columns = ["original_language"])

In [None]:
df_dumm

In [None]:
df_dumm.columns.str.contains("language").sum()

In [None]:
plt.figure(figsize = (12,8))
sns.set(font_scale=1.5)
sns.regplot("original_language_en", "revenue", data = df_dumm)
plt.plot()

In [None]:
model = ols("revenue ~ original_language_en", data = df_dumm)

In [None]:
results = model.fit()

In [None]:
print(results.summary())

In [None]:
model = ols("revenue ~ budget + popularity + vote_average + belongs_to_collection", data = df_dumm)

In [None]:
results = model.fit()

In [None]:
print(results.summary())