# Exercise - Dummies and VIF

Please run all the cells below and find the exercise at the bottom of the notebook.

## Importing the relevant libraries

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Switch on seaborn's default styling for every figure drawn below.
sns.set()

## Loading the raw data

In [4]:
# Read the raw data set and preview its first rows.
# The path is relative, so the CSV must be in the notebook's working directory;
# otherwise pd.read_csv raises FileNotFoundError.
raw_data = pd.read_csv('1.04. Real-life example.csv')
raw_data.head()

FileNotFoundError: [Errno 2] No such file or directory: '1.04. Real-life example.csv'

## Preprocessing

### Exploring the descriptive statistics of the variables

In [None]:
# include='all' summarises the non-numeric columns as well, not only the numeric ones.
raw_data.describe(include='all')

### Determining the variables of interest

In [None]:
# Leave the 'Model' column out of the analysis; raw_data itself is not modified.
data = raw_data.drop(columns=['Model'])
data.describe(include='all')

### Dealing with missing values

In [None]:
# Number of missing entries in each column.
data.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
data_no_mv = data.dropna(axis='index', how='any')

In [None]:
# Summary statistics after the rows with missing values have been removed.
data_no_mv.describe(include='all')

### Exploring the PDFs

In [None]:
# Distribution of Price before any outlier handling.
# NOTE(review): seaborn deprecated `distplot` in v0.11 in favour of `histplot`/`displot`;
# it is kept here because the seaborn version this notebook targets is not visible.
sns.distplot(data_no_mv['Price'])

### Dealing with outliers

In [None]:
# Trim the upper tail of Price: keep the rows strictly below its 99th percentile.
q = data_no_mv['Price'].quantile(0.99)
data_1 = data_no_mv.loc[data_no_mv['Price'].lt(q)]
data_1.describe(include='all')

In [None]:
# Distribution of Price after the top 1% of prices has been removed.
sns.distplot(data_1['Price'])

In [None]:
# Distribution of Mileage before its own outlier cut.
# Plot data_1 (the frame the next cell filters) rather than data_no_mv, so this
# "before" view shows exactly the data that is about to be trimmed.
sns.distplot(data_1['Mileage'])

In [None]:
# Trim the upper tail of Mileage: keep the rows strictly below its 99th percentile.
q = data_1['Mileage'].quantile(0.99)
data_2 = data_1.loc[data_1['Mileage'].lt(q)]

In [None]:
# Distribution of Mileage after the top 1% of mileages has been removed.
sns.distplot(data_2['Mileage'])

In [None]:
# Distribution of EngineV before its own outlier cut.
# Plot data_2 (the frame the next cell filters) rather than data_no_mv, so this
# "before" view shows exactly the data that is about to be trimmed.
sns.distplot(data_2['EngineV'])

In [None]:
# Keep only the rows whose EngineV is strictly below 6.5.
# NOTE(review): 6.5 is a hard-coded cut-off; presumably the largest plausible
# engine volume for this data set — confirm.
data_3 = data_2.loc[data_2['EngineV'].lt(6.5)]

In [None]:
# Distribution of EngineV after values of 6.5 and above have been removed.
sns.distplot(data_3['EngineV'])

In [None]:
# Distribution of Year before its own outlier cut.
# Plot data_3 (the frame the next cell filters) rather than data_no_mv, so this
# "before" view shows exactly the data that is about to be trimmed.
sns.distplot(data_3['Year'])

In [None]:
# Trim the lower tail of Year: keep the rows strictly above its 1st percentile.
q = data_3['Year'].quantile(0.01)
data_4 = data_3.loc[data_3['Year'].gt(q)]

In [None]:
# Distribution of Year after the bottom 1% of years has been removed.
sns.distplot(data_4['Year'])

In [None]:
# Renumber the rows after filtering; drop=True discards the old index
# instead of keeping it as an extra column.
data_cleaned = data_4.reset_index(drop=True)

In [None]:
# Summary statistics of the cleaned data set.
data_cleaned.describe(include='all')

## Checking the OLS assumptions

In [None]:
# Side-by-side scatter plots of Price against each numeric feature, sharing one y-axis.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(15, 3))
panels = (
    (ax1, 'Year', 'Price and Year'),
    (ax2, 'EngineV', 'Price and EngineV'),
    (ax3, 'Mileage', 'Price and Mileage'),
)
for axis, feature, title in panels:
    axis.scatter(data_cleaned[feature], data_cleaned['Price'])
    axis.set_title(title)

plt.show()

In [None]:
# Distribution of Price in the cleaned data set.
sns.distplot(data_cleaned['Price'])

### Relaxing the assumptions

In [None]:
# Natural log of Price, stored as a new 'log_price' column; the scatter plots
# further down use it on the y-axis in place of Price.
log_price = np.log(data_cleaned['Price'])
data_cleaned['log_price'] = log_price
# Preview the first rows only, like the other preview cells, instead of rendering the whole frame.
data_cleaned.head()

In [None]:
# Side-by-side scatter plots of log_price against each numeric feature, sharing one y-axis.
f, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(15, 3))
panels = (
    (ax1, 'Year', 'Log Price and Year'),
    (ax2, 'EngineV', 'Log Price and EngineV'),
    (ax3, 'Mileage', 'Log Price and Mileage'),
)
for axis, feature, title in panels:
    axis.scatter(data_cleaned[feature], data_cleaned['log_price'])
    axis.set_title(title)

plt.show()

In [None]:
# Remove the original Price column now that log_price has been added.
# errors='ignore' keeps this cell re-runnable: because it reassigns data_cleaned,
# a second execution would otherwise raise KeyError ('Price' is already gone).
data_cleaned = data_cleaned.drop(['Price'], axis=1, errors='ignore')

### Multicollinearity

In [None]:
# List the column names currently in data_cleaned as a NumPy array.
data_cleaned.columns.values

In [None]:
# Variance inflation factor of each numeric feature with respect to the other two.
# `variance_inflation_factor` is imported in the notebook's top import cell.
# NOTE(review): no intercept column is added to `variables`; confirm that VIFs
# computed on the raw columns (without a constant) are what is intended here.
variables = data_cleaned[['Mileage','Year','EngineV']]
vif = pd.DataFrame({
    "VIF": [variance_inflation_factor(variables.values, i) for i in range(variables.shape[1])],
    "features": variables.columns,
})

In [None]:
# Display the VIF table (one row per feature).
vif

In [None]:
# Continue without the 'Year' column; data_cleaned itself is left unchanged.
data_no_multicollinearity = data_cleaned.drop(columns=['Year'])

## Create dummy variables

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of
# each variable (the exercise at the bottom revisits what happens without it).
# dtype is pinned to uint8, the default before pandas 2.0: newer pandas emits bool
# dummies, and a frame mixing float and bool columns yields an object array from
# `.values`, which the VIF / regression routines cannot work with.
data_with_dummies = pd.get_dummies(data_no_multicollinearity, drop_first=True, dtype=np.uint8)

In [None]:
# Preview the first rows of the frame with the dummy columns added.
data_with_dummies.head()

### Rearrange a bit

In [None]:
# List the column names after dummy creation; the ordered `cols` list in the next cell is based on them.
data_with_dummies.columns.values

In [None]:
# Desired column order: the dependent variable first, then the numeric features,
# then the dummy columns grouped by the categorical variable they come from.
cols = [
    'log_price',
    'Mileage', 'EngineV',
    'Brand_BMW', 'Brand_Mercedes-Benz', 'Brand_Mitsubishi',
    'Brand_Renault', 'Brand_Toyota', 'Brand_Volkswagen',
    'Body_hatch', 'Body_other', 'Body_sedan', 'Body_vagon', 'Body_van',
    'Engine Type_Gas', 'Engine Type_Other', 'Engine Type_Petrol',
    'Registration_yes',
]

In [None]:
# Select the columns in the order given by `cols` (raises KeyError if any is missing),
# then preview the result.
data_preprocessed = data_with_dummies.loc[:, cols]
data_preprocessed.head()

***

***

***

# EXERCISE

### Part 1
Calculate the variance inflation factors (VIFs) for all variables contained in `data_preprocessed`. Anything strange?

### Part 2
As mentioned in the lecture, your task is to calculate the variance inflation factor (VIF) of all variables including the dummies (but without the dependent variable).

### Part 3
Now calculate the VIFs for a data frame where we include the dummies, without `log_price`, but DO NOT DROP THE FIRST DUMMY (i.e. create the dummies without `drop_first=True`). Anything strange now?