<a href="https://colab.research.google.com/github/DLPY/Regression-Session-2/blob/master/Multiple_Regression_Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import Pandas, Pyplot and Read data

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

# Show the full contents of each DataFrame cell instead of truncating long values.
pd.set_option('display.max_colwidth', None)

# Download the CSV from the raw GitHub URL into the current working directory.
# Alternative: click the file icon on the left toolbar and import the CSV into
# the notebook's session storage.
! wget https://raw.githubusercontent.com/DLPY/Regression-Session-1/master/Data/Diamonds.csv

In [None]:
# Load the downloaded CSV file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer='Diamonds.csv')

## 2. Investigating the Data

In [None]:
# Preview the first five rows of the dataset
df.head(n=5)

# Diamonds Data Set
## Overall a clean dataset with no missing values or messy data.
### approximately 54K observations
### 10 variables including:
**carat** - weight of the diamond ranging between 0.2-5.01

**cut** - the quality of the cut
- 'Ideal': 1
- 'Good': 2
- 'Very Good': 3
- 'Fair': 4
- 'Premium': 5

**color** - diamond color   J (worst) to D (best)
- 'E': 1
- 'D': 2
- 'F': 3
- 'G': 4
- 'H': 5
- 'I': 6
- 'J':7

**clarity** - measurement of how clear the diamond is: I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)
- 'VVS1': 1
- 'IF': 2
- 'VVS2': 3
- 'VS1':4
- 'I1':5
- 'VS2':6
- 'SI1':7
- 'SI2':8

**depth** - total depth percentage ranging between 43-79

**table** - width of top of diamond relative to widest point ranging between 43-95

**price** - price in US dollars ranging between 326-18,823(USD)

**x** (length in mm)

**y** (width in mm)

**z** (depth in mm)


In [None]:
# Print a summary of the DataFrame: column dtypes, non-null counts and memory usage.
# Useful for spotting missing values and columns read with an unexpected type.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns,
# to get a feel for the scale and spread of each variable.
df.describe()

## Multiple ways of exploring regression, we will look into the below two methods.

1.   Scikit-learn if you don’t need detailed results and want to use the approach consistent with other regression techniques.
2.   Statsmodels if you need the advanced statistical parameters of a model.

## 3. Split the data to prepare training and testing sets

In [None]:
# Independent variables: every column except the target
X = df.drop(columns='price').to_numpy()

# Dependent variable: the price column
y = df['price'].to_numpy()

# Hold out 20% of the observations for testing; the fixed random_state keeps
# the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)
print('Training Data:', X_train.shape, y_train.shape)
print('Testing Data:', X_test.shape, y_test.shape)

## 4. Train the model 

In [None]:
# Fit a linear regression on the training split; fit() returns the estimator
# itself, so the fitted model can be bound in a single statement.
regressor = LinearRegression().fit(X_train, y_train)
regressor

In [None]:
# Show the fitted coefficients first, then the intercept on the following line
print(regressor.coef_, regressor.intercept_, sep='\n')

In [None]:
# Present the fitted coefficients as a one-row table with one column per feature.
# NOTE(review): assumes these names are in the same order as the feature columns
# used to build X - verify against df.columns.
column_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']
coefficient_df = pd.DataFrame([regressor.coef_], columns=column_names)
coefficient_df

## 5. Predict the test values and calculate error

In [None]:
# Use the model fitted on the training split to predict the target for the
# held-out test features; y_pred is compared against y_test in the next step.
y_pred = regressor.predict(X_test)

In [None]:
# scikit-learn metric functions take their arguments as (y_true, y_pred).

# Mean Squared Error: average of (y_test - y_pred)^2
mse = mean_squared_error(y_test, y_pred)

# Root Mean Squared Error: square root of the MSE
rmse = np.sqrt(mse)

# Coefficient of Determination (R-Square).
# Unlike MSE, R^2 is not symmetric in its arguments: it normalises the residual
# sum of squares by the variance of the first argument, so the true values must
# come first or the reported score is wrong.
rsquared = r2_score(y_test, y_pred)

# Print Results
print('Linear Regression MSE: {}'.format(mse))
print('Linear Regression RMSE: {}'.format(rmse))
print('Coefficient of determination(r^2): {}'.format(rsquared))

## 6. Regression on Full data using OLS model

In [None]:
# statsmodels' OLS does not include an intercept by default, so add a constant
# column to the predictors. Without it the fit is forced through the origin,
# the summary reports an uncentered R-squared, and the result is not comparable
# to the scikit-learn LinearRegression fit (which estimates an intercept).
Regression = sm.OLS(endog=y, exog=sm.add_constant(X)).fit()
print(Regression.summary())