<a href="https://colab.research.google.com/github/DLPY/Regression-Session-1/blob/master/Simple_Linear_Regression_(Diamonds).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import Pandas, Pyplot and Read Data



In [None]:
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
%matplotlib inline
pd.set_option('display.max_colwidth', None)

# CSV is first read in from a github raw file another option is to import the notebook to your session storage by click on the file icon on left toolbar then importing csv
! wget https://raw.githubusercontent.com/DLPY/Regression-Session-1/master/Data/Diamonds.csv

In [None]:
# Load the downloaded CSV from the working directory into a pandas DataFrame.
# Background on the dataset:
# https://medium.com/swlh/exploratory-data-analysis-21bbf3887e28
df = pd.read_csv(filepath_or_buffer='Diamonds.csv')

## 2. Investigating the Data

In [None]:
# Preview the first five rows (head() returns five rows by default)
df.head()

# Diamonds Data Set
## Overall a clean dataset with no missing values or messy data.
### approximately 54K observations
### 10 variables including:
- carat - weight of the diamond ranging between 0.2-5.01
- cut - the quality of the cut
  - 'Ideal': 1
  - 'Good': 2
  - 'Very Good': 3
  - 'Fair': 4
  - 'Premium': 5
- color - diamond color, from J (worst) to D (best)
  - 'E': 1
  - 'D': 2
  - 'F': 3
  - 'G': 4
  - 'H': 5
  - 'I': 6
  - 'J': 7
- clarity - measurement of how clear the diamond is: I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)
  - 'VVS1': 1
  - 'IF': 2
  - 'VVS2': 3
  - 'VS1': 4
  - 'I1': 5
  - 'VS2': 6
  - 'SI1': 7
  - 'SI2': 8
- depth - total depth percentage ranging between 43-79
- table - width of top of diamond relative to widest point ranging between 43-95
- price - price in US dollars, ranging from 326 to 18,823
- x (length in mm)
- y (width in mm)
- z (depth in mm)


In [None]:
# Summarise the frame: column dtypes, non-null counts per column and row count
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) to gauge the
# spread of the numeric columns
df.describe()

## 3. Exploratory Data Analysis (EDA in Diamond Dataset):

---

In [None]:
# Build the pandas-profiling report for the diamonds DataFrame.
# The title is shown at the top of the rendered report, so it must name this dataset.
profile = ProfileReport(df, title='Diamonds Profiling Report', explorative=True)

In [None]:
# Render the report inline as an iframe - for interactive exploration inside a
# Jupyter/Colab notebook only
profile.to_notebook_iframe()

In [None]:
# Export the report to a standalone HTML page so it can be shared with a wider
# audience and non-Jupyter users. The file name reflects the dataset being profiled.
profile.to_file('DiamondsProfilingReport.html')

## 4. Split the dataset to prepare for training 

---





In [None]:
from sklearn.model_selection import train_test_split

# Independent variables: every column except the target, as a NumPy array
X = df.drop(columns=['price']).values

# Dependent variable: the price column
y = df['price'].values

# Split the observations into an 80% training set and a 20% test set;
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21
)


## 5. Train the model 

In [None]:
import statsmodels.api as sm

# Fit an OLS model with statsmodels to get the full statistical summary.
# - statsmodels' OLS does not add an intercept by itself, so a constant column is
#   added explicitly; this keeps the model comparable to sklearn's LinearRegression,
#   which fits an intercept by default.
# - Only the training split is used so the held-out test rows never influence the fit.
Regression = sm.OLS(endog=y_train, exog=sm.add_constant(X_train)).fit()
print(Regression.summary())

In [None]:
from sklearn.linear_model import LinearRegression

# Create the least-squares linear model and fit it on the training split.
# fit() is left as the last expression so the fitted estimator is displayed.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Show the fitted coefficients (one per feature column) followed by the intercept,
# each on its own line
print(regressor.coef_, regressor.intercept_, sep='\n')

## 6. Predict the test values and Calculate Error

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Use the model fitted on the training split to predict the held-out test set
y_pred = regressor.predict(X=X_test)

In [None]:
from sklearn.metrics import r2_score

# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric, but
# R^2 is not: it normalises by the variance of its first argument, so swapping the
# arguments reports the wrong score.

# Mean Squared Error: average of (y_test - y_pred)^2
mse = mean_squared_error(y_test, y_pred)

# Root Mean Squared Error: square root of the MSE, in the same units as price
rmse = np.sqrt(mse)

# Coefficient of determination of the predictions against the true test prices
rsquared = r2_score(y_test, y_pred)

# Print Results
print("Linear Regression MSE: {}".format(mse))
print("Linear Regression RMSE: {}".format(rmse))
print("Coefficient of determination(r^2): {}".format(rsquared))

## 7. Visualise the Residuals of the Train and Test Sets

In [None]:
# Residual plot: predicted price on the x-axis, residual (actual - predicted) on the
# y-axis, with a horizontal reference line at residual = 0.
plt.style.use('ggplot')
# Font size must be set before drawing to affect this figure
plt.rc('font', size=12)

# Predict once per split and reuse the results
pred_train = regressor.predict(X_train)
pred_test = regressor.predict(X_test)
residual_train = y_train - pred_train
residual_test = y_test - pred_test

plt.scatter(pred_train, residual_train,
            color='green', s=10, label='Train data')
plt.scatter(pred_test, residual_test,
            color='blue', s=10, label='Test data')

# Draw the zero-residual line across the full range of predicted values
x_min = min(pred_train.min(), pred_test.min())
x_max = max(pred_train.max(), pred_test.max())
plt.hlines(y=0, xmin=x_min, xmax=x_max, linewidth=5)

plt.xlabel("Predicted price")
plt.ylabel("Residual (actual - predicted)")
plt.legend(loc='upper right')
plt.title("Residual errors")
plt.show()