# <center>AISC2006 - Step Presentation (Step 2)</center>
## <center>Presentation - 1</center>

## <center>Predicting the price of Diamonds</center>

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sns.set_style("whitegrid")

import warnings
warnings.filterwarnings("ignore")

### Load Dataset

In [None]:
# Read the diamonds data; the path is relative to the current working directory
diamonds = pd.read_csv("Data/diamond.csv")

In [None]:
# Module-level accumulator that get_column_names() adds to
features = []


def get_column_names(column_names):
    """Append every entry of ``column_names`` to the global ``features`` list.

    The list is extended in place, so repeated calls keep accumulating
    names. Nothing is returned.
    """
    features.extend(column_names)

In [None]:
# Collect the dataset's column labels into `features`, then display the list.
# NOTE: get_column_names() only appends, so re-running this cell adds the
# same names to `features` again.
get_column_names(diamonds.columns)
features


### Data cleaning

In [None]:
# Size of the raw dataset as (rows, columns)
diamonds.shape

In [None]:
# Count the missing (null) values in each column
diamonds.isnull().sum()

In [None]:
# Select the rows that duplicated() flags as repeats of another row and
# preview the first few of them
diamonds_duplicate = diamonds[diamonds.duplicated()]
diamonds_duplicate.head()

In [None]:
# Remove the duplicate rows found above (drop_duplicates returns a new frame)
diamonds = diamonds.drop_duplicates()

In [None]:
# Size of the dataset after removing duplicate rows
diamonds.shape

In [None]:
# Keep an independent snapshot of the de-duplicated data.
# A plain assignment (`diamonds_copy = diamonds`) would only bind a second
# name to the same DataFrame, so later in-place column changes to `diamonds`
# (such as the label encoding) would also show up in `diamonds_copy`.
diamonds_copy = diamonds.copy()
diamonds_copy.head()

### Exploratory Data Analysis

In [None]:
# Number of unique values in each column
diamonds.nunique()

In [None]:
# Subplot grid dimensions: r rows by c columns
r = 3
c = 2

fig, axes = plt.subplots(nrows=r, ncols=c, figsize=(10, 10))

# Every column except the first and the last gets its own panel: a bar plot
# of Price against that column. Panels are filled top-to-bottom within a grid
# column before moving on to the next grid column.
plot_columns = diamonds.columns[1:-1]
for i, col in enumerate(plot_columns):
    grid_col, grid_row = divmod(i, r)
    sns.barplot(x=diamonds[col], y=diamonds['Price'], ax=axes[grid_row, grid_col])

fig.tight_layout()
plt.show()

##### As seen above, the Report column has little influence on the price of the diamonds compared with the other columns.

In [None]:
# Subplot grid dimensions: r rows by c columns
r = 3
c = 2

fig, axes = plt.subplots(nrows=r, ncols=c, figsize=(10, 10))

# Every column except the first and the last gets its own panel: a histogram
# of that column with a KDE curve overlaid. Panels are filled top-to-bottom
# within a grid column before moving on to the next grid column.
plot_columns = diamonds.columns[1:-1]
for i, col in enumerate(plot_columns):
    grid_col, grid_row = divmod(i, r)
    sns.histplot(diamonds[col], ax=axes[grid_row, grid_col], kde=True)

fig.tight_layout()
plt.show()

### Data Transformation

In [None]:
# Transform categorical data into integer codes using a label encoder
from sklearn.preprocessing import LabelEncoder

# Columns holding categorical grades that the models need as numbers
categorical_columns = ['Cut', 'Color', 'Clarity', 'Polish', 'Symmetry']

# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original labels later (label_encoders[col].inverse_transform).
# Refitting a single shared encoder would discard every mapping but the last.
# NOTE(review): LabelEncoder numbers the labels in sorted order, which may not
# match the real quality ordering of the grades - consider an explicit
# ordinal mapping if that ordering matters for the linear models.
label_encoders = {}
for column in categorical_columns:
    encode = LabelEncoder()
    diamonds[column] = encode.fit_transform(diamonds[column])
    label_encoders[column] = encode

In [None]:
# Summary statistics for diamonds_copy as a whole (not only the Cut column)
diamonds_copy.describe()

In [None]:
# Subplot grid dimensions: r rows by c columns
r = 3
c = 2

fig, axes = plt.subplots(nrows=r, ncols=c, figsize=(10, 10))

# Every column except the first and the last gets its own panel: a box plot
# of Price grouped by that column. Panels are filled top-to-bottom within a
# grid column before moving on to the next grid column.
plot_columns = diamonds.columns[1:-1]
for i, col in enumerate(plot_columns):
    grid_col, grid_row = divmod(i, r)
    sns.boxplot(x=diamonds[col], y=diamonds['Price'], ax=axes[grid_row, grid_col])

fig.tight_layout()
plt.show()

In [None]:
# Correlation diagram
# Only numeric columns are correlated. Not every column is label-encoded
# above (e.g. 'Report', presumably still text - confirm against the data),
# and recent pandas versions raise on non-numeric columns in corr() instead
# of silently skipping them.
plt.figure(figsize=(10, 8))
numeric_diamonds = diamonds.select_dtypes(include='number')
g = sns.heatmap(numeric_diamonds.corr(), annot=True, cmap='PuRd')
plt.show()

### Split dataset into train and test set

In [None]:
# Selecting features and target data.
# Features: every column except the target 'Price' and the 'Report' column.
# Target: 'Price', selected with double brackets so `y` stays a one-column
# DataFrame rather than a Series.

x = diamonds.drop(['Price','Report'], axis=1)
y = diamonds[['Price']]

In [None]:
# Import libraries for train and test split
from sklearn.model_selection import train_test_split

# Split into train and test: 20% of the rows are held out for testing, and
# random_state is fixed so the same split is produced on every run
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=101)

### Build ML models

The ML algorithms used are:
* Linear Regression
* Ridge Regression
* Lasso Regression
* KNN 
* Decision Tree

In [None]:
#Import libraries for the models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#### Linear Regression model

In [None]:
# Initialize the model
linear = LinearRegression(fit_intercept=True)

# Fit the model
linear.fit(X_train, Y_train)

# Predict values
linear_pred = linear.predict(X_test)

# Performance metrics, in order: model name, MSE, MAE, RMSE, model.score().
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that argument is deprecated in scikit-learn 1.4 and removed in 1.6. For the
# single 'Price' target both forms give the same value.
linear_mse = mean_squared_error(Y_test, linear_pred)
linear_metrics = ['Linear', np.round(linear_mse, 2), np.round(mean_absolute_error(Y_test, linear_pred), 2),
                  np.round(np.sqrt(linear_mse), 2), np.round(linear.score(X_test, Y_test), 2)]

In [None]:
# Linear regression on the test set: predicted price against actual price.
# The red line plots the actual prices against themselves, so points lying on
# it are exact predictions.
plt.scatter(x=Y_test, y=linear_pred)
plt.plot(Y_test, Y_test, color='r')
plt.xlabel('Actual Diamond Price')
plt.ylabel('Predicted Diamond Price')
plt.show()

#### Ridge Regression model

In [None]:
# Initialize the model (default regularization strength)
ridge = Ridge()

# Fit the model
ridge.fit(X_train, Y_train)

# Predict the values
ridge_pred = ridge.predict(X_test)

# Performance metrics, in order: model name, MSE, MAE, RMSE, model.score().
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that argument is deprecated in scikit-learn 1.4 and removed in 1.6. For the
# single 'Price' target both forms give the same value.
ridge_mse = mean_squared_error(Y_test, ridge_pred)
ridge_metrics = ['Ridge', np.round(ridge_mse, 2), np.round(mean_absolute_error(Y_test, ridge_pred), 2),
                 np.round(np.sqrt(ridge_mse), 2), np.round(ridge.score(X_test, Y_test), 2)]

In [None]:
# Ridge regression on the test set: predicted price against actual price.
# The red line plots the actual prices against themselves, so points lying on
# it are exact predictions.
plt.scatter(x=Y_test, y=ridge_pred)
plt.plot(Y_test, Y_test, color='r')
plt.xlabel('Actual Diamond Price')
plt.ylabel('Predicted Diamond Price')
plt.show()

#### Lasso Regression model

In [None]:
# Initialize the model (default regularization strength)
lasso = Lasso()

# Fit the model
lasso.fit(X_train, Y_train)

# Predict the values
lasso_pred = lasso.predict(X_test)

# Performance metrics, in order: model name, MSE, MAE, RMSE, model.score().
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that argument is deprecated in scikit-learn 1.4 and removed in 1.6. For the
# single 'Price' target both forms give the same value.
lasso_mse = mean_squared_error(Y_test, lasso_pred)
lasso_metrics = ['Lasso', np.round(lasso_mse, 2), np.round(mean_absolute_error(Y_test, lasso_pred), 2),
                 np.round(np.sqrt(lasso_mse), 2), np.round(lasso.score(X_test, Y_test), 2)]

In [None]:
# Lasso regression on the test set: predicted price against actual price.
# The red line plots the actual prices against themselves, so points lying on
# it are exact predictions.
plt.scatter(x=Y_test, y=lasso_pred)
plt.plot(Y_test, Y_test, color='r')
plt.xlabel('Actual Diamond Price')
plt.ylabel('Predicted Diamond Price')
plt.show()

#### K Nearest Neighbor Regression

In [None]:
# Initialize the model: each prediction uses the 2 nearest training samples
knn = KNeighborsRegressor(n_neighbors=2)

# Fit the model
knn.fit(X_train, Y_train)

# Predict the values
knn_pred = knn.predict(X_test)

# Performance metrics, in order: model name, MSE, MAE, RMSE, model.score().
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that argument is deprecated in scikit-learn 1.4 and removed in 1.6. For the
# single 'Price' target both forms give the same value.
knn_mse = mean_squared_error(Y_test, knn_pred)
knn_metrics = ['KNN', np.round(knn_mse, 2), np.round(mean_absolute_error(Y_test, knn_pred), 2),
               np.round(np.sqrt(knn_mse), 2), np.round(knn.score(X_test, Y_test), 2)]

In [None]:
# KNN regression on the test set: predicted price against actual price.
# The red line plots the actual prices against themselves, so points lying on
# it are exact predictions.
plt.scatter(x=Y_test, y=knn_pred)
plt.plot(Y_test, Y_test, color='r')
plt.xlabel('Actual Diamond Price')
plt.ylabel('Predicted Diamond Price')
plt.show()

#### Decision tree model

In [None]:
# Initialize the model.
# NOTE(review): no random_state is passed, so results may differ slightly
# between runs - confirm whether reproducibility is needed here.
dt = DecisionTreeRegressor()

# Fit the model
dt.fit(X_train, Y_train)

# Predict the values
dt_pred = dt.predict(X_test)

# Performance metrics, in order: model name, MSE, MAE, RMSE, model.score().
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that argument is deprecated in scikit-learn 1.4 and removed in 1.6. For the
# single 'Price' target both forms give the same value.
dt_mse = mean_squared_error(Y_test, dt_pred)
dt_metrics = ['Decision Tree', np.round(dt_mse, 2), np.round(mean_absolute_error(Y_test, dt_pred), 2),
              np.round(np.sqrt(dt_mse), 2), np.round(dt.score(X_test, Y_test), 2)]

In [None]:
# Decision tree regression on the test set: predicted price against actual
# price. The red line plots the actual prices against themselves, so points
# lying on it are exact predictions.
plt.scatter(x=Y_test, y=dt_pred)
plt.plot(Y_test, Y_test, color='r')
plt.xlabel('Actual Diamond Price')
plt.ylabel('Predicted Diamond Price')
plt.show()

#### Comparing performance metrics

In [None]:
# One row per model. Each *_metrics list holds, in order: the model name, MSE,
# MAE, RMSE and the model's .score() on the test set.
# NOTE(review): the 'Mean Root Squared Error' column holds the RMSE values.
metric_columns = ['Model', 'Mean square error', 'Mean Absolute Error', 'Mean Root Squared Error', 'Score']
metric_rows = [linear_metrics, ridge_metrics, lasso_metrics, knn_metrics, dt_metrics]
metrics = pd.DataFrame(data=metric_rows, columns=metric_columns)

In [None]:
# Display the model comparison table
metrics

##### From the above comparison, we can conclude that the Decision Tree is the best model for predicting the price of diamonds, as it has the highest score and the lowest errors.