# Diamonds Price Prediction 

#### Steps involved in Data Preprocessing

###### 1. Importing libraries
###### 2. Loading the dataset
###### 3. Finding Missing Data
###### 4. Data Visualization
###### 5. Identifying and removing outliers
###### 6. Encoding Categorical Data
###### 7. Model Building


 #### 1) Importing Libraries

In [1]:
import pandas as pd
import numpy as np

# Modelling Algorithms :



# Regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor 
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor


# Modelling Helpers :
from sklearn.preprocessing import Normalizer , StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score


#preprocessing :
from sklearn.preprocessing import MinMaxScaler , StandardScaler, LabelEncoder
from sklearn.preprocessing import OrdinalEncoder


#evaluation metrics :

# Regression
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error 
from sklearn.metrics import r2_score

# Classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score 


# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns


#### 2) Loading Data

In [2]:
diamonds = pd.read_csv('diamonds.csv')
diamonds

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...,...
53935,53936,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,53937,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,53938,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,53939,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


#### 2.1) Features Description

    Carat : Carat weight of the Diamond.
    Cut : Describe cut quality of the diamond.

            Quality in increasing order Fair, Good, Very Good, Premium, Ideal .

    Color : Color of the Diamond.

            With D being the best and J the worst.

    Clarity : Diamond Clarity refers to the absence of the Inclusions and Blemishes.

            (In order from Best to Worst, FL = flawless, I3= level 3 inclusions) FL, IF, VVS1, VVS2, VS1, VS2, SI1, SI2, I1, I2, I3

    Depth : The Height of a Diamond, measured from the Culet to the table, divided by its average Girdle Diameter.
    Table : The Width of the Diamond's Table expressed as a Percentage of its Average Diameter.
    Price : the Price of the Diamond.
    X : Length of the Diamond in mm.
    Y : Width of the Diamond in mm.
    Z : Height of the Diamond in mm.

Qualitative Features (Categorical) : Cut, Color, Clarity.

Quantitative Features (Numerical) : Carat, Depth , Table , Price , X , Y, Z.
Price is the Target Variable.

In [3]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

img = mpimg.imread(r'C:\Users\Reyad\Desktop\diamonds.jpg')

plt.imshow(img)
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Reyad\\Desktop\\diamonds.jpg'

#### 2.2)  Drop the 'Unnamed: 0' column as we already have Index

In [None]:
diamonds.drop(['Unnamed: 0'] , axis=1 , inplace=True)

In [None]:
diamonds.info()

In [None]:
diamonds.head()

#### 3) Finding Missing Data

In [None]:
diamonds.isna().sum()

#### 4) Data Visualization

In [None]:
diamonds.hist(figsize=(20,20))

In [None]:
sns.pairplot(diamonds)

In [None]:
ax=sns.pairplot(diamonds, hue= "cut")

In [None]:
sns.jointplot(x='carat' , y='price' , data=diamonds , size=5)

In [None]:
sns.jointplot(x='depth' , y='price' , data=diamonds , size=5)

In [None]:
diamonds['depth'].skew()

In [None]:
diamonds['depth'].std()
diamonds['depth'].mean()

In [None]:
sns.jointplot(x='table' , y='price' , data=diamonds , size=5)

In [None]:
sns.jointplot(x='x' , y='price' , data=diamonds , size=5)

In [None]:
sns.jointplot(x='y' , y='price' , data=diamonds , size=5)

In [None]:
sns.jointplot(x='z' , y='price' , data=diamonds , size=5)

In [None]:
sns.catplot(x='cut',y='price' ,data=diamonds ,kind='bar', aspect=2.5 )

In [None]:
sns.catplot(x='color',y='price' ,data=diamonds ,kind='bar',legend =True, aspect=2.5 )

In [None]:
sns.catplot(x='clarity',y='price' ,data=diamonds ,kind='bar',legend=True, aspect=2.5 )

In [None]:
sns.heatmap(diamonds.corr(), annot=True, cmap='RdBu_r')
plt.show()

#### 5. Identifying and removing outliers

In [None]:
sns.boxplot(data=diamonds[['carat']], orient="h", palette="Set2")
plt.show()

In [None]:
sns.boxplot(data=diamonds[['table']], orient="h", palette="Set2")
plt.show()

In [None]:
sns.boxplot(data=diamonds[['depth']], orient="h", palette="Set2")
plt.show()

In [None]:
sns.boxplot(data=diamonds[['x']], orient="h", palette="Set2")
plt.show()

In [None]:
sns.boxplot(data=diamonds[['y']], orient="h", palette="Set2")
plt.show()

In [None]:
sns.boxplot(data=diamonds[['z']], orient="h", palette="Set2")
plt.show()

In [None]:
diamonds.loc[(diamonds['x']==0) | (diamonds['y']==0) | (diamonds['z']==0)]


#### Points to notice:

Min value of "x", "y", "z" are zero this indicates that there are faulty values in data that represents dimensionless or 2-dimensional diamonds. So we need to filter out those as it clearly faulty data points

In [None]:
len(diamonds.loc[(diamonds['x']==0) | (diamonds['y']==0) | (diamonds['z']==0)])

In [None]:
diamonds.shape

In [None]:
# Make copy to avoid changing original data
diamonds_data = diamonds.copy()

In [None]:
diamonds_data=diamonds_data[(diamonds_data[["x","y","z"]]!=0).all(axis=1)]

In [None]:
diamonds_data.shape

In [None]:
diamonds_data['volume'] = diamonds_data['x']*diamonds_data['y']*diamonds_data['z']

In [None]:
diamonds_data.drop(['x','y','z'], axis=1, inplace= True)
diamonds_data.head()

In [None]:
sns.boxplot(data=diamonds_data[['volume']],orient="h", palette="Set2")
plt.show()

In [None]:
diamonds['depth'].skew()

In [None]:
diamonds_data.describe()

In [None]:
diamonds_data.corr()

In [None]:
sns.heatmap(diamonds_data.corr(), annot=True, cmap='RdBu_r')
plt.show()

In [None]:
object_cols = [col for col in diamonds_data.columns if diamonds_data[col].dtype in ['object']]
object_cols

In [None]:
# Get list of categorical variables
#obj = (diamonds_data.dtypes =="object")
#object_cols = list(obj[obj].index)
#print("Categorical variables:")
#print(object_cols)

In [None]:
numerical_cols = [cname for cname in diamonds_data.columns if diamonds_data[cname].dtype in ['int64', 'float64']]
numerical_cols

In [None]:
#diamonds_data.boxplot(numerical_cols)

In [None]:
diamonds_data[diamonds_data['table'].isnull()]

In [None]:
numerical_cols.remove('price')

In [None]:
numerical_cols

In [None]:
Q1 = diamonds_data[numerical_cols].quantile(0.25)
Q3 = diamonds_data[numerical_cols].quantile(0.75)
IQR = Q3 - Q1
print(IQR)

In [None]:
#print(diamonds_data[numerical_cols] < (Q1 - 1.5 * IQR)) |(diamonds_data[numerical_cols] > (Q3 + 1.5 * IQR))

In [None]:
diamonds_data = diamonds_data[~((diamonds_data[numerical_cols] < (Q1 - 1.5 * IQR)) |(diamonds_data[numerical_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

In [None]:
diamonds_data.shape

In [None]:
sns.boxplot(data=diamonds_data[['depth']], orient="h", palette="Set2")
plt.show()

In [None]:
sns.boxplot(data=diamonds_data[['table']], orient="h", palette="Set2")
plt.show()

In [None]:
sns.boxplot(data=diamonds_data[['volume']], orient="h", palette="Set2")
plt.show()

In [None]:
sns.boxplot(data=diamonds_data[['depth']], orient="h", palette="Set2")
plt.show()

In [None]:
sns.boxplot(data=diamonds_data[['carat']], orient="h", palette="Set2")
plt.show()

In [None]:
diamonds_data.isnull().sum()

#### 6) Encoding Categorical Data

In [None]:
label_encoder = LabelEncoder()
for col in object_cols:
    diamonds_data[col] = label_encoder.fit_transform(diamonds_data[col])
diamonds_data.head()

In [None]:
diamonds_data.describe()

#### 7) Model Building

Steps involved in Model Building

    Setting up features and target
    Build a pipeline of standard scalar and model for Three different regressors.
    Fit all the models on training data
    Get mean of cross-validation on the training set for all the models for mean_absolute_error
    Pick the model with the best cross-validation score
    


In [None]:
X= diamonds_data.drop(["price"],axis =1)
y= diamonds_data["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20, random_state=0)

In [None]:
diamonds_data

In [None]:

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

In [None]:
pipeline_lnr=Pipeline([("scalar1",StandardScaler()),
                     ("lr_Regression",LinearRegression())])

pipeline_dtr=Pipeline([("scalar2",StandardScaler()),
                     ("dt_Regression",DecisionTreeRegressor())])

pipeline_rfr=Pipeline([("scalar3",StandardScaler()),
                     ("rf_Regression",RandomForestRegressor())])



In [None]:
pipelines = [pipeline_lnr, pipeline_dtr, pipeline_rfr]

In [None]:
results_mae = []
results_mse = []
results_cvs =[]
results_R2=[]
for pipe in pipelines:
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    score1 = mean_absolute_error(y_test, preds)
    score2=mean_squared_error(y_test,preds)
    # Multiply by -1 since sklearn calculates *negative* MAE
    scores3 =  -1* cross_val_score(pipe, X, y,cv=5)
    scores4=r2_score(y_test,preds)
    results_cvs.append(scores3)
    results_mae.append(score1)
    results_mse.append(score2)
    results_R2.append(scores4)
# Evaluate the model
#print(pipelines[pipe])
                                   


In [None]:

print('MAE:Liner Regression',results_mae[0] , ' \n MAE:Decision Tree Regression',results_mae[1],  '\n MAE:Random Forest',results_mae[2])
print('MSE:,Liner Regression',results_mse[0] , ' \n MAE:Decision Tree Regression',results_mse[1],  ' \n MAE:Random Forest',results_mse[2])
print('CVS:Liner Regression',results_cvs[0] , ' \n CSV:Decision Tree Regression',results_cvs[1], '\n MAE:Random Forest',results_cvs[2])
print('R2:Liner Regression',results_R2[0] ,' \n R2:Decision Tree Regression',results_R2[1] ,'\n R2:Random Forest',results_R2[2])

In [None]:
RFR=RandomForestRegressor(bootstrap= True,max_depth=5,max_features='auto',
 min_samples_leaf= 1,
 min_samples_split=2,
 n_estimators= 85)
RFR.fit(X_train,y_train) 
y_preds=RFR.predict(X_test)

In [None]:
print("MAE : ",mean_absolute_error(y_test,y_preds))
print("MSE : ",mean_squared_error(y_test,y_preds))

In [None]:
RFR_score = cross_val_score(RFR, X_train, y_train, cv = 5)
print("mean cross validation score: {:.2f} %".format(np.mean(RFR_score)))
print("score without cv: {}".format(RFR.score(X_train, y_train)))

# on the test or hold-out set
print(r2_score(y_test, RFR.predict(X_test)))

In [None]:
# Number of trees in random forest
n_estimator =[int(x) for x in np.linspace(start=100,stop=300,num=10,)]
print(n_estimator)# Number of features considered at every split
max_features=['auto','sqrt']
# Maximum number of level in tree
max_depth=[5,10]
# Minimum number of samples required to split anode
min_samples_split=[2,5]
# Minimum number of samples required at each leaf node
min_samples_leaf=[1,2]
# Method of selecting samples for training each tree
bootstrap=[True,False]

In [None]:
#Create the param. grid
param_grid={'n_estimators':n_estimator,
            'max_features':max_features,
            'max_depth':max_depth,
            'min_samples_split':min_samples_split,
            'min_samples_leaf':min_samples_leaf,
            'bootstrap':bootstrap
                               }
print(param_grid)

In [None]:
RFR=RandomForestRegressor()

In [None]:
RFCgridsearch=GridSearchCV(estimator=RFR,param_grid=param_grid,cv=5,verbose=2,n_jobs=4)
RFCgridsearch.fit(X_train,y_train)

In [None]:
RFCgridsearch.best_score_

In [None]:
RFCgridsearch.best_params_

In [None]:
RFCgridsearch.best_estimator_

In [None]:
print("r2_score : ",r2_score(y_test,RFCgridsearch.best_estimator_.predict(X_test)))

In [None]:
print("\n MAE : ",mean_absolute_error(y_test,RFCgridsearch.best_estimator_.predict(X_test)))

In [None]:
print("\n MSE : ",mean_squared_error(y_test,RFCgridsearch.best_estimator_.predict(X_test)))

In [None]:
print("\n Accuracy_score for Predictions : ",Accuracy_score(y_test,RFCgridsearch.best_estimator_.predict(X_test)))

In [None]:
print("\n Accuracy_score for Training : ",Accuracy_score(y_train,RFCgridsearch.best_estimator_.predict(X_train)))

In [None]:
print("\n Accuracy_score for Training : ",Accuracy_score(y_train,X_train))