## Library Imports

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeRegressor
import patsy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import statsmodels.api as sm
import statsmodels.formula.api as smf
%config InlineBackend.figure_format = 'png'

## Introduction
In this notebook we look at gemstones, specifically cubic zirconia stones.

## The first step: EDA


## Load Data


In [None]:
# Load the cubic zirconia dataset; the first CSV column becomes the row index.
df = pd.read_csv(filepath_or_buffer='cubic_zirconia.csv', index_col=0)

## Data Cleaning

In [None]:
# Preview the first five rows (DataFrame.head defaults to n=5).
df.head()

In [None]:
# Print the dataset dimensions as a (rows, columns) tuple.
print("DataSet dimension     :",df.shape)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the columns describe() covers by default.
df.describe()

In [None]:
# Column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Show ten randomly chosen rows; no random_state is passed, so the rows differ on every run.
df.sample(10)

In [None]:
# Remove the 'depth' column from df in place.
df.drop(columns=['depth'], inplace=True)

In [None]:
# Remove the 'table' column from df in place.
df.drop(columns=['table'], inplace=True)

In [None]:
# Give the columns capitalised, descriptive names (x/y/z become Length/Width/Height).
df = df.rename(columns=dict(
    carat='Carat', cut='Cut', color='Color', clarity='Clarity',
    price='Price', x='Length', y='Width', z='Height',
))

In [None]:
# Preview the first five rows to confirm the new column names.
df.head()

## Note that five columns contain outliers

In [None]:
# Box plot of Price before any outlier treatment.
ax = sns.boxplot(df['Price'])
ax.set_title("Price before treating Outlier");

In [None]:
# Box plot of Height before any outlier treatment.
ax = sns.boxplot(df['Height'])
ax.set_title("Height before treating Outlier");

In [None]:
# Box plot of Width before any outlier treatment.
ax = sns.boxplot(df['Width'])
ax.set_title("Width before treating Outlier");

In [None]:
# Box plot of Length before any outlier treatment.
ax = sns.boxplot(df['Length'])
ax.set_title("Length before treating Outlier");

In [None]:
# Box plot of Carat before any outlier treatment.
ax = sns.boxplot(df['Carat'])
ax.set_title("Carat before treating Outlier");

In [None]:
# Treat zeros as missing values by turning every 0 into NaN.
# NOTE(review): the replacement applies to every column, not only the dimension columns.
df = df.replace(to_replace=0, value=np.nan)

In [None]:
# Non-null counts and dtypes after the zeros were replaced with NaN.
df.info()

In [None]:
# Count the missing values per column now that zeros are NaN.
df.isna().sum()

In [None]:
# Show only the column names.
df.columns

In [None]:
# Fill the missing Length values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Length'] is chained assignment, which may act on a temporary copy and
# leave df unchanged (always the case under pandas copy-on-write).
df['Length'] = df['Length'].fillna(df['Length'].mean())

In [None]:
# Fill the missing Width values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Width'] is chained assignment, which may act on a temporary copy and
# leave df unchanged (always the case under pandas copy-on-write).
df['Width'] = df['Width'].fillna(df['Width'].mean())

In [None]:
# Fill the missing Height values with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['Height'] is chained assignment, which may act on a temporary copy and
# leave df unchanged (always the case under pandas copy-on-write).
df['Height'] = df['Height'].fillna(df['Height'].mean())

In [None]:
# Re-count the missing values per column after the mean imputation.
df.isna().sum()

In [None]:
# Non-null counts and dtypes after the mean imputation.
df.info()

In [None]:
# Summary statistics after the mean imputation.
df.describe()

## Treating the outliers

In [None]:
# Drop the rows whose Height is at or outside the bounds (<= 2 or >= 5).
height_outliers = df.index[(df['Height'] <= 2) | (df['Height'] >= 5)]
df.drop(index=height_outliers, inplace=True)

In [None]:
# Box plot of Height once the out-of-range rows have been dropped.
ax = sns.boxplot(df["Height"])
ax.set_title("Height after treating Outlier");

In [None]:
# Drop the rows whose Width is at or outside the bounds (<= 2 or >= 7).
width_outliers = df.index[(df['Width'] <= 2) | (df['Width'] >= 7)]
df.drop(index=width_outliers, inplace=True)

In [None]:
# Box plot of Width once the out-of-range rows have been dropped.
ax = sns.boxplot(df['Width'])
ax.set_title("Width after treating Outlier");

In [None]:
# Drop the rows whose Length is at or outside the bounds (<= 4 or >= 7).
length_outliers = df.index[(df['Length'] <= 4) | (df['Length'] >= 7)]
df.drop(index=length_outliers, inplace=True)

In [None]:
# Box plot of Length once the out-of-range rows have been dropped.
ax = sns.boxplot(df['Length'])
ax.set_title("Length after treating Outlier");

In [None]:
# Drop the rows whose Carat is 1.5 or more.
carat_outliers = df.index[df['Carat'] >= 1.5]
df.drop(index=carat_outliers, inplace=True)

In [None]:
# Box plot of Carat once the heavy stones have been dropped.
ax = sns.boxplot(df['Carat'])
ax.set_title("Carat after treating Outlier");

In [None]:
# Drop the rows whose Price is 6500 or more.
price_outliers = df.index[df['Price'] >= 6500]
df.drop(index=price_outliers, inplace=True)

In [None]:
# Box plot of Price once the expensive stones have been dropped.
ax = sns.boxplot(df['Price'])
ax.set_title("Price after treating Outlier");

In [None]:
# Pairwise correlation between the numeric columns.
# Cut, Color and Clarity still hold strings at this point, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, so select the numeric columns first.
df.select_dtypes(include='number').corr()

In [None]:
# Scatter-plot matrix of the pairwise relationships between the variables.
sns.pairplot(data=df);

In [None]:
# Annotated heat map of the correlation matrix. Only numeric columns are used:
# Cut, Color and Clarity still hold strings here and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

## Solve the Questions:

## Q1: Which stone color is the best-selling?
 

In [None]:
# Number of stones per color, most frequent first.
DF = df.loc[:, 'Color'].value_counts()
DF

In [None]:
# The distinct color labels, ordered by how often they occur.
DF = df.loc[:, "Color"].value_counts()
DF.index

In [None]:
# Pie chart of the share of stones per color.
color_counts = df['Color'].value_counts()
slice_colors = ['#778899', '#D3D3D3', '#FFDEAD', '#FFE4E1', '#FFEFD5', '#BC8F8F', '#C0C0C0']
# NOTE(review): seven offsets are hard-coded, so this assumes exactly seven
# distinct colors remain after cleaning; the last (least frequent) slice is pulled out further.
slice_offsets = [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.2]

plt.figure(figsize=[8, 8])
plt.pie(
    color_counts,
    labels=color_counts.index,
    startangle=90,
    colors=slice_colors,
    explode=slice_offsets,
    shadow=True,
    autopct='%1.1f%%',
);
plt.title("Color best-selling stone", fontsize=14, weight='bold');
# NOTE(review): other cells save to this same file name, so it gets overwritten.
plt.savefig('SDAIA.png')

## Q2: Which stone clarity is the best-selling?



In [None]:
# Number of stones per clarity grade, most frequent first.
DF = df.loc[:, 'Clarity'].value_counts()
DF

In [None]:
# The distinct clarity grades, ordered by how often they occur.
DF = df.loc[:, "Clarity"].value_counts()
DF.index

In [None]:
# Bar chart of the number of stones per clarity grade.
# The bar order comes from DF, the clarity value counts computed in the previous cell.
plt.figure(figsize=[7, 7])
sns.countplot(x="Clarity", data=df, order=DF.index, palette="Set2");
plt.title('stone clarity most demanding ', fontsize=14, weight='bold', color='black');
# NOTE(review): other cells save to this same file name, so it gets overwritten.
plt.savefig('SDAIA.png')

## Q3: Does the quality of the stone affect its price?


In [None]:
# Total price per cut quality, highest total first.
# NOTE(review): a sum grows with the number of stones in each cut group, so it
# mixes group size with price level; confirm that a total (not a mean) is intended.
quality_stone = (
    df.groupby("Cut")[["Price"]]
    .sum()
    .reset_index()
    .sort_values(by="Price", ascending=False)
)
quality_stone

In [None]:
# Line-and-marker chart of the total price per cut quality.
# The tick label sizes are configured first: rc settings are only guaranteed to
# apply to axes and ticks created afterwards, so setting them after the plot
# (as before) did not reliably affect this figure.
plt.rc('xtick', labelsize=13)
plt.rc('ytick', labelsize=13)

plt.figure(figsize=[10, 10])
sns.lineplot(data=quality_stone, x="Cut", y="Price", color="#778899")
# Mark each cut category on top of the line.
plt.scatter(quality_stone['Cut'], quality_stone['Price'], c='#778899');
plt.title("The effect of the quality of the stone on Price", fontsize=14, weight='bold');
plt.grid()
# NOTE(review): other cells save to this same file name, so it gets overwritten.
plt.savefig('SDAIA.png')

## Q4: What is the highest and lowest value of the stone based on its price?


In [None]:
# Report the heaviest and the lightest stone weight together with the price of
# stones of that weight. Previously the global max/min of Price was printed,
# which need not belong to the heaviest/lightest stone. Several stones can share
# the extreme weight, so the price shown is the mean over all of them.
max_carat = df['Carat'].max()
min_carat = df['Carat'].min()
max_carat_price = df.loc[df['Carat'] == max_carat, 'Price'].mean()
min_carat_price = df.loc[df['Carat'] == min_carat, 'Price'].mean()

print("The largest weight of cubic zirconia is", max_carat)
print("its Price is estimated at", max_carat_price)
print("\n")
print("The smallest weight of cubic zirconia is", min_carat)
print("its Price is estimated at", min_carat_price)

In [None]:
# Mean price for every distinct carat weight, with Carat kept as a regular column.
DF = df.groupby("Carat")[["Price"]].mean().reset_index()
DF

In [None]:
# Line chart of the mean price per carat weight (DF comes from the previous cell).
plt.figure(figsize=[10, 10])
sns.lineplot(x='Carat', y='Price', data=DF, color='#B22222');
plt.title("Relationship of the weight of the stone with its price", fontsize=14, weight='bold');
plt.grid()
# NOTE(review): other cells save to this same file name, so it gets overwritten.
plt.savefig('SDAIA.png')

## Feature Engineering 

In [None]:
# New feature: product of the three dimensions (Length * Width * Height).
df['the_size'] = df['Length'].mul(df['Width']).mul(df['Height'])
df['the_size']

In [None]:
# Preview the first five rows, now including the new 'the_size' column.
df.head()

In [None]:
# Length is now represented by 'the_size', so remove the column in place.
df.drop(columns=['Length'], inplace=True)

In [None]:
# Width is now represented by 'the_size', so remove the column in place.
df.drop(columns=['Width'], inplace=True)

In [None]:
# Height is now represented by 'the_size', so remove the column in place.
df.drop(columns=['Height'], inplace=True)

In [None]:
# Correlation between the numeric columns after feature engineering.
# Cut, Color and Clarity are still strings here and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0, so select the numeric columns first.
df.select_dtypes(include='number').corr()

## Feature Selection

In [None]:
# Preview the indicator (dummy) columns for Cut; the first category is dropped.
pd.get_dummies(df['Cut'], drop_first=True).head()

In [None]:
# Preview the indicator (dummy) columns for Color; the first category is dropped.
pd.get_dummies(df['Color'], drop_first=True).head()

In [None]:
# Preview the indicator (dummy) columns for Clarity; the first category is dropped.
pd.get_dummies(df['Clarity'], drop_first=True).head()

In [None]:
# Replace the categorical columns of df with indicator columns,
# dropping the first category of each to avoid redundant columns.
df = pd.get_dummies(data=df, drop_first=True)
df.head(n=3)

## Split Data

In [None]:
# Features are every column except the target, Price.
y = df['Price']
X = df.drop(columns='Price')

# Hold out 20% of the rows for testing, then take 25% of the remainder
# for validation (60% train / 20% validation / 20% test overall).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=43
)
X

## Apply ML Model 

We used four models: linear regression, ridge regression, polynomial regression, and a decision tree regressor.

## Linear Regression

In [None]:
# Fit an ordinary least-squares model on the training split only.
lm = LinearRegression()
lm.fit(X_train, y_train)

# Validation R^2: rows the model has not seen during fitting.
print(f'Linear Regression val R^2: {lm.score(X_val, y_val):.4f}')
print("\n")
# Test R^2 is taken from the same train-fitted model. The model was previously
# refitted on all of X, y (test rows included) before this score, which leaks
# the test set into training and inflates the reported value.
print(f'Linear Regression test R^2: {lm.score(X_test, y_test):.4f}')

## Comparison between actual and predicted values

In [None]:
y_haat=lm.predict(X_test) # linear-regression predictions for the test features
y_haat

In [None]:
y_test # actual target (Price) values of the test split

## Comparison between actual and predicted values (plot)

In [None]:
# Scatter the linear-regression predictions against the actual test prices.
plt.figure(figsize=[12, 6])
sns.set_style("white")
# A perfect prediction lies on y = x, so the red reference line uses the same
# limits on both axes, wide enough to span the actual and the predicted values.
# (It previously joined (min actual, min predicted) to (max actual, max predicted),
# which is not the identity line.)
diag_min = min(y_test.min(), y_haat.min())
diag_max = max(y_test.max(), y_haat.max())
x_plt_range = [diag_min, diag_max]
y_plt_range = [diag_min, diag_max]
plt.grid()
plt.scatter(y_test, y_haat, alpha=0.6, color='#DEB887', s=75)
plt.plot(x_plt_range, y_plt_range, c="r")
plt.title("ACTUAL AND PREDICT VALUE IN LINEAR REGRESSION MODEL ", fontsize=20, weight='bold')
plt.xlabel("ACTUAL ", fontsize=17, weight='bold')
plt.ylabel('PREDICT', fontsize=17, weight='bold');
# NOTE(review): other cells save to this same file name, so it gets overwritten.
plt.savefig('SDAIA.png')

## Ridge Regression

In [None]:
# Standardise the features: the scaler is fitted on the training split only,
# and the same scaling is then applied to the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_val_scaled, X_test_scaled = (
    scaler.transform(split.values) for split in (X_val, X_test)
)

# Ridge regression with scikit-learn's default settings, fitted on the scaled training data.
lm_reg = Ridge().fit(X_train_scaled, y_train)

ridge_val_r2 = lm_reg.score(X_val_scaled, y_val)
ridge_test_r2 = lm_reg.score(X_test_scaled, y_test)
print(f'Ridge Regression val R^2: {ridge_val_r2:.4f}')
print('\n')
print(f'Ridge Regression test R^2: {ridge_test_r2:.4f}')

## Comparison between actual and predicted values

In [None]:
# Ridge predictions for the test split. lm_reg was trained on standardised
# features, so it must be given the standardised test matrix; passing the raw
# X_test (as before) produces predictions on the wrong scale.
y_haaat = lm_reg.predict(X_test_scaled)
y_haaat

In [None]:
y_test # actual target (Price) values of the test split

## Comparison between actual and predicted values (plot)

In [None]:
# Scatter the ridge-regression predictions against the actual test prices.
plt.figure(figsize=[12, 6])
sns.set_style("white")
# A perfect prediction lies on y = x, so the red reference line uses the same
# limits on both axes, wide enough to span the actual and the predicted values.
# (It previously joined (min actual, min predicted) to (max actual, max predicted),
# which is not the identity line.)
diag_min = min(y_test.min(), y_haaat.min())
diag_max = max(y_test.max(), y_haaat.max())
x_plt_range = [diag_min, diag_max]
y_plt_range = [diag_min, diag_max]
plt.grid()
plt.scatter(y_test, y_haaat, alpha=0.6, color='#DEB887', s=75)
plt.plot(x_plt_range, y_plt_range, c="r")
plt.title("ACTUAL AND PREDICT VALUE IN RIDGE REGRESSION MODEL ", fontsize=20, weight='bold')
plt.xlabel("ACTUAL ", fontsize=17, weight='bold')
plt.ylabel('PREDICT', fontsize=17, weight='bold');
# NOTE(review): other cells save to this same file name, so it gets overwritten.
plt.savefig('SDAIA.png')

## Polynomial Regression

In [None]:
# Expand the features to all degree-2 polynomial terms; the expansion is
# fitted on the training split and reused for validation and test.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train.values)
X_val_poly, X_test_poly = (
    poly.transform(split.values) for split in (X_val, X_test)
)

# Linear regression on the expanded features = degree-2 polynomial regression.
lm_poly = LinearRegression().fit(X_train_poly, y_train)

poly_val_r2 = lm_poly.score(X_val_poly, y_val)
poly_test_r2 = lm_poly.score(X_test_poly, y_test)
print(f'Degree 2 polynomial regression val R^2: {poly_val_r2:.4f}')
print('\n')
print(f'polynomial regression test R^2: {poly_test_r2:.4f}')

## Comparison between actual and predicted values

In [None]:
y_ht=lm_poly.predict(X_test_poly) # polynomial-regression predictions for the expanded test features
y_ht

In [None]:
y_test # actual target (Price) values of the test split

## Comparison between actual and predicted values (plot)

In [None]:
# Scatter the polynomial-regression predictions against the actual test prices.
plt.figure(figsize=[12, 6])
sns.set_style("white")
# A perfect prediction lies on y = x, so the red reference line uses the same
# limits on both axes, wide enough to span the actual and the predicted values.
# (It previously joined (min actual, min predicted) to (max actual, max predicted),
# which is not the identity line.)
diag_min = min(y_test.min(), y_ht.min())
diag_max = max(y_test.max(), y_ht.max())
x_plt_range = [diag_min, diag_max]
y_plt_range = [diag_min, diag_max]
plt.grid()
plt.scatter(y_test, y_ht, alpha=0.6, color='#DEB887', s=75)
plt.plot(x_plt_range, y_plt_range, c="r")
plt.title("ACTUAL AND PREDICT VALUE IN POLYNOMIAL REGRESSION MODEL ", fontsize=20, weight='bold')
plt.xlabel("ACTUAL ", fontsize=17, weight='bold')
plt.ylabel('PREDICT', fontsize=17, weight='bold');
# NOTE(review): other cells save to this same file name, so it gets overwritten.
plt.savefig('SDAIA.png')

## Decision Tree Regressor

In [None]:
# Fit a decision tree regressor on the training split.
# random_state is fixed so that the fitted tree (and its scores) are reproducible.
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Score the tree itself. The scores were previously taken from `lm`, the
# linear-regression model, so the reported numbers were not the tree's.
print(f'Decision tree val R^2: {regressor.score(X_val, y_val):.4f}')
print('\n')
print(f'Decision tree test R^2: {regressor.score(X_test, y_test):.4f}')

### Comparison between actual and predicted values

In [None]:
y_hat=regressor.predict(X_test) # decision-tree predictions for the test features
y_hat

In [None]:
y_test # actual target (Price) values of the test split

### Comparison between actual and predicted values (plot)

In [None]:
# Scatter the decision-tree predictions against the actual test prices.
plt.figure(figsize=[12, 6])
sns.set_style("white")
# A perfect prediction lies on y = x, so the red reference line uses the same
# limits on both axes, wide enough to span the actual and the predicted values.
# (It previously joined (min actual, min predicted) to (max actual, max predicted),
# which is not the identity line.)
diag_min = min(y_test.min(), y_hat.min())
diag_max = max(y_test.max(), y_hat.max())
x_plt_range = [diag_min, diag_max]
y_plt_range = [diag_min, diag_max]
plt.grid()
plt.scatter(y_test, y_hat, alpha=0.6, color='#DEB887', s=75)
plt.plot(x_plt_range, y_plt_range, c="r")
plt.title("ACTUAL AND PREDICT VALUE IN DECISION TREE REGRESSOR MODEL ", fontsize=20, weight='bold')
plt.xlabel("ACTUAL ", fontsize=17, weight='bold')
plt.ylabel('PREDICT', fontsize=17, weight='bold');
# NOTE(review): other cells save to this same file name, so it gets overwritten.
plt.savefig('SDAIA.png')

## Best Model

In this project, we used four models and compared them. They gave different results; the scores of the best-performing model on our data set were:


R^2 validation = 0.9675

R^2 test = 0.9681
