In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Suppress scikit-learn ConvergenceWarning messages for every cell that runs after this one.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [None]:
# Load the dataset from "diamonds.csv" (path is relative to the working directory)
# and preview the first rows.
data = pd.read_csv("diamonds.csv")
data.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the CSV - confirm).
# errors='ignore' makes the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first rows again to check the result of the previous cell.
data.head()

In [None]:
# Print the column names, non-null counts and dtypes.
data.info()

In [None]:
# Number of missing (NaN) entries in each column.
data.isnull().sum()

In [None]:
# Default pandas plot of the DataFrame (all plottable columns on one axes).
data.plot()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Correlation heatmap of the numeric columns.
# At this point the frame still contains string-valued columns; calling .corr() on
# them raises in pandas >= 2.0, so restrict the computation to numeric dtypes.
plt.figure(figsize=(10,12))
heat=sns.heatmap(data.select_dtypes(include='number').corr(), annot=True,cmap='RdYlGn',square=True)

In [None]:
# Grid of pairwise relationships between the columns; the returned grid object is kept in `p`.
p=sns.pairplot(data)

In [None]:
# One horizontal box plot per numeric column, to eyeball outliers.
# Each entry is (column name, label used in the title and x-axis).
boxplot_columns = [
    ("table", "Table"),
    ("depth", "depth"),
    ("carat", "carat"),
    ("x", "x"),
    ("y", "y"),
    ("z", "z"),
    ("price", "price"),
]

for column, label in boxplot_columns:
    # A fresh figure per plot so every box plot gets the same size
    # (previously only the first one used figsize=(10, 6)).
    fig, ax = plt.subplots(figsize=(10, 6))
    # Pass the Series as `x=` explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(x=data[column], ax=ax)
    ax.set_title('{} Boxplot'.format(label))
    ax.set_xlabel(label)
    plt.show()



In [None]:
# Summary statistics again; the min row is what the following note about x, y and z refers to.
data.describe()

The minimum values of x, y and z are 0, which is not physically possible: according to the data description these columns are the length, width and depth of a diamond.

In [None]:
# Report how many rows hold an exact zero in each dimension-related column.
for column in ("x", "y", "z", "depth"):
    zero_rows = (data[column] == 0).sum()
    print("Number of rows with {} == 0: {} ".format(column, zero_rows))

A zero in these columns means the measurement is missing, so we replace the zeros with NaN. Also note that the depth column does not contain a single zero, even though depth is calculated from these three dimensions only.

In [None]:
# Treat zeros in x, y and z as missing values.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data[['x','y','z']] = data[['x','y','z']].replace(0,np.nan)

In [None]:
# Missing-value counts per column after the zero -> NaN replacement.
data.isnull().sum()


Since there are only a few missing values, we can simply drop the affected rows.

In [None]:
# Keep only the rows that have no missing value in any column.
data = data.dropna()

In [None]:
# (rows, columns) remaining after dropping incomplete rows.
data.shape

In [None]:
# Distinct values present in the `cut` column.
data.cut.unique()

In [None]:
# Distinct values present in the `color` column.
data.color.unique()

In [None]:
# Distinct values present in the `clarity` column.
data.clarity.unique()

In [None]:
# Replace every object-dtype (string) column with integer codes.
#
# Two fixes compared with the previous version:
#   * `np.object` was removed in NumPy 1.24; the builtin `object` is the same dtype.
#   * Codes were assigned by iterating over a `set`, whose order for strings changes
#     between interpreter sessions, so the encoding (and every downstream result)
#     was not reproducible. Sorting the labels makes the mapping deterministic.
# NOTE(review): the codes follow alphabetical order of the labels, not any quality
# ranking of the categories - confirm this is acceptable for the models below.
col=data.columns
for i in col:
    if data[i].dtype==object:
        # label -> integer code, in sorted label order
        d={label: code for code, label in enumerate(sorted(data[i].unique()))}
        data[i]=data[i].map(d)

In [None]:
# Preview the frame after the object columns were replaced by integer codes.
data.head()

In [None]:
# Annotated correlation heatmap over all columns of the encoded frame.
correlations = data.corr()
plt.figure(figsize=(8, 7))
heat = sns.heatmap(correlations, cmap='RdYlGn', annot=True)

Cut, color and clarity have little effect on price, whereas price is correlated with carat, x, y and z.

In [None]:
# Split into features / target, hold out 30% for testing, then standardise the features.
from sklearn.model_selection import train_test_split
x=data.drop(['price'],axis=1)   # predictors: every column except the target
y=data['price']                 # target
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.3,random_state=0)
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
# Learn mean / std from the training split only ...
X_train=sc.fit_transform(X_train)
# ... and apply those same statistics to the test split. Calling fit_transform here
# would re-fit the scaler on the test data, so train and test would be scaled
# differently and test-set statistics would leak into the evaluation.
X_test=sc.transform(X_test)

## Model Building

LINEAR REGRESSION 

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn import linear_model
import math

In [None]:
# Ordinary least squares: fit on the training split, evaluate on the held-out split.
linear_reg = linear_model.LinearRegression()
linear_reg.fit(X_train,Y_train)
linear_Y_pred = linear_reg.predict(X_test)


# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(linear_reg.score(X_test,Y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,linear_Y_pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,linear_Y_pred))))
R2 = r2_score(Y_test,linear_Y_pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of predictors. X_test contains feature columns only (the target was dropped
# before the split), so no column has to be subtracted.
p=X_test.shape[1]
adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))


RIDGE REGRESSION

In [None]:
# Ridge regression with default settings: fit on train, evaluate on the held-out split.
from sklearn.linear_model import Ridge
ridge_reg = Ridge()
ridge_reg.fit(X_train, Y_train)
ridge_Y_pred = ridge_reg.predict(X_test)
# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(ridge_reg.score(X_test,Y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,ridge_Y_pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,ridge_Y_pred))))
R2 = r2_score(Y_test,ridge_Y_pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of predictors. X_test contains feature columns only (the target was dropped
# before the split), so no column has to be subtracted.
p=X_test.shape[1]

adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))

LASSO REGRESSION

In [None]:
# Lasso regression with default settings: fit on train, evaluate on the held-out split.
lasso_reg = linear_model.Lasso()
lasso_reg.fit(X_train,Y_train)
lasso_y_pred = lasso_reg.predict(X_test)
# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(lasso_reg.score(X_test,Y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,lasso_y_pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,lasso_y_pred))))
R2 = r2_score(Y_test,lasso_y_pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of predictors. X_test contains feature columns only (the target was dropped
# before the split), so no column has to be subtracted.
p=X_test.shape[1]

adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))

ElasticNet Regression 

In [None]:
# Elastic net (alpha=0.1, equal L1/L2 mix): fit on train, evaluate on the held-out split.
elas_reg = linear_model.ElasticNet(alpha=0.1, l1_ratio=0.5)
elas_reg.fit(X_train,Y_train)
elas_y_pred = elas_reg.predict(X_test)

# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(elas_reg.score(X_test,Y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,elas_y_pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,elas_y_pred))))
R2 = r2_score(Y_test,elas_y_pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of predictors. X_test contains feature columns only (the target was dropped
# before the split), so no column has to be subtracted.
p=X_test.shape[1]

adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))

Principal Component Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression

In [None]:
# Principal component regression: standardise -> project onto 1 principal component
# -> linear regression, chained in a single pipeline.
pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
pcr.fit(X_train, Y_train)
pca = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline
pcr_y_pred = pcr.predict(X_test)

# The figure labelled "accuracy" is the pipeline's score() (R squared) as a percentage.
print("accuracy: "+ str(pcr.score(X_test,Y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,pcr_y_pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,pcr_y_pred))))
R2 = r2_score(Y_test,pcr_y_pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of input feature columns. X_test contains features only (the target was
# dropped before the split), so no column has to be subtracted.
p=X_test.shape[1]
adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))
print(f"PCR r-squared {pcr.score(X_test, Y_test):.3f}")

Partial Least Square Regression

In [None]:
# Re-create the train/test split from the original x and y (same test_size and
# random_state as before). This overwrites the standardised X_train / X_test, so
# every model fitted after this cell sees the unscaled feature values.
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.3,random_state=0)

In [None]:
# Partial least squares regression with a single component.
pls = PLSRegression(n_components=1)
pls.fit(X_train, Y_train)
pls_y_pred = pls.predict(X_test)
# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(pls.score(X_test,Y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,pls_y_pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,pls_y_pred))))
R2 = r2_score(Y_test,pls_y_pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of input feature columns. X_test contains features only (the target was
# dropped before the split), so no column has to be subtracted.
p=X_test.shape[1]
adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))
print(f"PLS r-squared {pls.score(X_test, Y_test):.3f}")

POISSON REGRESSION

RANDOM FOREST REGRESSOR

In [None]:
# Random forest with 15 trees and a fixed seed for reproducibility.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor( n_estimators = 15,random_state = 0)
rf.fit(X_train,Y_train)
rf_y_pred = rf.predict(X_test)
# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(rf.score(X_test,Y_test)*100) + "%")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,rf_y_pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,rf_y_pred))))
R2 = r2_score(Y_test,rf_y_pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of predictors. X_test contains feature columns only (the target was dropped
# before the split), so no column has to be subtracted.
p=X_test.shape[1]
adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))

In [None]:
# Additional imports used by the cells below (cross_val_score in particular).
from sklearn.ensemble import RandomForestRegressor
from sklearn import cluster
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score, silhouette_samples, silhouette_score
import warnings
# NOTE(review): this silences *all* warnings from here on, including deprecation
# notices from pandas / scikit-learn - consider narrowing it to a specific category.
warnings.filterwarnings("ignore")

DECISION TREE REGRESSOR

In [None]:
# Decision tree regressor: hold-out metrics plus 8-fold cross-validation on the training split.
from sklearn.tree import DecisionTreeRegressor
# Fixed seed so the fitted tree and every reported metric are reproducible across runs.
dtree = DecisionTreeRegressor(random_state=0)

dtree.fit(X_train, Y_train)


y_pred = dtree.predict(X_test)

# One score per fold (the estimator's default score, R squared); printed below as "Accuracy".
accuracy_dtree = cross_val_score(estimator = dtree, X = X_train, y = Y_train, cv = 8, verbose = 1)
r2_dtree = r2_score(Y_test, y_pred)
mse_dtree = mean_squared_error(Y_test, y_pred)

print(f'Accuracy is {accuracy_dtree}')
print(f'r2_Score is {r2_dtree}')
print(f'Root Mean Squared error is {math.sqrt(mse_dtree)}')
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,y_pred)))

KNN

In [None]:
# 2-nearest-neighbours regressor: hold-out metrics plus 8-fold cross-validation
# on the training split.
from sklearn.neighbors import KNeighborsRegressor

kn = KNeighborsRegressor(n_neighbors=2)
kn.fit(X_train, Y_train)
y_pred = kn.predict(X_test)

# One score per fold; reported below under the "Accuracy" label.
accuracy_kn = cross_val_score(kn, X_train, Y_train, cv=8, verbose=1)

# Hold-out metrics.
mse_kn = mean_squared_error(Y_test, y_pred)
r2_kn = r2_score(Y_test, y_pred)
rmse_kn = math.sqrt(mse_kn)
mae_kn = mean_absolute_error(Y_test, y_pred)

print(f'Accuracy is {accuracy_kn}')
print(f'r2_Score is {r2_kn}')
print(f'Root Mean Squared error is {rmse_kn}')
print("Mean absolute error: {}".format(mae_kn))

GradientBoostingRegressor

In [None]:
# Gradient boosting regressor: fit on train, evaluate on the held-out split.
from sklearn.ensemble import GradientBoostingRegressor
# Fixed seed so the reported metrics are reproducible across runs.
model= GradientBoostingRegressor(random_state=0)
model.fit(X_train, Y_train)
pred=model.predict(X_test)
# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(model.score(X_test,Y_test)*100) + " %")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,pred))))
R2 = r2_score(Y_test,pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of predictors. X_test contains feature columns only (the target was dropped
# before the split), so no column has to be subtracted.
p=X_test.shape[1]
adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))
# Report this model's own score (this line previously printed the PLS model's score).
print(f"Gradient Boosting r-squared {model.score(X_test, Y_test):.3f}")

XGBoost Regressor

In [None]:
# XGBoost regressor with default settings: fit on train, evaluate on the held-out split.
import xgboost as xgb
model=xgb.XGBRegressor()
model.fit(X_train, Y_train)
pred=model.predict(X_test)
# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(model.score(X_test,Y_test)*100) + " %")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,pred))))
R2 = r2_score(Y_test,pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of predictors. X_test contains feature columns only (the target was dropped
# before the split), so no column has to be subtracted.
p=X_test.shape[1]
adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))
# Report this model's own score (this line previously printed the PLS model's score).
print(f"XGBoost r-squared {model.score(X_test, Y_test):.3f}")

Bagging Regressor

In [None]:
# Bagging ensemble of decision trees: fit on train, evaluate on the held-out split.
from sklearn import tree
from sklearn.ensemble import BaggingRegressor
# random_state on the ensemble fixes the bootstrap sampling so results are reproducible.
model = BaggingRegressor(tree.DecisionTreeRegressor(random_state=1), random_state=0)
model.fit(X_train, Y_train)
pred=model.predict(X_test)
# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(model.score(X_test,Y_test)*100) + " %")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,pred))))
R2 = r2_score(Y_test,pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of predictors. X_test contains feature columns only (the target was dropped
# before the split), so no column has to be subtracted.
p=X_test.shape[1]
adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))
# Report this model's own score (this line previously printed the PLS model's score).
print(f"Bagging r-squared {model.score(X_test, Y_test):.3f}")

AdaBoost Regressor

In [None]:
# AdaBoost regressor: fit on train, evaluate on the held-out split.
from sklearn.ensemble import AdaBoostRegressor
# Fixed seed so the reported metrics are reproducible across runs.
model = AdaBoostRegressor(random_state=0)
model.fit(X_train, Y_train)
pred=model.predict(X_test)
# The figure labelled "accuracy" is the regressor's score() (R squared) as a percentage.
print("accuracy: "+ str(model.score(X_test,Y_test)*100) + " %")
print("Mean absolute error: {}".format(mean_absolute_error(Y_test,pred)))
print("Root Mean squared error: {}".format(math.sqrt(mean_squared_error(Y_test,pred))))
R2 = r2_score(Y_test,pred)
print('R Squared: {}'.format(R2))
n=X_test.shape[0]  # number of test observations
# Number of predictors. X_test contains feature columns only (the target was dropped
# before the split), so no column has to be subtracted.
p=X_test.shape[1]
adj_rsquared = 1 - (1 - R2) * ((n - 1)/(n-p-1))
print('Adjusted R Squared: {}'.format(adj_rsquared))
# Report this model's own score (this line previously printed the PLS model's score).
print(f"AdaBoost r-squared {model.score(X_test, Y_test):.3f}")

<table>
<tr><th>S.No.</th><th>Algorithms</th><th>Accuracy (%)</th></tr>
<tr><td>1</td><td>Linear Regression</td><td>76.738</td></tr>
<tr><td>2</td><td>Ridge Regression</td><td>77.199</td></tr>
<tr><td>3</td><td>Lasso Regression</td><td>81.643</td></tr>
<tr><td>4</td><td>ElasticNet Regression</td><td>83.713</td></tr>
<tr><td>5</td><td>Principal Component Regression</td><td>79.122</td></tr>
<tr><td>6</td><td>Partial Least Square Regression</td><td>79.568</td></tr>
<tr><td>7</td><td>Random Forest Regressor</td><td>96.976</td></tr>
<tr><td>8</td><td>Decision Tree Regressor</td><td>96.821</td></tr>
<tr><td>9</td><td>K-nearest neighbor Regressor</td><td>94.745</td></tr>
<tr><td>10</td><td>Gradient Boosting Regressor</td><td>95.643</td></tr>
<tr><td>11</td><td>XgBoost Regressor</td><td>96.866</td></tr>
<tr><td>12</td><td>Bagging Regressor</td><td>96.760</td></tr>
<tr><td>13</td><td>AdaBoost Regressor</td><td>89.200</td></tr>
</table>
   

Lightgbm

SUPPORT VECTOR REGRESSOR

X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.3,random_state=0)

In [None]:
# Stand-alone bar chart demo on random data (not connected to the diamonds analysis).
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('_mpl-gallery')

# make data:
# The assignment to x was left empty (a SyntaxError). Eight bar positions centred on
# 1..8 match the axis limits and ticks configured below.
# The arrays are named bar_x / bar_y so they do not overwrite the feature matrix `x`
# and target `y` that the modelling cells rely on.
np.random.seed(3)  # fixed seed so the figure is reproducible
bar_x = 0.5 + np.arange(8)
bar_y = np.random.uniform(2, 7, len(bar_x))

# plot
fig, ax = plt.subplots()

ax.bar(bar_x, bar_y, width=1, edgecolor="white", linewidth=0.7)

ax.set(xlim=(0, 8), xticks=np.arange(1, 8),
       ylim=(0, 8), yticks=np.arange(1, 8))

plt.show()