In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset and discard the "Unnamed: 0" column
# (presumably a row index written out by an earlier to_csv; verify against the file).
data = pd.read_csv("final_data.csv").drop(columns="Unnamed: 0")

In [3]:
# Preview the first five rows to sanity-check the load.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,5,2,2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,4,2,3,59.8,61.0,326,3.89,3.84,2.31
2,0.23,2,2,5,56.9,65.0,327,4.05,4.07,2.31
3,0.29,4,6,4,62.4,58.0,334,4.2,4.23,2.63
4,0.31,2,7,2,63.3,58.0,335,4.34,4.35,2.75


### Train-Test Split 

In [4]:
# Target is the price column; every remaining column is used as a feature.
y = data["price"].to_numpy()
x = data.drop(columns="price").to_numpy()
print(x,y)

[[0.23 5.   2.   ... 3.95 3.98 2.43]
 [0.21 4.   2.   ... 3.89 3.84 2.31]
 [0.23 2.   2.   ... 4.05 4.07 2.31]
 ...
 [0.7  3.   1.   ... 5.66 5.68 3.56]
 [0.86 4.   5.   ... 6.15 6.12 3.74]
 [0.75 5.   1.   ... 5.83 5.87 3.64]] [ 326  326  327 ... 2757 2757 2757]


In [5]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=0,
)

In [6]:
# Report the shape of each split, one per line.
for splitName, splitArray in (
    ("x_train", x_train),
    ("x_test", x_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{splitName} shape: {splitArray.shape}")

x_train shape: (36041, 9)
x_test shape: (17753, 9)
y_train shape: (36041,)
y_test shape: (17753,)


### Scaling

In [7]:
# Standardizes each feature to zero mean and unit variance once fitted.
scaler = StandardScaler()

In [8]:
# Learn the per-feature mean/std from the training split only, then apply
# that same scaling to both splits so no test statistics leak into training.
scaler.fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

### Model Building

#### Linear Regression 

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
# Ordinary least-squares baseline with default settings.
regLR = LinearRegression()

In [11]:
# Fit on the standardized training features.
regLR.fit(x_train,y_train)

In [12]:
# Predict prices for the standardized test split.
predictionLR = regLR.predict(x_test)

#### Polynomial Regression

In [13]:
from sklearn.preprocessing import PolynomialFeatures

In [14]:
# Feature expander producing all terms up to degree 2
# (bias column, original features, squares and pairwise products).
regPoly = PolynomialFeatures(degree = 2)

In [15]:
# Expand the standardized training features: the 9 input columns become 55
# (1 bias + 9 linear + 45 degree-2 terms).
xPoly = regPoly.fit_transform(x_train)

In [16]:
regLR.fit(xPoly,y_train)

In [17]:
predictionPoly = regLR.predict(regPoly.fit_transform(x_test))

#### Support Vector Regression

In [18]:
from sklearn.svm import SVR

In [19]:
# Support vector regressor with a linear kernel; all other hyperparameters are library defaults.
regSVR = SVR(kernel = "linear")

In [20]:
# Fit on the standardized training split.
# NOTE(review): kernel SVR training cost grows faster than linearly with the
# number of samples, so this cell can be slow on ~36k rows.
regSVR.fit(x_train, y_train)

In [21]:
# Predict prices for the standardized test split.
predictionSVR = regSVR.predict(x_test)

#### Decision Tree

In [22]:
from sklearn.tree import DecisionTreeRegressor

In [23]:
# Unconstrained regression tree; the fixed seed makes the fitted tree reproducible.
regDT = DecisionTreeRegressor(random_state=0)

In [24]:
# Fit on the standardized training split.
regDT.fit(x_train,y_train)

In [25]:
# Predict prices for the standardized test split.
predictionDT = regDT.predict(x_test)

#### Random Forest 

In [26]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
# Ensemble of 1000 trees with a fixed seed for reproducibility.
# n_jobs=-1 builds (and later evaluates) the trees on all available cores;
# each tree's seed is derived from random_state before the parallel build,
# so the fitted forest is the same as in a single-threaded run.
regRF = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)

In [28]:
# Fit on the standardized training split.
regRF.fit(x_train,y_train)

In [29]:
# Predict prices for the standardized test split.
predictionRF = regRF.predict(x_test)

### Evaluation 

In [30]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

In [31]:
# Display names and prediction arrays, kept in the same order so they can be paired up.
listForModels = [
    "Linear Regression",
    "Polynomial Regression",
    "Support Vector Regression",
    "Decision Tree",
    "Random Forest",
]
listForPredictions = [
    predictionLR,
    predictionPoly,
    predictionSVR,
    predictionDT,
    predictionRF,
]

def comparisonModels(listForModels,listForPredictions,y_test):
    """Print an evaluation report for each model's predictions.

    Names and predictions are paired positionally. For every pair the
    report shows the mean absolute error (1 decimal), the R-squared score
    (2 decimals) and the mean absolute percentage error as a percentage
    (1 decimal), followed by two blank lines.

    Args:
        listForModels: Display names of the models.
        listForPredictions: Predicted target values, one entry per model,
            in the same order as ``listForModels``.
        y_test: True target values the predictions are scored against.

    Returns:
        None. The report is written to standard output.
    """
    for label, predicted in zip(listForModels, listForPredictions):
        absError = mean_absolute_error(y_test, predicted)
        fitScore = r2_score(y_test, predicted)
        # The metric is a fraction; scale to percent before rounding.
        pctError = mean_absolute_percentage_error(y_test, predicted) * 100

        print(
            f"Model Name: {label}",
            f"Mean Absoulute Error: {round(absError, 1)}",
            f"R-Squared Score: {round(fitScore, 2)}",
            f"Mean Absoulute Percentage Error: {round(pctError, 1)}%",
            "\n",
            sep="\n",
        )

# Print the metric report for all five models against the held-out prices.
comparisonModels(listForModels,listForPredictions,y_test)

Model Name: Linear Regression
Mean Absoulute Error: 809.4
R-Squared Score: 0.91
Mean Absoulute Percentage Error: 43.6%


Model Name: Polynomial Regression
Mean Absoulute Error: 476.9
R-Squared Score: 0.81
Mean Absoulute Percentage Error: 19.8%


Model Name: Support Vector Regression
Mean Absoulute Error: 834.2
R-Squared Score: 0.86
Mean Absoulute Percentage Error: 31.0%


Model Name: Decision Tree
Mean Absoulute Error: 360.7
R-Squared Score: 0.97
Mean Absoulute Percentage Error: 8.6%


Model Name: Random Forest
Mean Absoulute Error: 267.9
R-Squared Score: 0.98
Mean Absoulute Percentage Error: 6.4%




Based on the evaluation metrics above, Random Forest is the best model: it has the lowest Mean Absolute Error (267.9), the lowest Mean Absolute Percentage Error (6.4%) and the highest R-Squared Score (0.98) of the five models compared.