In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import joblib


In [3]:
# Seed NumPy's global random number generator
np.random.seed(123)

# Read the two input files in order: the cleaned training set, then the test set.
# The test file does not include true outcomes (obviously).
trn, X_tst = (
    pd.read_csv(csv_name) for csv_name in ('cleaned_train.csv', 'CW1_test.csv')
)

## Data transformation

In [4]:
# Print the column names, non-null counts and dtypes of the test frame before transforming it
X_tst.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 30 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float64
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float64
 5   table    1000 non-null   float64
 6   price    1000 non-null   int64  
 7   x        1000 non-null   float64
 8   y        1000 non-null   float64
 9   z        1000 non-null   float64
 10  a1       1000 non-null   float64
 11  a2       1000 non-null   float64
 12  a3       1000 non-null   float64
 13  a4       1000 non-null   float64
 14  a5       1000 non-null   float64
 15  b1       1000 non-null   float64
 16  b2       1000 non-null   float64
 17  b3       1000 non-null   float64
 18  b4       1000 non-null   float64
 19  b5       1000 non-null   float64
 20  a6       1000 non-null   float64
 21  a7       1000 n

In [None]:
# Count the missing values in each column of the test frame
X_tst.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
a1         0
a2         0
a3         0
a4         0
a5         0
b1         0
b2         0
b3         0
b4         0
b5         0
a6         0
a7         0
a8         0
a9         0
a10        0
b6         0
b7         0
b8         0
b9         0
b10        0
dtype: int64

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the ordered `cut` grades as integers following the category list below
# (Fair -> 0, Good -> 1, Very Good -> 2, Premium -> 3, Ideal -> 4).
#
# The guard makes the cell safe to re-run: once `cut` is numeric it is left alone.
# A numeric-dtype test is used instead of `dtype == 'object'` because text columns are
# not always stored as `object` (pandas also has dedicated string dtypes); with the old
# comparison such a column would silently skip encoding and keep its raw labels.
if not pd.api.types.is_numeric_dtype(X_tst['cut']):
    ord_encoder = OrdinalEncoder(
        categories=[['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']], dtype=int
    )

    # The category order is fixed explicitly above, so fitting on the test frame does
    # not influence the label -> integer mapping.
    X_tst[['cut']] = ord_encoder.fit_transform(X_tst[['cut']])

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Run only while both raw categorical columns are still in the frame
if {'color', 'clarity'}.issubset(X_tst.columns):
    # Categorical columns to expand into indicator columns
    One_hot_cols = ['color', 'clarity']

    # Dense integer indicators, with the first level of each column dropped
    one_hot = OneHotEncoder(
        drop='first',
        handle_unknown='ignore',
        sparse_output=False,
        dtype=int,
    )

    # Fit the encoder on the test frame's categorical columns and encode them.
    # NOTE(review): the encoder is fitted on X_tst itself; if a category seen during
    # training is absent here, the resulting columns would differ from the training
    # features -- verify against the training notebook.
    encoded_values = one_hot.fit_transform(X_tst[One_hot_cols])

    # Wrap the encoded array in a frame aligned to X_tst's index, with generated names
    one_hot_train = pd.DataFrame(
        encoded_values,
        index=X_tst.index,
        columns=one_hot.get_feature_names_out(One_hot_cols),
    )

    # Replace the raw categorical columns with their encoded counterparts
    X_tst = pd.concat([X_tst.drop(columns=One_hot_cols), one_hot_train], axis=1)


In [8]:
# Combine price and carat into a single price-per-carat feature.
# Guard on the derived column so the cell can be re-run: once `price` and `carat` have
# been dropped, recomputing the ratio would raise a KeyError. On a first run with a
# missing input column the KeyError still surfaces.
if 'price_per_carat' not in X_tst.columns:
    X_tst['price_per_carat'] = X_tst['price'] / X_tst['carat']

# Drop the now-redundant source columns (errors='ignore' makes this a no-op on re-runs)
X_tst.drop(columns=['price', 'carat'], inplace=True, errors='ignore')

In [9]:
# Replace the x, y and z columns with their product, stored as `xyz`.
# Guard on the derived column so the cell can be re-run: once x, y and z have been
# dropped, recomputing the product would raise a KeyError. On a first run with a
# missing input column the KeyError still surfaces.
if 'xyz' not in X_tst.columns:
    X_tst['xyz'] = X_tst['x'] * X_tst['y'] * X_tst['z']

# Drop the source columns (errors='ignore' makes this a no-op on re-runs)
X_tst.drop(columns=['x', 'y', 'z'], inplace=True, errors='ignore')

In [10]:
# Preview the first rows of the transformed test frame
X_tst.head()

Unnamed: 0,cut,depth,table,a1,a2,a3,a4,a5,b1,b2,...,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,price_per_carat,xyz
0,4,60.8,56.0,0.514454,0.043084,0.384089,0.386384,0.255561,0.800736,0.619799,...,0,0,0,0,0,0,0,1,2176.923077,65.205336
1,3,62.1,59.0,0.659248,0.910605,0.413347,0.515342,0.218166,0.429289,0.721886,...,0,0,0,0,1,0,0,0,5459.803922,163.273132
2,4,61.6,55.0,0.643283,0.8844,0.167175,0.892999,0.17448,0.231705,0.695765,...,0,0,1,0,0,0,0,0,3729.72973,121.8087
3,4,62.3,56.0,0.031548,0.767804,0.323843,0.891769,0.940242,0.691946,0.121764,...,0,0,0,0,0,0,0,1,3870.0,81.22622
4,1,63.8,58.0,0.907421,0.850364,0.205829,0.469989,0.671413,0.022987,0.050909,...,0,0,1,0,0,0,0,0,5544.230769,169.24554


In [13]:
# Load the persisted model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load model files that come from a trusted source.
# NOTE(review): the file name suggests an XGBoost model, so the xgboost package is
# presumably required in this environment -- confirm.
model = joblib.load("final_xgboost_model2.pkl") 

In [14]:
# Test set predictions
# NOTE(review): assumes X_tst now has the same feature columns, in the same order, as
# the data the model was trained on -- verify against the training notebook.
# NOTE(review): the `_lm` suffix looks like a leftover from a linear-model version; the
# loaded file is named as an XGBoost model.
yhat_lm = model.predict(X_tst)

In [None]:
# Build the submission: a one-column frame named `yhat` holding the predictions,
# written to CSV without the row index
submission_path = 'CW1_submission_k23086553.csv'
out = pd.DataFrame(dict(yhat=yhat_lm))
out.to_csv(submission_path, index=False)
