In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import pickle
import joblib

from sklearn import linear_model
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import category_encoders as ce
from sklearn.compose import make_column_transformer
# Lift the column-count cap so wide DataFrames are displayed with every column visible.
pd.set_option('display.max_columns', None) 

In [2]:
# Load the diamonds test table from the working directory and preview it.
diamonds_test = pd.read_csv(filepath_or_buffer='diamonds_test.csv')
diamonds_test

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [3]:
# Desired column layout: identifier first, then the four string-valued
# features, then the numeric measurements.
column_order = [
    'id',
    'city', 'cut', 'color', 'clarity',
    'carat', 'depth', 'table',
    'x', 'y', 'z',
]
diamonds_test = diamonds_test.loc[:, column_order]
diamonds_test.head(n=5)

Unnamed: 0,id,city,cut,color,clarity,carat,depth,table,x,y,z
0,0,Amsterdam,Very Good,F,SI1,0.79,62.7,60.0,5.82,5.89,3.67
1,1,Surat,Ideal,J,VS1,1.2,61.0,57.0,6.81,6.89,4.18
2,2,Kimberly,Premium,H,SI1,1.57,62.2,61.0,7.38,7.32,4.57
3,3,Kimberly,Very Good,F,SI1,0.9,63.8,54.0,6.09,6.13,3.9
4,4,Amsterdam,Very Good,F,VS1,0.5,62.9,58.0,5.05,5.09,3.19


In [4]:
# Categorical (string-valued) features that are converted to integer codes.
cat_columns = ['city', 'cut', 'color', 'clarity']
cat_diamonds_test = diamonds_test[cat_columns]

# Keep one fitted encoder per column. Re-fitting a single shared encoder for
# every column discards all but the last mapping, which makes it impossible to
# inspect `.classes_`, call `.inverse_transform`, or apply the same codes to
# another dataset afterwards.
# NOTE(review): the encoders are fitted on this file alone. If a model is
# trained on a separately encoded dataset, confirm both use the same
# category -> code mapping.
label_encoders = {}
cat_lab_enc = {}

for column in cat_diamonds_test.columns:
    lab_enc = LabelEncoder()
    cat_lab_enc[column] = lab_enc.fit_transform(cat_diamonds_test[column])
    label_encoders[column] = lab_enc

# Reuse the source frame's index so a later column-wise concat with
# `diamonds_test` aligns row-for-row even if the index is not a default
# RangeIndex.
cat_lab_enc_df = pd.DataFrame(cat_lab_enc, index=diamonds_test.index)
cat_lab_enc_df

Unnamed: 0,city,cut,color,clarity
0,0,4,2,2
1,10,2,6,4
2,3,3,4,2
3,3,4,2,2
4,0,4,2,4
...,...,...,...,...
13480,0,2,1,2
13481,8,2,5,5
13482,11,2,2,4
13483,10,4,2,3


In [6]:
# Swap the original string columns for their integer-encoded counterparts:
# drop the four text columns and place the encoded ones in front of the rest.
diamonds_test = pd.concat(
    objs=[
        cat_lab_enc_df,
        diamonds_test.drop(['city', 'cut', 'color', 'clarity'], axis=1),
    ],
    axis=1,
)
# Move `id` back to the first position (pop removes it, insert re-adds it at 0).
diamonds_test.insert(loc=0, column='id', value=diamonds_test.pop('id'))

In [7]:
# Preview the frame after the categorical columns were replaced by integer codes.
diamonds_test

Unnamed: 0,id,city,cut,color,clarity,carat,depth,table,x,y,z
0,0,0,4,2,2,0.79,62.7,60.0,5.82,5.89,3.67
1,1,10,2,6,4,1.20,61.0,57.0,6.81,6.89,4.18
2,2,3,3,4,2,1.57,62.2,61.0,7.38,7.32,4.57
3,3,3,4,2,2,0.90,63.8,54.0,6.09,6.13,3.90
4,4,0,4,2,4,0.50,62.9,58.0,5.05,5.09,3.19
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0,2,1,2,0.57,61.9,56.0,5.35,5.32,3.30
13481,13481,8,2,5,5,0.71,62.2,55.0,5.71,5.73,3.56
13482,13482,11,2,2,4,0.70,61.6,55.0,5.75,5.71,3.53
13483,13483,10,4,2,3,0.70,58.8,57.0,5.85,5.89,3.45


In [8]:
from pathlib import Path

# Destination of the label-encoded table. `Path` accepts only path segments;
# the CSV options (`index=False`, `encoding`) belong to `to_csv` below. Passing
# `index=False` to `Path` was silently ignored on older Pythons, is deprecated
# since 3.12 and is scheduled to become an error.
# NOTE(review): this is the same file name that cell In [2] reads, so the raw
# data is overwritten and a second Run All starts from already-encoded values.
# Consider writing to a distinct file name if the raw CSV must be preserved.
filepath = Path('diamonds_test.csv')
# Create the target directory if needed (a no-op for the working directory).
filepath.parent.mkdir(parents=True, exist_ok=True)
diamonds_test.to_csv(filepath, index=False, encoding='utf-8')