### Imports ### 

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

### Loading Datasets ###

In [2]:
# Read the train and test splits; the first CSV column is used as the row index.
diamonds_train, diamonds_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../dataset/diamonds_train.csv', '../dataset/diamonds_test.csv')
)

### Feature Engineering ###

In [3]:
# Columns removed once the derived feature has been computed.
columns = ['x', 'y', 'z', 'city']

# 'area' is the product of the three dimension columns x, y and z. It is
# appended as the last column, after which the raw dimensions and 'city' are
# dropped. The same steps are applied to both splits.
diamonds_train = (
    diamonds_train
    .assign(area=lambda df: df['x'] * df['y'] * df['z'])
    .drop(columns=columns)
)
diamonds_test = (
    diamonds_test
    .assign(area=lambda df: df['x'] * df['y'] * df['z'])
    .drop(columns=columns)
)

### Encoding ###

In [4]:
# Ordinal encodings for the categorical columns. Each dictionary maps a
# category label to an integer code; codes increase in the order listed.
encod_cut = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
encod_color = {'Z': 1, 'Y': 2, 'X': 3, 'V': 4, 'U': 5, 'T': 6, 'S': 7, 'R': 8, 'Q': 9, 'P': 10, 'O': 11,
               'N': 12, 'M': 13, 'L': 14, 'K': 15, 'J': 16, 'I': 17, 'H': 18, 'G': 19, 'F': 20, 'E': 21,
               'D': 22}
encod_clarity = {'I3': 1, 'I2': 2, 'I1': 3, 'SI2': 4, 'SI1': 5, 'VS2': 6, 'VS1': 7, 'VVS2': 8, 'VVS1': 9,
                 'IF': 10, 'FL': 11}

# City codes. Not applied in this cell. Every city has its own distinct code so
# that two different cities can never be encoded as the same value.
encod_city = {'Tel Aviv': 1, 'Antwerp': 2, 'Luxembourg': 3, 'Paris': 4, 'Las Vegas': 5, 'New York City': 6,
              'London': 7, 'Surat': 8, 'Madrid': 9, 'Kimberly': 10, 'Amsterdam': 11, 'Dubai': 12,
              'Zurich': 13}

# Bin edges and labels for an optional pd.cut discretisation of 'carat'.
# Not applied in this cell.
bins = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 5]
labels = ['0.5', '1', '1.5', '2', '2.5', '3', '3.5', '4', '5']

# Alternatives that were tried and are currently disabled: mapping 'city' with
# encod_city, binning 'carat' with pd.cut(bins=bins, labels=labels), and
# one-hot / label encoding of 'city'.

# Apply the same mappings to both splits so train and test share one encoding.
# Series.map yields NaN for any label that is missing from the dictionary.
ordinal_encodings = {'cut': encod_cut, 'color': encod_color, 'clarity': encod_clarity}
for frame in (diamonds_train, diamonds_test):
    for column, mapping in ordinal_encodings.items():
        frame[column] = frame[column].map(mapping)

### Features + Target ###

In [5]:
# Model inputs, in the column order used from here on ('city' is left out).
train_features = diamonds_train.loc[
    :, ['carat', 'color', 'cut', 'clarity', 'depth', 'table', 'area']
]

# Target column.
y = diamonds_train.loc[:, 'price']

# Quick shape check of the feature matrix and the target.
print(train_features.shape, y.shape)

(40455, 7) (40455,)


### Scaling ###

In [6]:
# Scaler used for the features. StandardScaler() and MinMaxScaler() are
# imported as drop-in alternatives.
scaler = RobustScaler()

# Scaling Diamonds Train
# The scaler statistics are learned from the training features only.
scaled_train = scaler.fit_transform(train_features)
X = pd.DataFrame(
    scaled_train,
    columns=train_features.columns,
    index=train_features.index,  # keep the row index so X lines up with y
)

# Scaling Diamonds Test
# The test set is transformed with the statistics fitted on the training set
# (no re-fitting), so both splits are on the same scale. Columns are selected
# by name in the training order, which keeps each value under its own label
# regardless of the column order in the test frame.
scaled_test = scaler.transform(diamonds_test[train_features.columns])
scaled_df_test = pd.DataFrame(
    scaled_test,
    columns=train_features.columns,
    index=diamonds_test.index,  # keep the row index of the test frame
)

### Exporting CSV ###

In [7]:
# Converting to .csv
# Write the scaled training features, the target, and the scaled test
# features. The test file holds the scaled frame so that it is expressed in the
# same feature space as X; the row index of diamonds_test is re-attached so the
# exported rows keep the index read from the source CSV.
X.to_csv('../data/x_004.csv')
y.to_csv('../data/y_004.csv')
scaled_df_test.set_index(diamonds_test.index).to_csv('../data/x_test_004.csv')