In [2]:
pip install xgboost lightgbm catboost optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [3]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/PS-S3/Ep8/train.csv'
file_key_2 = 'Tabular-Playground-Series/PS-S3/Ep8/test.csv'
file_key_3 = 'Tabular-Playground-Series/PS-S3/Ep8/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

#########################
## Feature Engineering ##
#########################

def updating_labels(df):
    
    df['clarity_scaled'] = df['clarity'].apply(lambda x: 0 if x == "IF" else 1 if x == "VVS1" else 2 if x == "VVS2" else 3 if x == "VS1" else 4 if x == "VS2" else 5 if x == "SI1" else 6 if x == "SI2" else 7)
    df['clarity_scaled'] = df['clarity_scaled'].astype(np.int8)
    
    df['cut_scaled'] = df['cut'].apply(lambda x: 0 if x == 'Fair' else 1 if x == 'Good' else 2 if x == 'Very Good' else 3 if x == 'Premium' else 4)                          
    df['cut_scaled'] = df['cut_scaled'].astype(np.int8) 
    
    df['color_scaled'] = df['color'].apply(lambda x: 0 if x == 'J' else 1 if x == 'I' else 2 if x == 'H' else 3 if x == 'G' else 4 if x == 'F' else 5 if x == 'E' else 6)
    df['color_scaled'] = df['color_scaled'].astype(np.int8)
    
    df.drop(columns = ['clarity', 'cut', 'color'], axis = 1, inplace = True)
    
    return df

train = updating_labels(train)
test = updating_labels(test)

# Splitting Duplicates

In [40]:
##########################
## Splitting Duplicates ##
##########################

train_dup = train.copy()
test_dup = test.copy()

to_consider = ['carat', 'depth', 'table', 'x', 'y', 'z', 'clarity_scaled', 'cut_scaled', 'color_scaled']

duplicates = pd.merge(train, test, on = to_consider)
train_dup_ids = duplicates['id_x'].tolist()
test_dup_ids = duplicates['id_y'].tolist()

train_clean = train[~np.isin(train['id'], train_dup_ids)].reset_index(drop = True)
train_dup = train[np.isin(train['id'], train_dup_ids)].reset_index(drop = True)

test_clean = test[~np.isin(test['id'], test_dup_ids)].reset_index(drop = True)
test_dup = test[np.isin(test['id'], test_dup_ids)].reset_index(drop = True)

dup_pred_price = pd.DataFrame(train_dup.groupby(['carat',
                                                 'depth',
                                                 'table',
                                                 'x',
                                                 'y',
                                                 'z',
                                                 'clarity_scaled',
                                                 'cut_scaled',
                                                 'color_scaled'])['price'].mean()).reset_index()
test_dup = pd.merge(test_dup, dup_pred_price, on = ['carat',
                                                    'depth',
                                                    'table',
                                                    'x',
                                                    'y',
                                                    'z',
                                                    'clarity_scaled',
                                                    'cut_scaled',
                                                    'color_scaled'], how = 'left')
test_dup = test_dup[['id', 'price']]

In [42]:
test_dup

Unnamed: 0,id,carat,depth,table,x,y,z,clarity_scaled,cut_scaled,color_scaled,price
0,193573,0.35,62.3,56.0,4.51,4.54,2.82,4,4,6,868.0
1,193622,0.30,62.3,60.0,4.28,4.24,2.65,4,3,4,605.0
2,193674,0.41,62.5,56.0,4.75,4.78,2.97,2,4,6,1168.0
3,193888,0.30,61.9,55.0,4.32,4.34,2.68,2,4,5,760.0
4,193910,0.72,61.9,56.0,5.74,5.77,3.57,6,4,5,2360.0
...,...,...,...,...,...,...,...,...,...,...,...
1513,322342,0.30,61.9,55.0,4.28,4.32,2.66,1,4,3,661.0
1514,322399,0.34,61.7,56.0,4.48,4.52,2.78,4,4,3,565.0
1515,322530,0.42,61.6,56.0,4.78,4.81,2.96,5,4,3,715.0
1516,322539,1.02,62.0,57.0,6.45,6.48,4.01,6,4,5,4588.0


In [38]:
dup_pred_price.head()

Unnamed: 0,carat,depth,table,x,y,z,clarity_scaled,cut_scaled,color_scaled,price
0,0.23,60.9,59.0,3.96,4.01,2.43,1,2,5,530.0
1,0.23,61.0,59.0,3.95,3.98,2.42,2,2,4,505.0
2,0.23,61.1,58.0,3.95,3.98,2.42,1,2,5,505.0
3,0.23,61.3,59.0,3.94,3.99,2.42,2,2,5,438.0
4,0.23,61.9,59.0,3.93,3.96,2.44,1,2,5,472.0


In [39]:
dup_pred_price.shape

(1470, 10)

In [13]:
train_dup.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,clarity_scaled,cut_scaled,color_scaled
0,99,0.52,61.7,57.0,5.14,5.17,3.18,1699,5,4,6
1,285,0.52,62.5,57.0,5.15,5.17,3.23,1777,4,4,5
2,437,0.55,61.8,56.0,5.26,5.3,3.26,1704,4,4,6
3,439,0.71,61.3,57.0,5.74,5.77,3.53,2964,4,4,5
4,441,0.54,61.2,56.0,5.25,5.28,3.22,1845,4,4,5


In [35]:
train_dup.columns.tolist()

['id',
 'carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'price',
 'clarity_scaled',
 'cut_scaled',
 'color_scaled']

In [14]:
train_dup.shape

(1546, 11)

In [15]:
test_dup.head()

Unnamed: 0,id,carat,depth,table,x,y,z,clarity_scaled,cut_scaled,color_scaled
0,193573,0.35,62.3,56.0,4.51,4.54,2.82,4,4,6
1,193622,0.3,62.3,60.0,4.28,4.24,2.65,4,3,4
2,193674,0.41,62.5,56.0,4.75,4.78,2.97,2,4,6
3,193888,0.3,61.9,55.0,4.32,4.34,2.68,2,4,5
4,193910,0.72,61.9,56.0,5.74,5.77,3.57,6,4,5


In [18]:
test_dup.columns.tolist()[1:11]

['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'clarity_scaled',
 'cut_scaled',
 'color_scaled']

In [22]:
test_dup_new = pd.merge(test_dup, train_dup.drop(columns = ['id'], axis = 1), on = test_dup.columns.tolist()[1:11], how = 'left')
test_dup_new.head()

Unnamed: 0,id,carat,depth,table,x,y,z,clarity_scaled,cut_scaled,color_scaled,price
0,193573,0.35,62.3,56.0,4.51,4.54,2.82,4,4,6,868
1,193622,0.3,62.3,60.0,4.28,4.24,2.65,4,3,4,605
2,193674,0.41,62.5,56.0,4.75,4.78,2.97,2,4,6,1168
3,193888,0.3,61.9,55.0,4.32,4.34,2.68,2,4,5,760
4,193910,0.72,61.9,56.0,5.74,5.77,3.57,6,4,5,2360


In [23]:
test_dup_new['price'].describe()

count     1598.000000
mean      1726.209011
std       2019.935541
min        394.000000
25%        726.000000
50%        959.000000
75%       1821.000000
max      18524.000000
Name: price, dtype: float64

In [26]:
test_dup.head()

Unnamed: 0,id,carat,depth,table,x,y,z,clarity_scaled,cut_scaled,color_scaled
0,193573,0.35,62.3,56.0,4.51,4.54,2.82,4,4,6
1,193622,0.3,62.3,60.0,4.28,4.24,2.65,4,3,4
2,193674,0.41,62.5,56.0,4.75,4.78,2.97,2,4,6
3,193888,0.3,61.9,55.0,4.32,4.34,2.68,2,4,5
4,193910,0.72,61.9,56.0,5.74,5.77,3.57,6,4,5


In [28]:
test_dup.shape

(1518, 10)

In [27]:
test_dup.drop(columns = 'id', axis = 1).drop_duplicates().shape

(1470, 9)

In [29]:
train_dup.shape

(1546, 11)

In [30]:
train_dup.drop(columns = ['id', 'price'], axis = 1).drop_duplicates().shape

(1470, 9)

In [33]:
train_dup.head(20)

Unnamed: 0,id,carat,depth,table,x,y,z,price,clarity_scaled,cut_scaled,color_scaled
0,99,0.52,61.7,57.0,5.14,5.17,3.18,1699,5,4,6
1,285,0.52,62.5,57.0,5.15,5.17,3.23,1777,4,4,5
2,437,0.55,61.8,56.0,5.26,5.3,3.26,1704,4,4,6
3,439,0.71,61.3,57.0,5.74,5.77,3.53,2964,4,4,5
4,441,0.54,61.2,56.0,5.25,5.28,3.22,1845,4,4,5
5,462,0.3,62.3,56.0,4.32,4.29,2.68,624,3,4,3
6,871,0.38,62.4,54.0,4.61,4.64,2.89,834,5,4,5
7,889,0.51,61.8,55.0,5.15,5.18,3.19,1687,5,4,6
8,1012,0.41,61.0,56.0,4.79,4.82,2.93,1061,4,4,3
9,1090,0.51,61.6,56.0,5.14,5.17,3.18,1832,3,4,4


# Optuna

In [17]:
#########################
## Optuna Optimization ##
#########################

print('-----------------------------')
print(' (-: Optuna has started :-) ')
print('-----------------------------')

X = train_clean.drop(columns = ['id', 'price'], axis = 1)
Y = train_clean['price']

test_lgb = test_clean.drop(columns = 'id', axis = 1)

class Objective:

    def __init__(self, seed):
        # Hold this implementation specific arguments as the fields of the class.
        self.seed = seed

    def __call__(self, trial):
        
        ## Parameters to be evaluated
        param = dict(metric = 'rmse',
                     boosting_type = 'gbdt', 
                     n_estimators = trial.suggest_int('n_estimators', 300, 10000),
                     learning_rate = trial.suggest_float('learning_rate', 0.001, 1, log = True),
                     max_depth = trial.suggest_int('max_depth', 3, 12),
                     lambda_l1 = trial.suggest_float('lambda_l1', 0.01, 10.0, log = True),
                     lambda_l2 = trial.suggest_float('lambda_l2', 0.01, 10.0, log = True),
                     num_leaves = trial.suggest_int('num_leaves', 2, 100),
                     bagging_fraction = trial.suggest_float('bagging_fraction', 0.2, 0.9),
                     feature_fraction = trial.suggest_float('feature_fraction', 0.2, 0.9)
#                      device = 'gpu'
                    )

        scores = []
        
        skf = KFold(n_splits = 5, shuffle = True, random_state = self.seed)

        for train_idx, valid_idx in skf.split(X, Y):

            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train , Y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

            model = LGBMRegressor(**param).fit(X_train, Y_train)

            preds_valid = model.predict(X_valid)

            score = mean_squared_error(Y_valid, preds_valid, squared = False)
            scores.append(score)

        return np.mean(scores)
    
## Defining SEED and Trials
SEED = 42
N_TRIALS = 3

# Execute an optimization
study = optuna.create_study(direction = 'minimize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-01 00:13:13,535][0m A new study created in memory with name: no-name-ace4945a-7aa8-4e59-b19f-54d5fff49dfb[0m


-----------------------------
 (-: Optuna has started :-) 
-----------------------------


[32m[I 2023-03-01 00:13:55,035][0m Trial 0 finished with value: 666.6167485263261 and parameters: {'n_estimators': 2514, 'learning_rate': 0.46120247202105497, 'max_depth': 10, 'lambda_l1': 1.2462719056359346, 'lambda_l2': 3.742171521280458, 'num_leaves': 46, 'bagging_fraction': 0.8038831879477915, 'feature_fraction': 0.31692088856188116}. Best is trial 0 with value: 666.6167485263261.[0m




[32m[I 2023-03-01 00:14:32,833][0m Trial 1 finished with value: 739.9969990366424 and parameters: {'n_estimators': 2261, 'learning_rate': 0.0012050728529599626, 'max_depth': 9, 'lambda_l1': 0.0940354636913015, 'lambda_l2': 1.2748131012599297, 'num_leaves': 24, 'bagging_fraction': 0.8324713484464408, 'feature_fraction': 0.517875855446392}. Best is trial 0 with value: 666.6167485263261.[0m




[32m[I 2023-03-01 00:16:58,332][0m Trial 2 finished with value: 620.993002269427 and parameters: {'n_estimators': 9803, 'learning_rate': 0.06477894159495362, 'max_depth': 6, 'lambda_l1': 0.06506823133950966, 'lambda_l2': 0.11402088956281181, 'num_leaves': 68, 'bagging_fraction': 0.7104576292174876, 'feature_fraction': 0.7076787155895647}. Best is trial 2 with value: 620.993002269427.[0m


In [34]:
submission.head()

Unnamed: 0,id,price
0,193573,3969.155
1,193574,3969.155
2,193575,3969.155
3,193576,3969.155
4,193577,3969.155
