In [1]:
pip install xgboost lightgbm catboost optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.4-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting catboost
  Downloading catboost-1.1.1-cp310-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz
  Down

In [2]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.feature_selection import RFE, RFECV
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/PS-S3/Ep8/train.csv'
file_key_2 = 'Tabular-Playground-Series/PS-S3/Ep8/test.csv'
file_key_3 = 'Tabular-Playground-Series/PS-S3/Ep8/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data files
train = pd.read_csv(file_content_stream_1)
test = pd.read_csv(file_content_stream_2)
submission = pd.read_csv(file_content_stream_3)

#########################
## Feature Engineering ##
#########################

def updating_labels(df):
    
    df['clarity_scaled'] = df['clarity'].apply(lambda x: 0 if x == "IF" else 1 if x == "VVS1" else 2 if x == "VVS2" else 3 if x == "VS1" else 4 if x == "VS2" else 5 if x == "SI1" else 6 if x == "SI2" else 7)
    df['clarity_scaled'] = df['clarity_scaled'].astype(np.int8)
    
    df['cut_scaled'] = df['cut'].apply(lambda x: 0 if x == 'Fair' else 1 if x == 'Good' else 2 if x == 'Very Good' else 3 if x == 'Premium' else 4)                          
    df['cut_scaled'] = df['cut_scaled'].astype(np.int8) 
    
    df['color_scaled'] = df['color'].apply(lambda x: 0 if x == 'J' else 1 if x == 'I' else 2 if x == 'H' else 3 if x == 'G' else 4 if x == 'F' else 5 if x == 'E' else 6)
    df['color_scaled'] = df['color_scaled'].astype(np.int8)
    
    df.drop(columns = ['clarity', 'cut', 'color'], axis = 1, inplace = True)
    
    return df

train = updating_labels(train)
test = updating_labels(test)

Matplotlib is building the font cache; this may take a moment.


In [3]:
train.head()

Unnamed: 0,id,carat,depth,table,x,y,z,price,clarity_scaled,cut_scaled,color_scaled
0,0,1.52,62.2,58.0,7.27,7.33,4.55,13619,4,3,4
1,1,2.03,62.0,58.0,8.06,8.12,5.05,13387,6,2,0
2,2,0.7,61.2,57.0,5.69,5.73,3.5,2772,3,4,3
3,3,0.32,61.6,56.0,4.38,4.41,2.71,666,3,4,3
4,4,1.7,62.6,59.0,7.65,7.61,4.77,14453,4,3,3


In [4]:
train['clarity_scaled'].value_counts()

5    53272
4    48027
3    30669
6    30484
2    15762
1    10628
0     4219
7      512
Name: clarity_scaled, dtype: int64

In [5]:
test['clarity_scaled'].value_counts()

5    35336
4    32201
3    20519
6    20167
2    10317
1     7327
0     2791
7      392
Name: clarity_scaled, dtype: int64

# Splitting Duplicates

In [7]:
##########################
## Splitting Duplicates ##
##########################

train_dup = train.copy()
test_dup = test.copy()

to_consider = ['carat', 'depth', 'table', 'x', 'y', 'z', 'clarity_scaled', 'cut_scaled', 'color_scaled']

duplicates = pd.merge(train, test, on = to_consider)
train_dup_ids = duplicates['id_x'].tolist()
test_dup_ids = duplicates['id_y'].tolist()

train_clean = train[~np.isin(train['id'], train_dup_ids)].reset_index(drop = True)
train_dup = train[np.isin(train['id'], train_dup_ids)].reset_index(drop = True)

test_clean = test[~np.isin(test['id'], test_dup_ids)].reset_index(drop = True)
test_dup = test[np.isin(test['id'], test_dup_ids)].reset_index(drop = True)

# Optuna

In [17]:
#########################
## Optuna Optimization ##
#########################

print('-----------------------------')
print(' (-: Optuna has started :-) ')
print('-----------------------------')

X = train_clean.drop(columns = ['id', 'price'], axis = 1)
Y = train_clean['price']

test_lgb = test_clean.drop(columns = 'id', axis = 1)

class Objective:

    def __init__(self, seed):
        # Hold this implementation specific arguments as the fields of the class.
        self.seed = seed

    def __call__(self, trial):
        
        ## Parameters to be evaluated
        param = dict(metric = 'rmse',
                     boosting_type = 'gbdt', 
                     n_estimators = trial.suggest_int('n_estimators', 300, 10000),
                     learning_rate = trial.suggest_float('learning_rate', 0.001, 1, log = True),
                     max_depth = trial.suggest_int('max_depth', 3, 12),
                     lambda_l1 = trial.suggest_float('lambda_l1', 0.01, 10.0, log = True),
                     lambda_l2 = trial.suggest_float('lambda_l2', 0.01, 10.0, log = True),
                     num_leaves = trial.suggest_int('num_leaves', 2, 100),
                     bagging_fraction = trial.suggest_float('bagging_fraction', 0.2, 0.9),
                     feature_fraction = trial.suggest_float('feature_fraction', 0.2, 0.9)
#                      device = 'gpu'
                    )

        scores = []
        
        skf = KFold(n_splits = 5, shuffle = True, random_state = self.seed)

        for train_idx, valid_idx in skf.split(X, Y):

            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            Y_train , Y_valid = Y.iloc[train_idx] , Y.iloc[valid_idx]

            model = LGBMRegressor(**param).fit(X_train, Y_train)

            preds_valid = model.predict(X_valid)

            score = mean_squared_error(Y_valid, preds_valid, squared = False)
            scores.append(score)

        return np.mean(scores)
    
## Defining SEED and Trials
SEED = 42
N_TRIALS = 3

# Execute an optimization
study = optuna.create_study(direction = 'minimize')
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-01 00:13:13,535][0m A new study created in memory with name: no-name-ace4945a-7aa8-4e59-b19f-54d5fff49dfb[0m


-----------------------------
 (-: Optuna has started :-) 
-----------------------------


[32m[I 2023-03-01 00:13:55,035][0m Trial 0 finished with value: 666.6167485263261 and parameters: {'n_estimators': 2514, 'learning_rate': 0.46120247202105497, 'max_depth': 10, 'lambda_l1': 1.2462719056359346, 'lambda_l2': 3.742171521280458, 'num_leaves': 46, 'bagging_fraction': 0.8038831879477915, 'feature_fraction': 0.31692088856188116}. Best is trial 0 with value: 666.6167485263261.[0m




[32m[I 2023-03-01 00:14:32,833][0m Trial 1 finished with value: 739.9969990366424 and parameters: {'n_estimators': 2261, 'learning_rate': 0.0012050728529599626, 'max_depth': 9, 'lambda_l1': 0.0940354636913015, 'lambda_l2': 1.2748131012599297, 'num_leaves': 24, 'bagging_fraction': 0.8324713484464408, 'feature_fraction': 0.517875855446392}. Best is trial 0 with value: 666.6167485263261.[0m




[32m[I 2023-03-01 00:16:58,332][0m Trial 2 finished with value: 620.993002269427 and parameters: {'n_estimators': 9803, 'learning_rate': 0.06477894159495362, 'max_depth': 6, 'lambda_l1': 0.06506823133950966, 'lambda_l2': 0.11402088956281181, 'num_leaves': 68, 'bagging_fraction': 0.7104576292174876, 'feature_fraction': 0.7076787155895647}. Best is trial 2 with value: 620.993002269427.[0m
