In [None]:
%%time
import subprocess
import sys

def install_and_import(package):
    """Import ``package``, installing it with pip first when the import fails.

    Parameters
    ----------
    package : str
        Importable module path; may be dotted (e.g. ``"matplotlib.pyplot"``).

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    ImportError
        If the module still cannot be imported after installation.
    """
    import importlib

    # Distribution names on PyPI that differ from the top-level import name.
    pip_names = {"sklearn": "scikit-learn"}
    try:
        importlib.import_module(package)
    except ImportError:
        # pip installs distributions, not submodules: strip any dotted suffix
        # and translate known import-name/distribution-name mismatches.
        top_level = package.split(".")[0]
        distribution = pip_names.get(top_level, top_level)
        subprocess.check_call([sys.executable, "-m", "pip", "install", distribution])
        # Make the freshly installed files visible to the import system.
        importlib.invalidate_caches()
        importlib.import_module(package)

# (module path, alias) pairs handed to the install-and-import loop.
# Only the first element of each pair is passed to install_and_import(); the
# second element is unpacked by the loop but never read.
# NOTE(review): the list contains standard-library modules ('re', 'os',
# 'warnings') and a dotted module path ('matplotlib.pyplot'); confirm whether
# they are meant to be eligible for `pip install`.
packages = [
    ('sklearn', '*'), 
    ('tqdm', 'tqdm'), 
    ('re', 're'), 
    ('numpy', 'np'), 
    ('pandas', 'pd'), 
    ('matplotlib.pyplot', 'plt'), 
    ('seaborn', 'sns'), 
    ('os', 'os'), 
    ('warnings', 'warnings'), 
    ('xgboost', 'xgboost'), 
    ('catboost', 'catboost'), 
    ('lightgbm', 'lightgbm'), 
    ('tabulate', 'tabulate'), 
    ('statsmodels', 'ARIMA'), 
    ('colorama', 'Fore, Style, init'), 
    ('category_encoders', '*'), 
    ('mlxtend', '*'), 
    ('optuna', 'optuna'), 
]

# Make sure every listed module is importable, installing it when it is missing.
# `import_as` is unpacked but not used: the names actually bound in the
# notebook come from the explicit import statements that follow.
for package, import_as in packages:
    install_and_import(package)

# Specific imports
from sklearn.pipeline import *
from tqdm import tqdm
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.tree import *
from sklearn.ensemble import *
from xgboost import *
from catboost import *
from sklearn.linear_model import *
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import *
from sklearn.compose import *
from sklearn.neural_network import *
from lightgbm import *
from tabulate import tabulate
from statsmodels.tsa.arima.model import ARIMA
from colorama import Fore, Style, init
from category_encoders import *
from mlxtend.evaluate import *
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args
from sklearn.base import *
from sklearn.inspection import *
import optuna
from optuna.visualization import *
from optuna.pruners import *
import logging
import sys
from sklearn.decomposition import *

# Attach a stdout stream handler to the "optuna" logger so that Optuna's log
# messages appear in the cell output.
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Narrower alternatives to the blanket filter above (currently disabled):
# warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)

# Let pandas render up to 200 rows / 200 columns before truncating a frame.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)

In [None]:
# --- Experiment switches -----------------------------------------------------
# Append the original used-cars dataset to the training frame (also read by
# create_folds to extend the training indices).
UseOriginalDataset=False
# Append the additional dataset (`add_data`) to the training frame.
UseAdditionalData=False
# Enable the rare-category bucketing cells for model / brand / colour columns.
UseMyEncoding=False
# Enable the engine and transmission feature-extraction cells.
UseFE=True

# --- Cross-validation / reproducibility ---------------------------------------
n_splits=2
# Captured as the default `n_repeats` of create_folds when that function is defined.
n_repeats=3
# Seed passed as `random_state` to the gradient-boosting models further down.
seed=0
# Number of times an extra dataset is appended by get_combined_data.
copies=1

In [None]:
from sklearn.metrics import matthews_corrcoef

def create_folds(X_train, y_train, origin, n_splits=2,n_repeats=n_repeats,copies=1, random_state=None):
    """Build repeated K-fold index pairs, optionally extending each training split.

    Parameters
    ----------
    X_train : sized object handed to ``RepeatedKFold.split``.
    y_train : numeric target; floored to integers and passed as ``y`` to ``split``.
    origin : sized object; only read when the global ``UseOriginalDataset`` is true.
        Its rows are assumed to sit directly after the ``len(X_train)`` training rows.
    n_splits, n_repeats : forwarded to ``RepeatedKFold``.
    copies : how many stacked copies of ``origin`` follow the training rows.
    random_state : seed for the splitter. ``None`` keeps the historical seed of 0,
        so existing callers get the same folds as before.

    Returns
    -------
    list of ``(train_index, valid_index)`` tuples of positional indices.
    """
    # The parameter used to be ignored in favour of a hard-coded 0; honour it
    # while keeping 0 as the fallback so default calls are unchanged.
    split_seed = 0 if random_state is None else random_state
    kf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=split_seed)
    # NOTE(review): RepeatedKFold is not a stratified splitter, so these bins
    # presumably have no effect on the folds — confirm whether stratification was intended.
    target_bins = np.floor(y_train).astype(int)

    # Positions of the appended origin rows are the same for every fold.
    origin_index = None
    if UseOriginalDataset:
        origin_index = np.arange(copies*len(origin)) + len(X_train)

    folds = []
    for train_index, valid_index in kf.split(X_train, target_bins):
        if origin_index is not None:
            # Origin rows are added to the training side only, never to validation.
            train_index = np.concatenate((train_index, origin_index))
        folds.append((train_index, valid_index))
    return folds


def get_combined_data(df,original,copies):
    """Return ``df`` with ``copies`` copies of ``original`` appended below it.

    Parameters
    ----------
    df : pandas.DataFrame
        Base frame; it is not modified.
    original : pandas.DataFrame
        Frame to append.
    copies : int
        Number of times ``original`` is appended. Zero or a negative value
        returns ``df`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The combined frame with a fresh 0..n-1 index (or ``df`` itself when
        nothing is appended).
    """
    if copies <= 0:
        return df
    # One concat call instead of growing the frame inside a loop; the function
    # previously ended with a bare `return` and therefore returned None.
    return pd.concat([df] + [original] * copies, ignore_index=True)


def simple_cv(model, X, y, folds, silent=True):
    """Score ``model`` with RMSE on each pre-computed fold and return the mean.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        The same object is refitted on every fold.
    X, y : objects sliced positionally through ``.iloc``.
    folds : iterable of ``(train_index, valid_index)`` pairs.
    silent : bool
        When False, print the per-fold scores, the lowest score and the mean.

    Returns
    -------
    Mean of the per-fold RMSE values.
    """
    rmse_scores = []
    for train_index, valid_index in folds:
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        model.fit(X_train,y_train)
        y_pred = model.predict(X_valid)

        # Optional inverse-scaling step, currently disabled:
        #     y_pred = np.array(y_pred).reshape(-1, 1)
        #     y_pred = scaler.inverse_transform(y_pred)

        # Take the square root explicitly instead of passing `squared=False`,
        # a keyword that recent scikit-learn releases deprecate and then drop.
        rmse = np.sqrt(mean_squared_error(y_valid, y_pred))  # Change metric
        rmse_scores.append(rmse)
    if not silent:
        print("Cross-Validation Scores:", rmse_scores)
        # The value printed is the minimum (RMSE: lower is better); the label
        # used to read "Maximum".
        print("Minimum CV score:", np.min(rmse_scores))
        print("Mean CV Score:", np.mean(rmse_scores))
    return np.mean(rmse_scores)

In [None]:
%%time

import polars as pl
import pandas as pd

def _load_csv(path, drop_id=True):
    """Read ``path`` with polars and return a pandas frame, optionally without ``id``."""
    frame = pl.read_csv(path)
    if drop_id:
        frame = frame.drop("id")
    return frame.to_pandas()


# Competition training data (id column removed).
train = _load_csv("/kaggle/input/playground-series-s4e9/train.csv")
print("Train Dataset =", train.shape)
display(train)

# Competition test data (id column removed).
test = _load_csv("/kaggle/input/playground-series-s4e9/test.csv")
print("Test Dataset =", test.shape)
display(test)

# Sample submission is read directly with pandas and keeps all its columns.
submission = pd.read_csv("/kaggle/input/playground-series-s4e9/sample_submission.csv")

# Original used-cars dataset; read as-is, no column is dropped.
origin = _load_csv("/kaggle/input/used-car-price-prediction-dataset/used_cars.csv", drop_id=False)
print("Original Dataset =", origin.shape)
display(origin)

# Additional dataset (id column removed).
add_data = _load_csv("/kaggle/input/kagglex-official-dataset/train.csv")
print("Additional Original Dataset =", add_data.shape)
display(add_data)

# Name of the column to predict.
target = "price"


In [None]:
# Start from the competition data, optionally extended with the original dataset.
df = get_combined_data(train, origin, copies) if UseOriginalDataset else train.copy()

# Optionally stack the additional dataset on top of that.
if UseAdditionalData:
    df = get_combined_data(df, add_data, copies)

# Report exact duplicate rows, drop them (keeping the last occurrence of each),
# then report again to confirm none remain.
num_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {num_duplicates}")
df = df.drop_duplicates(keep='last')
num_duplicates = df.duplicated().sum()
print(f"Number of duplicate rows After Removal: {num_duplicates}")

## Handling Missing Values

In [None]:
# Placeholder label written into each categorical column where the value is missing.
missing_labels = {'fuel_type': 'none', 'accident': 'empty', 'clean_title': 'empty'}

# Apply the same placeholder to the training frame and to the test frame.
for column, label in missing_labels.items():
    df[column] = df[column].fillna(label)
    test[column] = test[column].fillna(label)

In [None]:
if UseMyEncoding:
    # Model names that occur in test but not in the training frame.
    unknown_test_model_col = list(set(test['model'].unique()) - set(df['model'].unique()))
    print(unknown_test_model_col)
    # unknown_origin_model_col=list(set(origin['model'].unique())-set(df['model'].unique()))
    # print(unknown_origin_model_col)

    # Upper bounds of the frequency bands; band k is labelled 'rare{k}'.
    thresholds = [1, 5]
    value_counts = df['model'].value_counts()
    rare_categories_dict = {f'rare{i+1}': [] for i in range(len(thresholds))}

    def assign_rare_category(color, value_counts, thresholds):
        """Map a category to its 'rareN' band label, or return it unchanged.

        Values listed in ``unknown_test_model_col`` always go to 'rare1'.
        Every mapped value is also recorded in ``rare_categories_dict``.
        """
        if color in unknown_test_model_col:
            bucket = 'rare1'
            rare_categories_dict[bucket].append(color)
            return bucket
        for i, limit in enumerate(thresholds):
            count = value_counts[color]
            if count > limit:
                continue
            # Skip a band whose predecessor would already have covered this count.
            if i != 0 and count <= thresholds[i - 1]:
                continue
            bucket = f'rare{i+1}'
            rare_categories_dict[bucket].append(color)
            return bucket
        return color

    df['model'] = df['model'].apply(assign_rare_category, value_counts=value_counts, thresholds=thresholds)
    # origin['model'] = origin['model'].apply(lambda x: assign_rare_category(x, value_counts=value_counts, thresholds=thresholds))
    test['model'] = test['model'].apply(assign_rare_category, value_counts=value_counts, thresholds=thresholds)

In [None]:
# Rewrite one specific engine description in the test frame.
# NOTE(review): presumably this maps a test-only engine string onto one that
# also exists in the training data — confirm.
engine_patch = {
    '177.0HP 1.6L 4 Cylinder Engine Plug-In Electric/Gas':
        '139.0HP 1.6L 4 Cylinder Engine Plug-In Electric/Gas',
}
test['engine'] = test['engine'].replace(engine_patch)

# Extract horsepower
def engine_feat_extract(data):
    """Derive engine features from the free-text ``engine`` column.

    Works on a copy of ``data``; the input frame is left untouched. Rows whose
    ``engine`` value is missing no longer raise: their extracted text features
    stay missing and their flag columns are 0.

    Adds, in this order:
      * ``horsepower``      - text matched by ``<digits>.<digits>HP``
      * ``engine_capacity`` - text matched by ``<digits>.<digits>L`` after normalisation
      * ``cylinder_type``   - cylinder token such as ``V6``, ``I4`` or a bare count
      * ``is_electric``, ``DOHC``, ``GDI``, ``MPFI`` - 0/1 substring flags
      * ``voltage``         - digits preceding a ``V``

    The ``engine`` column itself is returned in its normalised spelling.

    Parameters
    ----------
    data : pandas.DataFrame with an ``engine`` column of strings (missing allowed).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the columns above added.
    """
    dff=data.copy()
    # Horsepower is read before normalisation, exactly as written in the text.
    dff['horsepower'] = dff['engine'].str.extract(r'(\d+\.\d+)HP', expand=False)

    # Normalise spelling variants. Order matters: the longer ' Litre'/' Liter'
    # forms must be collapsed before the generic ' L' rule runs.
    # `.str.replace` (literal mode) skips missing values instead of raising.
    for old, new in ((' Litre', 'L'), (' Liter', 'L'), (' L', 'L'), ('V-', 'V')):
        dff['engine'] = dff['engine'].str.replace(old, new, regex=False)

    dff['engine_capacity'] = dff['engine'].str.extract(r'(\d+\.\d+)L', expand=False)

    dff['cylinder_type'] = dff['engine'].str.extract(
        r'( \d+ | V\d+ | I\d+ | W\d+ |I\d+ |V\d+ |V\d+| H\d+ |I\d+)', expand=False
    )
    dff['cylinder_type'] = dff['cylinder_type'].str.strip()

    # Substring flags; `na=False` makes a missing description count as "absent".
    for flag, token in (('is_electric', 'Electric'), ('DOHC', 'DOHC'), ('GDI', 'GDI'), ('MPFI', 'MPFI')):
        dff[flag] = dff['engine'].str.contains(token, regex=False, na=False).astype(int)

    # Candidate flags that are currently disabled:
    #     dff['Gasoline'] = dff['engine'].str.contains('Gasoline').astype(int)
    #     dff['Fuel'] = dff['engine'].str.contains('Fuel').astype(int)

    dff['voltage'] = dff['engine'].str.extract(r'(\d+V)', expand=False)
    dff['voltage'] = dff['voltage'].str.replace('V', '', regex=False)
    return dff
# Add the engine-derived columns to both frames when feature engineering is on.
if UseFE:
    df, test = engine_feat_extract(df), engine_feat_extract(test)

# df[['horsepower','engine_capacity','cylinder_type','is_electric']]

In [None]:
# Replace missing engine features with the sentinel -1 and make the numeric ones floats.
if UseFE:
    engine_feats=['voltage','horsepower','engine_capacity','cylinder_type','is_electric','DOHC','GDI','MPFI']
    columns_to_fill=['voltage','horsepower','engine_capacity','cylinder_type']
    numeric_engine_cols = ['voltage','horsepower','engine_capacity']

    # Same treatment for the training and the test frame; column assignment
    # through the loop variable updates the original frame objects.
    for frame in (df, test):
        frame[columns_to_fill] = frame[columns_to_fill].fillna(-1)
    for frame in (df, test):
        frame[numeric_engine_cols] = frame[numeric_engine_cols].astype(float)

In [None]:
def transmission_feat_extract(data):
    """Normalise the ``transmission`` text and derive gear-count and type flags.

    Works on a copy of ``data``; the input frame is left untouched. Rows whose
    ``transmission`` value is missing no longer raise: their flags are 0.

    Adds:
      * ``speed`` - string with the number in ``<n>-Speed``, or else the first
        single digit found; produced through ``astype(str)``, so it is never
        a numeric column
      * ``AT``    - 1 when the normalised text contains ``A/T``
      * ``MT``    - 1 when the normalised text contains ``M/T``

    Parameters
    ----------
    data : pandas.DataFrame with a ``transmission`` column of strings (missing allowed).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the normalised column and the new features.
    """
    dff=data.copy()
    # Spelling variants are collapsed in this exact order: the full words are
    # rewritten before the 'At'/'Mt' abbreviations.
    # `.str.replace` (literal mode) skips missing values instead of raising.
    replacements = (
        ('Automatic', 'A/T'),
        ('Manual', 'M/T'),
        ('At', 'A/T'),
        ('Mt', 'M/T'),
        ('6 Speed', '6-Speed'),
    )
    for old, new in replacements:
        dff['transmission'] = dff['transmission'].str.replace(old, new, regex=False)

    dff['speed'] = dff['transmission'].str.extract(r'(\d+-Speed|\d)', expand=False)
    dff['speed'] = dff['speed'].astype(str).str.replace('-Speed', '', regex=False)
    dff['speed'] = dff['speed'].str.strip()
    #     dff['speed']=dff['speed'].astype(int)

    # `na=False` makes a missing description count as "neither A/T nor M/T".
    dff['AT'] = dff['transmission'].str.contains('A/T', regex=False, na=False).astype(int)
    dff['MT'] = dff['transmission'].str.contains('M/T', regex=False, na=False).astype(int)
    return dff

# Add the transmission-derived columns to both frames when feature engineering is on.
if UseFE:
    df, test = transmission_feat_extract(df), transmission_feat_extract(test)


In [None]:
# Lower-case the exterior and interior colour names in both frames so that
# differently capitalised spellings become the same category.
for frame in (df, test):
    for colour_col in ('ext_col', 'int_col'):
        frame[colour_col] = frame[colour_col].str.lower()

In [None]:
if UseMyEncoding:
    # Brands that occur in test but not in the training frame.
    unknown_test_brand_col=list(set(test['brand'].unique())-set(df['brand'].unique()))
    print(unknown_test_brand_col)
    # NOTE(review): this list is named "origin" but is computed from `add_data`,
    # while the mapping further down is applied to `origin` — confirm which
    # frame is intended.
    unknown_origin_brand_col=list(set(add_data['brand'].unique())-set(df['brand'].unique()))
    print(unknown_origin_brand_col)

    # Upper bounds of the frequency bands; band k is labelled 'rare{k}'.
    thresholds=[1,5]
    value_counts = df['brand'].value_counts()
    rare_categories_dict = {f'rare{i+1}': [] for i in range(len(thresholds))}
    def assign_rare_category(color, value_counts, thresholds):
        """Map a brand to its 'rareN' band label, or return it unchanged.

        Brands listed as unknown go to 'rare1'. A brand that is absent from
        ``value_counts`` is treated as having a count of 0 instead of raising
        KeyError. Every mapped value is recorded in ``rare_categories_dict``.
        """
        if color in unknown_test_brand_col or color in unknown_origin_brand_col:
            rare_categories_dict['rare1'].append(color)
            return 'rare1'
        # `.get` rather than `[...]`: `origin` may hold brands that appear in
        # neither `df` nor the unknown lists above.
        count = value_counts.get(color, 0)
        for i, threshold in enumerate(thresholds):
            if count <= threshold and (i == 0 or count > thresholds[i - 1]):
                rare_categories_dict[f'rare{i+1}'].append(color)
                return f'rare{i+1}'
        return color
    df['brand'] =df['brand'].apply(assign_rare_category, value_counts=value_counts, thresholds=thresholds)
    origin['brand'] = origin['brand'].apply(lambda x: assign_rare_category(x, value_counts=value_counts, thresholds=thresholds))
    test['brand'] = test['brand'].apply(lambda x: assign_rare_category(x, value_counts=value_counts, thresholds=thresholds))

In [None]:
if UseMyEncoding:
    # Interior colours seen in test but not in the training frame.
    unknown_test_int_col = list(set(test['int_col'].unique()) - set(df['int_col'].unique()))
    # Interior colours seen in `add_data` but not in the raw `train` frame.
    unknown_origin_int_col = list(set(add_data['int_col'].unique()) - set(train['int_col'].unique()))

    # Upper bounds of the frequency bands; band k is labelled 'rare{k}'.
    thresholds = [1, 5, 10]
    value_counts = df['int_col'].value_counts()
    rare_categories_dict = {f'rare{i+1}': [] for i in range(len(thresholds))}

    def assign_rare_category(color, value_counts, thresholds):
        """Map a colour to its 'rareN' band label, or return it unchanged.

        Colours found in either unknown list always go to 'rare1'.
        Every mapped value is also recorded in ``rare_categories_dict``.
        """
        if color in unknown_test_int_col or color in unknown_origin_int_col:
            bucket = 'rare1'
            rare_categories_dict[bucket].append(color)
            return bucket
        for i, limit in enumerate(thresholds):
            count = value_counts[color]
            if count > limit:
                continue
            # Skip a band whose predecessor would already have covered this count.
            if i != 0 and count <= thresholds[i - 1]:
                continue
            bucket = f'rare{i+1}'
            rare_categories_dict[bucket].append(color)
            return bucket
        return color

    df['int_col'] = df['int_col'].apply(assign_rare_category, value_counts=value_counts, thresholds=thresholds)
    test['int_col'] = test['int_col'].apply(assign_rare_category, value_counts=value_counts, thresholds=thresholds)

In [None]:
if UseMyEncoding:
    # Exterior colours seen in test but not in the training frame.
    unknown_test_ext_col = list(set(test['ext_col'].unique()) - set(df['ext_col'].unique()))
    # Exterior colours seen in `add_data` but not in the raw `train` frame.
    unknown_origin_ext_col = list(set(add_data['ext_col'].unique()) - set(train['ext_col'].unique()))

    # Upper bounds of the frequency bands; band k is labelled 'rare{k}'.
    thresholds = [1, 5, 10]
    value_counts = df['ext_col'].value_counts()
    rare_categories_dict = {f'rare{i+1}': [] for i in range(len(thresholds))}

    def assign_rare_category(color, value_counts, thresholds):
        """Map a colour to its 'rareN' band label, or return it unchanged.

        Colours found in either unknown list always go to 'rare1'.
        Every mapped value is also recorded in ``rare_categories_dict``.
        """
        if color in unknown_test_ext_col or color in unknown_origin_ext_col:
            bucket = 'rare1'
            rare_categories_dict[bucket].append(color)
            return bucket
        for i, limit in enumerate(thresholds):
            count = value_counts[color]
            if count > limit:
                continue
            # Skip a band whose predecessor would already have covered this count.
            if i != 0 and count <= thresholds[i - 1]:
                continue
            bucket = f'rare{i+1}'
            rare_categories_dict[bucket].append(color)
            return bucket
        return color

    df['ext_col'] = df['ext_col'].apply(assign_rare_category, value_counts=value_counts, thresholds=thresholds)
    test['ext_col'] = test['ext_col'].apply(assign_rare_category, value_counts=value_counts, thresholds=thresholds)

In [None]:
# Integer-encode every object-typed column of the test frame, in both frames.
le = LabelEncoder()

# Cast to str first so that mixed content (e.g. the -1 fill value next to
# strings) is encoded as one consistent type.
categorical_columns = test.select_dtypes(include=['object']).columns
for col in categorical_columns:
    df[col] = df[col].astype(str)
    test[col] = test[col].astype(str)
    
categorical_columns = list(categorical_columns)
cat_cols=categorical_columns
print(categorical_columns)
# categorical_columns.append('model_year')

for col in categorical_columns:
    print(col)
    # Fit on the labels of BOTH frames: fitting on `df` alone makes
    # `le.transform(test[col])` raise as soon as test holds a label that never
    # occurs in training. When test has no extra labels the resulting codes
    # are the same as before.
    le.fit(pd.concat([df[col], test[col]], ignore_index=True))
    df[col] = le.transform(df[col])
    test[col] = le.transform(test[col])


print(df.columns)

In [None]:
# Hand-picked list of categorical feature names.
# NOTE(review): not referenced by any other visible cell — confirm whether it is still needed.
cat_feats=['brand', 'model','engine', 'int_col', 'ext_col','accident','cylinder_type','transmission','fuel_type']

## For Feature Selection

In [None]:
# %%time


# data=df.copy()
# data=data.reset_index().drop('index',axis=1)
# X=data.drop(target,axis=1)
# rng = np.random.default_rng(42)
# X['NOISE1'] = rng.normal(size=len(X))
# X['NOISE2'] = rng.normal(size=len(X))
# X['NOISE3'] = rng.normal(size=len(X))

# n_splits=2
# n_repeats=3
# kfold =RepeatedKFold(n_splits=n_splits,n_repeats=n_repeats,random_state=0)
# clfs = {
#     'lightgbm':hgb,
# #     'hist_gradient_boosting':hgb,
# #     'gradient_boosting':gb,
# }

# # for col in labels:
# #     print(f"\n{'=' * 20} For TARGET: {col} {'=' * 20}\n")
# y=data[target]
# _, axes = plt.subplots(len(clfs),1,figsize=(20, len(clfs)*5))
# for i,m in enumerate(clfs):
#     scores = 0
#     roc_auc_scores=[]
#     with tqdm(total=n_splits*n_repeats) as pbar:
#         for train_index, valid_index in kfold.split(X,y):
#             X_t, X_v = X.iloc[train_index], X.iloc[valid_index]
#             y_t, y_v = y[train_index], y[valid_index]
#             model=clone(clfs[m])
#             model.fit(X_t,y_t)
#             y_pred=model.predict(X_v)
#             roc_auc =  mean_squared_error(y_v, y_pred,squared=False)#Change the metric
#             roc_auc_scores.append(roc_auc)
#             scores += permutation_importance(clone(clfs[m]).fit(X_t,y_t),
#                                              X_v,y_v,
#                                              scoring='neg_root_mean_squared_error',#Change the metric
#                                              n_jobs=-1,
#                                              random_state=seed).importances_mean
#             pbar.update()
#         pbar.close()
#     roc_auc_scores = np.array(roc_auc_scores)
#     print(f"Average Accuracy score of {m}==>{roc_auc_scores.mean()} ± {roc_auc_scores.std()}")
#     print(f"Minimum Accuracy score of {m}==>{np.min(roc_auc_scores)}")
#     s = pd.Series(scores/n_splits/n_repeats,index=X.columns).sort_values(ascending=False)
#     s.plot(kind='barh', ax=axes, color=['red' if c.startswith('NOISE') else 'green' for c in s.index])
#     axes.invert_yaxis()
#     axes.set_xscale('function', functions=(lambda x: np.sign(x)*np.abs(x)**(1/4), lambda x: np.sign(x)*np.abs(x)**4))
#     axes.set_title(f'{m} - CROSS-VALIDATED PERMUTATION IMPORTANCE')

# plt.tight_layout()
# plt.show()
# plt.clf()
# plt.close() 
# plt.rcdefaults()
# sns.reset_defaults()

In [None]:
# `clean_title` is always removed; the listed engineered flags are removed too
# when feature engineering produced them.
dropped_columns = ['clean_title']
if UseFE:
    dropped_columns += ['AT','MT','is_electric','MPFI']

df = df.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

In [None]:
# Hyper-parameter sets for the final models. The CatBoost set below is kept
# for reference only and is not used by any visible cell.
# cat_params={'boosting_type': 'Ordered',
#             'eta': 0.003599532031747313,
#             'n_estimators': 785,
#             'bootstrap_type': 'Bernoulli',
#             'reg_lambda': 0.15814818843258527, 'depth': 5,
#             'max_bin': 251,
#             'subsample': 0.5767785369941868
#            }

# Merged into the keyword arguments of LGBMRegressor.
# NOTE(review): both 'feature_fraction' and 'colsample_bytree' are set, with
# different values; LightGBM documents these as aliases of one parameter —
# confirm which value is meant to apply.
lgb_params={'n_estimators': 1890,
                 'learning_rate': 0.0049343166168420195,
                 'data_sample_strategy': 'goss',
                 'feature_fraction': 0.3887459059437565, 
                 'lambda_l1': 7.239967197949322e-07, 
                 'lambda_l2': 7.488955354504223e-06, 
                 'num_leaves': 1440, 
                 'max_depth': 8, 
                 'colsample_bytree': 0.8390384224124089, 
                 'min_child_samples': 123, 
                 'min_gain_to_split': 1.491437722787296, 
                 'max_bin': 246
                }

# Passed to HistGradientBoostingRegressor.
hgb_params={'learning_rate': 0.01896652440172408,
            'max_iter': 2291,
            'max_leaf_nodes': 1915,
            'max_depth': 4, 
            'min_samples_leaf': 93, 
            'l2_regularization': 5.726868918336535,
            'max_bins': 60, 
            'tol': 3.5296168509745593e-08
           }

# Passed to GradientBoostingRegressor.
gb_params={'learning_rate': 0.0359605832217767,
           'n_estimators': 626, 
           'subsample': 0.8182203880912126,
           'criterion': 'friedman_mse', 
           'min_samples_split': 98, 
           'min_samples_leaf': 94, 
           'min_weight_fraction_leaf': 0.0039333091548447805, 
           'max_depth': 13, 
           'min_impurity_decrease': 0.01946397382407441, 
           'max_features': 'log2', 
           'alpha': 0.3688013161101175, 
           'max_leaf_nodes': 55, 
           'tol': 9.67892273042649e-05
          }

In [None]:
# Final LightGBM fit: 200 bagged copies of a target-standardised regressor.

# No visible cell defines X or y, so the cell failed on a fresh kernel. Derive
# them from the prepared training frame: every column except the target.
# NOTE(review): confirm this matches the feature set used when tuning.
X = df.drop(columns=[target])
y = df[target]

# NOTE(review): `base_params_lgb` is not defined in any visible cell either;
# it must exist before this cell runs.
model=LGBMRegressor(**{**base_params_lgb,**lgb_params})

# Standardise the target for fitting; predictions are mapped back automatically.
model_pipe=TransformedTargetRegressor(
        regressor=model,
        transformer=StandardScaler()
    )
# Seed the ensemble itself: the bagging wrapper assigns the seeds of its
# sub-estimators, so without this the run is not reproducible.
pipe=BaggingRegressor(estimator=model_pipe,n_estimators=200,random_state=seed)

pipe.fit(X,y)
preds=pipe.predict(test)
submission[target]=preds
submission.to_csv("Tunned_lgb_bag200.csv",index=False)

In [None]:
# Final GradientBoosting fit: 200 bagged copies of a target-standardised regressor.

# No visible cell defines X or y, so the cell failed on a fresh kernel. Derive
# them from the prepared training frame: every column except the target.
# NOTE(review): confirm this matches the feature set used when tuning.
X = df.drop(columns=[target])
y = df[target]

model=GradientBoostingRegressor(**gb_params, random_state=seed)

# Standardise the target for fitting; predictions are mapped back automatically.
model_pipe=TransformedTargetRegressor(
        regressor=model,
        transformer=StandardScaler()
    )
# Seed the ensemble itself: the bagging wrapper assigns the seeds of its
# sub-estimators, so the seed on `model` alone does not make the run reproducible.
pipe=BaggingRegressor(estimator=model_pipe,n_estimators=200,random_state=seed)

pipe.fit(X,y)
preds=pipe.predict(test)
submission[target]=preds
submission.to_csv("Tunned_gb_bag200.csv",index=False)

In [None]:
# Final HistGradientBoosting fit: 200 bagged copies of a target-standardised regressor.

# No visible cell defines X or y, so the cell failed on a fresh kernel. Derive
# them from the prepared training frame: every column except the target.
# NOTE(review): confirm this matches the feature set used when tuning.
X = df.drop(columns=[target])
y = df[target]

model=HistGradientBoostingRegressor(**hgb_params,random_state=seed)

# Standardise the target for fitting; predictions are mapped back automatically.
model_pipe=TransformedTargetRegressor(
        regressor=model,
        transformer=StandardScaler()
    )
# Seed the ensemble itself: the bagging wrapper assigns the seeds of its
# sub-estimators, so the seed on `model` alone does not make the run reproducible.
pipe=BaggingRegressor(estimator=model_pipe,n_estimators=200,random_state=seed)

pipe.fit(X,y)
preds=pipe.predict(test)
submission[target]=preds
submission.to_csv("Tunned_hgb_bag200.csv",index=False)