In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 100)
pd.set_option('display.max_rows', 200)
import warnings
warnings.filterwarnings('ignore')

import gc

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
abalone = fetch_ucirepo(id=1) 
  
# data (as pandas dataframes)
orig_train = pd.concat([abalone.data.features, abalone.data.targets], axis = 1).rename({
    'Whole_weight' : 'Whole weight',
    'Shucked_weight' : 'Whole weight.1',
    'Viscera_weight' : 'Whole weight.2',
    'Shell_weight' : 'Shell weight'
}, axis = 1)
orig_train.to_csv('original_train.csv')

In [3]:
train = pd.read_csv(r'train.csv', index_col = 'id')
test = pd.read_csv(r'test.csv', index_col = 'id')
submission = pd.read_csv("sample_submission.csv")

In [4]:
train = pd.concat([train, orig_train], ignore_index=True)

In [5]:
def feats(df):
#     df['WW1_Ratio'] = df['Whole weight.1'] / df['Whole weight']
#     df['WW2_Ratio'] = df['Whole weight.2'] / df['Whole weight']
#     df['Shell_Ratio'] = df['Shell weight'] / df['Whole weight']
#     df['ProxyVolume'] = 2 * np.log1p(df["Diameter"]) +  np.log1p(df["Height"])
    df['Height']=np.where(df['Height']==0, 0.005, df['Height'])
    return df

train = feats(train)
test = feats(test)

In [7]:
from openfe import OpenFE, transform

ofe = OpenFE()
features = ofe.fit(data=train.drop('Rings', axis=1), label=train['Rings'], feature_boosting=True, n_jobs=8, task='regression', metric='rmse', n_data_blocks=512)
train_x, test_x = transform(train.drop('Rings', axis=1), test, features, n_jobs=8)

train_x.reset_index(drop=True, inplace=True)
train = pd.concat([train_x, train['Rings']], axis=1)
test = test_x.copy()

train.head()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1330
[LightGBM] [Info] Number of data points in the train set: 75833, number of used features: 8
[LightGBM] [Info] Start training from score 9.718698
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[272]	valid_0's rmse: 1.8389
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1330
[LightGBM] [Info] Number of data points in the train set: 75833, number of used features: 8
[LightGBM] [Info] Start training from score 9.698588
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[359]	valid_0's rmse: 1.87212
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the ove

100%|██████████| 30/30 [00:19<00:00,  1.57it/s]


80 same features have been deleted.
The number of remaining candidate features is 78
Start stage II selection.


100%|██████████| 26/26 [00:06<00:00,  3.86it/s]


Finish data processing.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016011 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19412
[LightGBM] [Info] Number of data points in the train set: 75833, number of used features: 86


Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,autoFE_f_0,autoFE_f_1,autoFE_f_2,autoFE_f_3,autoFE_f_4,autoFE_f_5,autoFE_f_6,autoFE_f_7,autoFE_f_8,autoFE_f_9,autoFE_f_10,autoFE_f_11,autoFE_f_12,autoFE_f_13,autoFE_f_14,autoFE_f_15,autoFE_f_16,autoFE_f_17,autoFE_f_18,autoFE_f_19,autoFE_f_20,autoFE_f_21,autoFE_f_22,autoFE_f_23,autoFE_f_24,autoFE_f_25,autoFE_f_26,autoFE_f_27,autoFE_f_28,autoFE_f_29,autoFE_f_30,autoFE_f_31,autoFE_f_32,autoFE_f_33,autoFE_f_34,autoFE_f_35,autoFE_f_36,autoFE_f_37,autoFE_f_38,autoFE_f_39,autoFE_f_40,autoFE_f_41,autoFE_f_42,autoFE_f_43,autoFE_f_44,autoFE_f_45,autoFE_f_46,autoFE_f_47,autoFE_f_48,autoFE_f_49,autoFE_f_50,autoFE_f_51,autoFE_f_52,autoFE_f_53,autoFE_f_54,autoFE_f_55,autoFE_f_56,autoFE_f_57,autoFE_f_58,autoFE_f_59,autoFE_f_60,autoFE_f_61,autoFE_f_62,autoFE_f_63,autoFE_f_64,autoFE_f_65,autoFE_f_66,autoFE_f_67,autoFE_f_68,autoFE_f_69,autoFE_f_70,autoFE_f_71,autoFE_f_72,autoFE_f_73,autoFE_f_74,autoFE_f_75,autoFE_f_76,autoFE_f_77,Rings
0,0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,2672.0,2.348554,1.36875,0.443,0.0885,0.489898,-0.0935,0.157544,0.036,136.0,0.09,3.666667,0.24,0.31,2.242321,0.19,0.4035,340.0,601.0,0.035397,3.214583,0.5765,0.5685,0.24,0.243335,5.266212,0.24,-0.0035,0.141255,0.27863,2.935154,0.1465,2.14,0.7715,0.7,,1.2015,0.062995,,0.6965,0.610417,0.132,,0.557356,0.24,0.2835,,0.725,0.049275,0.106898,0.79,0.55,0.3285,-1.427116,0.918,0.3285,-0.3415,0.4405,0.215506,0.7585,0.080575,0.24,0.8785,0.115725,0.43,0.24,1.0,0.1465,0.1465,0.253438,0.67,0.24,0.24,0.575,1.1,1.0115,0.24,1.1375,11
1,0,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,1204.0,2.467249,1.43125,0.672,0.138,0.565685,-0.0435,0.83746,0.0464,88.0,0.175,4.344828,0.32,0.31,1.65642,0.17,0.3535,86.0,40.0,0.035333,3.53125,0.7665,0.778,0.32,0.95693,4.086799,0.32,0.1315,0.22442,0.715119,1.772152,0.2765,1.8075,0.13,0.775,,1.62,0.135485,,0.9065,0.864063,0.2016,,0.433628,0.32,0.2135,,0.72,0.06641,0.963678,0.95,0.63,0.458,-1.139434,1.4065,0.458,-0.64,0.4695,0.649572,0.948,0.174195,0.32,1.088,0.16385,0.49,0.32,1.0,0.2765,0.2765,0.51754,0.81,0.32,0.32,0.585,1.588,1.45,0.32,0.876,11
2,1,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,499.0,3.818182,1.1,0.0155,0.0005,0.070711,-0.002,0.155689,0.000125,102.0,-0.02,6.4,0.005,0.155,1.833333,0.105,0.157,58.0,65.0,0.022114,4.2,0.113,0.0105,0.005,0.344311,7.0,0.005,-0.022,0.000605,0.00464,36.666667,0.003,0.0545,0.021,0.185,0.16,0.131,0.00033,0.005,0.163,0.6,0.0008,0.021,5.238095,-0.995,0.107,0.0055,0.215,0.000138,0.51497,0.165,0.16,0.0055,-5.298317,0.024,0.025,0.089,0.0125,0.008107,0.1155,0.00048,1.0,0.1655,0.000525,0.021,0.005,1.0,0.003,0.003,0.000116,0.115,0.005,0.025,0.185,0.0265,0.026,0.005,0.024,6
3,2,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,2130.0,2.435419,1.502,0.539,0.1255,0.5,-0.0445,0.322763,0.0375,112.0,0.1,3.966667,0.25,0.345,1.827251,0.225,0.3895,379.0,102.0,0.035397,3.658,0.6805,0.6255,0.25,0.349879,4.450122,0.25,0.0555,0.178362,0.682618,2.311436,0.2055,2.14,0.9145,0.745,0.2975,1.3895,0.097612,0.125,0.8005,0.822,0.14875,0.45725,0.51941,-1.75,0.2695,0.18775,0.725,0.056325,0.38982,0.845,0.595,0.3755,-1.386294,1.12,0.3755,-0.4395,0.4405,0.440061,0.8505,0.122272,2.0,0.9705,0.137175,0.475,0.25,1.0,0.2055,0.2055,0.343395,0.725,0.25,0.25,0.575,1.29,1.1645,0.25,1.1375,10
4,1,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,34.0,2.116373,1.870886,0.4125,0.172,0.44441,-0.0375,0.835659,0.025675,86.0,0.0675,4.269231,0.1975,0.3575,2.309375,0.2275,0.395,382.0,588.0,0.038715,3.959494,0.585,0.567,0.1975,0.508903,4.8875,0.1975,0.03,0.157037,0.897435,2.65625,0.16,1.556,0.782,0.685,0.555,1.207,0.068,0.1975,0.715,0.810127,0.109613,0.782,0.543478,-0.8025,0.265,0.3695,0.74,0.048035,0.721447,0.7525,0.555,0.3695,-1.622017,0.942,0.3695,-0.357,0.395,0.908312,0.7945,0.0888,1.0,0.9245,0.10166,0.425,0.1975,1.0,0.16,0.16,0.288949,0.6225,0.1975,0.1975,0.585,1.1515,0.9795,0.1975,0.8175,9


In [8]:
train.Rings = np.log1p(train.Rings)

In [9]:
from autogluon.tabular import TabularDataset, TabularPredictor
from custom_metrics import rmsle_scorer

time_limit = 3600*16

automl = TabularPredictor(label='Rings', problem_type='regression',
                          eval_metric=rmsle_scorer
                          )

automl.fit(train, presets='medium_quality', time_limit=time_limit, num_bag_folds=8, num_bag_sets=0, num_stack_levels=2, dynamic_stacking=False, 
            included_model_types=['XGB', 'CAT', 'GBM', 'XT', 'RF'], ag_args_fit={'num_gpus': 1, 'num_cpus': 8},
          )

No path specified. Models will be saved in: "AutogluonModels\ag-20240420_171305"
Presets specified: ['medium_quality']
Beginning AutoGluon training ... Time limit = 57600s
AutoGluon will save models to "AutogluonModels\ag-20240420_171305"
AutoGluon Version:  1.0.1b20240321
Python Version:     3.10.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       6.49 GB / 15.42 GB (42.1%)
Disk Space Avail:   529.53 GB / 931.51 GB (56.8%)
Train Data Rows:    94792
Train Data Columns: 86
Label Column:       Rings
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    6639.47 MB
	Train Data (Original)  Memory Usage: 61.83 MB (0.9% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 G

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x14502277b80>

In [17]:
automl.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L4,-0.044251,mean_squared_log_error,32.427107,1885.998389,0.003976,6.068096,4,True,24
1,WeightedEnsemble_L3,-0.044276,mean_squared_log_error,26.423582,1618.608454,0.002997,3.186564,3,True,16
2,WeightedEnsemble_L2,-0.044288,mean_squared_log_error,12.777632,761.739867,0.003983,2.826576,2,True,8
3,CatBoost_BAG_L2,-0.044342,mean_squared_log_error,13.108655,881.966281,0.053005,52.602862,2,True,12
4,LightGBM_BAG_L2,-0.044347,mean_squared_log_error,13.187657,882.296598,0.132007,52.933179,2,True,10
5,CatBoost_BAG_L3,-0.044375,mean_squared_log_error,26.745604,1783.254404,0.051001,49.344402,3,True,20
6,LightGBMXT_BAG_L3,-0.044381,mean_squared_log_error,26.8976,1789.319443,0.202998,55.40944,3,True,17
7,LightGBMXT_BAG_L2,-0.044385,mean_squared_log_error,13.210649,883.253505,0.155,53.890086,2,True,9
8,LightGBM_BAG_L3,-0.044407,mean_squared_log_error,26.829609,1787.378722,0.135006,53.46872,3,True,18
9,LightGBMLarge_BAG_L1,-0.044407,mean_squared_log_error,0.315989,121.840561,0.315989,121.840561,1,True,7


In [11]:
predictions = automl.predict(test)

In [12]:
submission = predictions.reset_index().rename(columns={0: 'Rings'})

In [13]:
submission.Rings = np.expm1(submission.Rings)

In [14]:
submission.Rings = np.clip(submission.Rings, 1, 29)

In [15]:
submission.to_csv("submission.csv",index=False)

In [16]:
submission.Rings.mean()

9.58127