In [1]:
#!kaggle competitions download -c playground-series-s4e4

# Import libraries

In [2]:
import pandas as pd

# Import data

In [3]:
submission_df = pd.read_csv("data/sample_submission.csv")
test_df = pd.read_csv("data/test.csv")
train_df = pd.read_csv("data/train.csv")

# Data exploration

In [4]:
train_df.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [5]:
train_df.isna().sum()

id                0
Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Whole weight.1    0
Whole weight.2    0
Shell weight      0
Rings             0
dtype: int64

No missing data

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90615 entries, 0 to 90614
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              90615 non-null  int64  
 1   Sex             90615 non-null  object 
 2   Length          90615 non-null  float64
 3   Diameter        90615 non-null  float64
 4   Height          90615 non-null  float64
 5   Whole weight    90615 non-null  float64
 6   Whole weight.1  90615 non-null  float64
 7   Whole weight.2  90615 non-null  float64
 8   Shell weight    90615 non-null  float64
 9   Rings           90615 non-null  int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 6.9+ MB


Only "Sex" column is object type so we'll need to encode it.

# Column encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder

OHE = OneHotEncoder(sparse_output=False)
new_cols = pd.DataFrame(OHE.fit_transform(train_df[["Sex"]]), columns=OHE.get_feature_names_out())
new_cols.index = train_df.index
train = pd.concat([new_cols, train_df.drop(["Sex"], axis=1, inplace=False)], axis=1)

# Model

In [26]:
from sklearn.model_selection import KFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

In [52]:
X = train.drop(["id", "Rings", "Sex_M"], axis=1)
y = train["Rings"]

In [10]:
skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)

In [57]:
lgbm_model = LGBMRegressor(force_col_wise=True)
score = cross_val_score(lgbm_model, X, y, scoring="neg_root_mean_squared_log_error", cv=skf)
lgbm_model.fit(X, y)
print(-score.mean())

[LightGBM] [Info] Total Bins 1332
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 9
[LightGBM] [Info] Start training from score 9.697001
[LightGBM] [Info] Total Bins 1330
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 9
[LightGBM] [Info] Start training from score 9.697084
[LightGBM] [Info] Total Bins 1330
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 9
[LightGBM] [Info] Start training from score 9.696698
[LightGBM] [Info] Total Bins 1330
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 9
[LightGBM] [Info] Start training from score 9.696601
[LightGBM] [Info] Total Bins 1331
[LightGBM] [Info] Number of data points in the train set: 72492, number of used features: 9
[LightGBM] [Info] Start training from score 9.696587
[LightGBM] [Info] Total Bins 1329
[LightGBM] [Info] Number of data points in the train set: 72492, number

In [25]:
xgb_model = XGBRegressor()
score = cross_val_score(xgb_model, X, y, scoring="neg_root_mean_squared_log_error", cv=skf)
print(-score.mean())

0.15119600774352782


In [29]:
rf_model = RandomForestRegressor(n_estimators=20)
score = cross_val_score(rf_model, X, y, scoring="neg_root_mean_squared_log_error", cv=skf)
print(-score.mean())

0.15756327572947174


In [34]:
from sklearn.inspection import permutation_importance

lgbm_model.fit(X, y)
result = permutation_importance(lgbm_model, X, y, scoring="neg_root_mean_squared_log_error", n_repeats=10, random_state=0)

[LightGBM] [Info] Total Bins 1337
[LightGBM] [Info] Number of data points in the train set: 90615, number of used features: 10
[LightGBM] [Info] Start training from score 9.696794


In [35]:
df = pd.DataFrame([list(X.columns), list(result.importances_mean)])
df = df.T
df.columns = df.iloc[0]
df = df[1:]
df = df.sort_values(by=df.columns[1], ascending=False)
print(df)

0           Sex_F 0.00024410552367450656
9    Shell weight               0.191138
7  Whole weight.1               0.115089
6    Whole weight               0.053539
5          Height               0.035364
8  Whole weight.2               0.008865
3          Length               0.007151
1           Sex_I               0.006074
4        Diameter               0.004963
2           Sex_M               0.000029


# Submission

In [54]:
new_cols = pd.DataFrame(OHE.transform(test_df[["Sex"]]), columns=OHE.get_feature_names_out())
new_cols.index = test_df.index
test = pd.concat([new_cols, test_df.drop(["Sex"], axis=1, inplace=False)], axis=1)
display(test)

Unnamed: 0,Sex_F,Sex_I,Sex_M,id,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight
0,0.0,0.0,1.0,90615,0.645,0.475,0.155,1.2380,0.6185,0.3125,0.3005
1,0.0,0.0,1.0,90616,0.580,0.460,0.160,0.9830,0.4785,0.2195,0.2750
2,0.0,0.0,1.0,90617,0.560,0.420,0.140,0.8395,0.3525,0.1845,0.2405
3,0.0,0.0,1.0,90618,0.570,0.490,0.145,0.8740,0.3525,0.1865,0.2350
4,0.0,1.0,0.0,90619,0.415,0.325,0.110,0.3580,0.1575,0.0670,0.1050
...,...,...,...,...,...,...,...,...,...,...,...
60406,0.0,1.0,0.0,151021,0.345,0.260,0.085,0.1775,0.0735,0.0265,0.0500
60407,1.0,0.0,0.0,151022,0.525,0.410,0.145,0.8445,0.3885,0.1670,0.2050
60408,0.0,1.0,0.0,151023,0.590,0.440,0.155,1.1220,0.3930,0.2000,0.2650
60409,1.0,0.0,0.0,151024,0.660,0.525,0.190,1.4935,0.5885,0.3575,0.4350


In [58]:
result = lgbm_model.predict(test.drop(["id", "Sex_M"], axis=1))
result_df = pd.DataFrame(result, columns=["Rings"])
submission = pd.concat([test["id"], result_df], axis=1)
display(submission)
submission.to_csv("submission/submission.csv", index=False)

Unnamed: 0,id,Rings
0,90615,9.731144
1,90616,9.669113
2,90617,10.247894
3,90618,10.389184
4,90619,7.571981
...,...,...
60406,151021,6.380426
60407,151022,9.232546
60408,151023,11.889941
60409,151024,13.525413


In [59]:
#!kaggle competitions submit -c playground-series-s4e4 -f submission/submission.csv -m "First submission"

Successfully submitted to Regression with an Abalone Dataset



  0%|          | 0.00/1.50M [00:00<?, ?B/s]
  1%|          | 8.00k/1.50M [00:00<01:03, 24.8kB/s]
 28%|██▊       | 432k/1.50M [00:00<00:00, 1.31MB/s] 
 70%|███████   | 1.05M/1.50M [00:00<00:00, 2.84MB/s]
100%|██████████| 1.50M/1.50M [00:01<00:00, 972kB/s] 
