# Proyecto 1

**Dataset Description**

The dataset for this competition (both train and test) was generated from a deep learning model trained on the Flood Prediction Factors dataset. Feature distributions are close to, but not exactly the same, as the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.

Note: This dataset is particularly well suited for visualizations, clustering, and general EDA. Show off your skills!

Files

- train.csv - the training dataset; FloodProbability is the target
- test.csv - the test dataset; your objective is to predict the FloodProbability for each row
- sample_submission.csv - a sample submission file in the correct format

Evaluation

Submissions are evaluated using the R2 score.

## Exploratory Data Analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
# modelos baseline

# métodos basados en árboles
import xgboost as xgb

# metricas
from sklearn.metrics import r2_score

In [2]:
def load_data():
    train = pd.read_csv("data/train.csv")
    train.id = train['id'].astype(str)
    test = pd.read_csv("data/test.csv")
    test.id = test['id'].astype(str)
    return train, test

In [3]:
train, test = load_data()

In [4]:
initial_cols = [col for col in train.columns if col not in ['id', 'FloodProbability']]

In [5]:
train.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [6]:
for col in initial_cols:
    train[col] = train[col].astype(float)
for col in test.columns:
    test[col] = test[col].astype(float)

In [7]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2 , interaction_only=False, include_bias=False)

train_poly = pd.DataFrame(poly.fit_transform(train[initial_cols]))
train_poly.columns = poly.get_feature_names_out()

test_poly = pd.DataFrame(poly.transform(test[initial_cols]))
test_poly.columns = poly.get_feature_names_out()

In [8]:
train_poly.head()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,PopulationScore^2,PopulationScore WetlandLoss,PopulationScore InadequatePlanning,PopulationScore PoliticalFactors,WetlandLoss^2,WetlandLoss InadequatePlanning,WetlandLoss PoliticalFactors,InadequatePlanning^2,InadequatePlanning PoliticalFactors,PoliticalFactors^2
0,5.0,8.0,5.0,8.0,6.0,4.0,4.0,3.0,3.0,4.0,...,49.0,35.0,49.0,21.0,25.0,35.0,15.0,49.0,21.0,9.0
1,6.0,7.0,4.0,4.0,8.0,8.0,3.0,5.0,4.0,6.0,...,9.0,9.0,12.0,9.0,9.0,12.0,9.0,16.0,12.0,9.0
2,6.0,5.0,6.0,7.0,3.0,7.0,1.0,5.0,4.0,5.0,...,64.0,16.0,24.0,24.0,4.0,6.0,6.0,9.0,9.0,9.0
3,3.0,4.0,6.0,5.0,4.0,8.0,4.0,7.0,6.0,8.0,...,36.0,30.0,42.0,30.0,25.0,35.0,25.0,49.0,35.0,25.0
4,5.0,3.0,2.0,6.0,4.0,4.0,3.0,3.0,3.0,3.0,...,1.0,2.0,3.0,5.0,4.0,6.0,10.0,9.0,15.0,25.0


In [126]:
# pca poly features
from sklearn.decomposition import PCA

pca = PCA(n_components=8, random_state=42)

pca_train = pd.DataFrame(pca.fit_transform(train_poly))
pca_train.columns = [f'pca_{i}' for i in range(pca.n_components_)]
pca_test = pd.DataFrame(pca.transform(test_poly))
pca_test.columns = [f'pca_{i}' for i in range(pca.n_components_)]
train = pd.concat([train, pca_train], axis=1)
test = pd.concat([test, pca_test], axis=1)

# porcentaje de varianza explicada
print(pca.explained_variance_ratio_.sum())

0.39159095356802054


In [127]:
train.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,PoliticalFactors,FloodProbability,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7
0,0,5.0,8.0,5.0,8.0,6.0,4.0,4.0,3.0,3.0,...,3.0,0.445,-33.04435,-83.956641,-30.96796,6.896072,-20.711519,13.254741,38.100631,47.224782
1,1,6.0,7.0,4.0,4.0,8.0,8.0,3.0,5.0,4.0,...,3.0,0.45,-35.249947,-61.084405,-41.508118,8.422374,-18.154838,54.321937,-6.105195,29.289434
2,2,6.0,5.0,6.0,7.0,3.0,7.0,1.0,5.0,4.0,...,3.0,0.53,-0.211053,-19.220556,-15.952179,-9.791212,-6.511861,-46.871394,-93.977399,-3.053915
3,3,3.0,4.0,6.0,5.0,4.0,8.0,4.0,7.0,6.0,...,5.0,0.535,34.433909,18.009123,-44.781225,-26.676838,-26.930496,-45.763325,-23.299845,12.118694
4,4,5.0,3.0,2.0,6.0,4.0,4.0,3.0,3.0,3.0,...,5.0,0.415,-158.397635,18.052393,-17.616177,23.255651,28.844234,27.005023,-15.505706,-27.462001


### Cluster data

In [128]:
vals_in_common = [x for x in train.columns.intersection(test.columns) if x not in ['id', 'FloodProbability']]

In [129]:
from sklearn.cluster import KMeans

In [130]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train[vals_in_common].copy())
train_clust = scaler.transform(train[vals_in_common].copy())

In [131]:
cluster = KMeans(n_clusters=5, random_state=42)
cluster.fit(train_clust)

In [132]:
train['cluster'] = cluster.predict(train_clust).astype(str)
test['cluster'] = cluster.predict(scaler.transform(test[vals_in_common].copy())).astype(str)

In [133]:
train = pd.get_dummies(train, columns=['cluster'])
test = pd.get_dummies(test, columns=['cluster'])

In [134]:
# add all values prod 2 and exp
# for i in range(len(vals_in_common)):
#    train[f'{vals_in_common[i]}_{vals_in_common[i]}'] = train[vals_in_common[i]] * train[vals_in_common[i]]
#    test[f'{vals_in_common[i]}_{vals_in_common[i]}'] = test[vals_in_common[i]] * test[vals_in_common[i]]

In [135]:
train['sum_all'] = train[initial_cols].sum(axis=1)
test['sum_all'] = test[initial_cols].sum(axis=1)

train["over_sum"] = train["sum_all"].isin(np.arange(100,110))
test["over_sum"] = test["sum_all"].isin(np.arange(100,110))
#new              
train["under_sum"] = train["sum_all"].isin(np.arange(72,76))
test["under_sum"] = test["sum_all"].isin(np.arange(72,76))#special1 

train['mean_all'] = train[initial_cols].mean(axis=1)
test['mean_all'] = test[initial_cols].mean(axis=1)

train['std_all'] = train[initial_cols].std(axis=1)
test['std_all'] = test[initial_cols].std(axis=1)

train['kur_all'] = train[initial_cols].kurtosis(axis=1)
test['kur_all'] = test[initial_cols].kurtosis(axis=1)

train['skew_all'] = train[initial_cols].skew(axis=1)
test['skew_all'] = test[initial_cols].skew(axis=1)

train['median_all'] = train[initial_cols].median(axis=1)
test['median_all'] = test[initial_cols].median(axis=1)

train['max_all'] = train[initial_cols].max(axis=1)
test['max_all'] = test[initial_cols].max(axis=1)

train['min_all'] = train[initial_cols].min(axis=1)
test['min_all'] = test[initial_cols].min(axis=1)

train['range_all'] = train[initial_cols].max(axis=1) - train[initial_cols].min(axis=1) 
test['range_all'] = test[initial_cols].max(axis=1) - test[initial_cols].min(axis=1) 

train['exp2_sum_all'] = train['sum_all'] ** 2
test['exp2_sum_all'] = test['sum_all'] ** 2

train['log2_sum_all'] = np.log2(train['sum_all']+1)
test['log2_sum_all'] = np.log2(test['sum_all']+1)

train['exp3_sum_all'] = train['sum_all'] ** 3
test['exp3_sum_all'] = test['sum_all'] ** 3

train['sqrt_sum_all'] = train['sum_all'] ** (1/2)
test['sqrt_sum_all'] = test['sum_all'] ** (1/2)

train['cbrt_sum_all'] = train['sum_all'] ** (1/3)
test['cbrt_sum_all'] = test['sum_all'] ** (1/3)

train['prod_all_over_sum_all'] = train[initial_cols].prod(axis=1) / train['sum_all']
test['prod_all_over_sum_all'] = test[initial_cols].prod(axis=1) / test['sum_all']

train['prod_all_over_exp2_sum_all'] = train[initial_cols].prod(axis=1) / train['exp2_sum_all']
test['prod_all_over_exp2_sum_all'] = test[initial_cols].prod(axis=1) / test['exp2_sum_all']

train['prod_all_over_log2_sum_all'] = train[initial_cols].prod(axis=1) / train['log2_sum_all']
test['prod_all_over_log2_sum_all'] = test[initial_cols].prod(axis=1) / test['log2_sum_all']

In [136]:
for col in initial_cols:
    train[col] = np.log1p(train[col]+1)
    test[col] = np.log1p(test[col]+1)

In [137]:
train.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,min_all,range_all,exp2_sum_all,log2_sum_all,exp3_sum_all,sqrt_sum_all,cbrt_sum_all,prod_all_over_sum_all,prod_all_over_exp2_sum_all,prod_all_over_log2_sum_all
0,0,1.94591,2.302585,1.94591,2.302585,2.079442,1.791759,1.791759,1.609438,1.609438,...,2.0,6.0,8836.0,6.569856,830584.0,9.69536,4.546836,77826180000.0,827938100.0,1113519000000.0
1,1,2.079442,2.197225,1.791759,1.791759,2.302585,2.302585,1.609438,1.94591,1.791759,...,0.0,9.0,8836.0,6.569856,830584.0,9.69536,4.546836,0.0,0.0,0.0
2,2,2.079442,1.94591,2.079442,2.197225,1.609438,2.197225,1.098612,1.94591,1.791759,...,1.0,7.0,9801.0,6.643856,970299.0,9.949874,4.626065,101837300000.0,1028660000.0,1517476000000.0
3,3,1.609438,1.791759,2.079442,1.94591,1.791759,2.302585,1.791759,2.197225,2.079442,...,2.0,6.0,10816.0,6.714246,1124864.0,10.198039,4.702669,700302600000.0,6733679000.0,10847300000000.0
4,4,1.94591,1.609438,1.386294,2.079442,1.791759,1.791759,1.609438,1.609438,1.609438,...,1.0,5.0,5184.0,6.189825,373248.0,8.485281,4.160168,279936000.0,3888000.0,3256214000.0


In [138]:
cols_numeric = [x for x in train.columns if x not in ['id', 'FloodProbability', 'over_sum', 'under_sum']]

In [139]:
# Escañar tpdp excepto id y FloodProbability y transformar luego cluser a 0-1
scaler = StandardScaler()
scaler.fit(train[cols_numeric])
train[cols_numeric] = scaler.transform(train[cols_numeric])
test[cols_numeric] = scaler.transform(test[cols_numeric])

In [140]:
# tranform clust in 0-1
for col in train.columns:
    if train[col].dtype == 'bool':
        train[col] = train[col].astype(int)
        test[col] = test[col].astype(int)

### Partición del train en train, test y validation

In [141]:
X = train.drop(columns=['id','FloodProbability'])
y = train.FloodProbability
print(X.shape, y.shape)

(1117957, 52) (1117957,)


In [142]:
train_, val_, y_train_, y_val_ = train_test_split(X, y, test_size=0.2, random_state=42)
val_, test_, y_val_, y_test_ = train_test_split(val_, y_val_, test_size=0.5, random_state=42)

In [144]:
print(f"Dimensiones de train: {train_.shape}{y_train_.shape}\nDimensiones de validation: {val_.shape}{y_val_.shape}\nDimensiones de test: {test_.shape}{y_test_.shape}")

Dimensiones de train: (894365, 52)(894365,)
Dimensiones de validation: (111796, 52)(111796,)
Dimensiones de test: (111796, 52)(111796,)


### Entrenamiento de baseline

#### Linear Regression

In [145]:
# Generamos una regresión lineal
from sklearn.linear_model import LinearRegression

In [146]:
lr = LinearRegression()
lr.fit(train_, y_train_)

In [147]:
y_val_pred = lr.predict(val_)
y_test_pred = lr.predict(test_)

In [148]:
metricas = pd.DataFrame({'Modelo': ['Linear Regression'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})

In [149]:
metricas

Unnamed: 0,Modelo,r2_val,r2_test
0,Linear Regression,0.853215,0.855201


In [150]:
pred_test = lr.predict(test.drop(columns='id'))
submissions = pd.DataFrame({'id': test.id, 'FloodProbability': pred_test}).to_csv('data/submissions/submissions_lr.csv', index=False)

In [151]:
# os.system('kaggle competitions submit -c playground-series-s4e5 -f data/submissions/submissions_lr.csv -m "Linear Regression"')

### Based on Trees

#### xgboost

In [152]:
# usar con evallist
cols_to_model = [x for x in train_.columns if x not in ['id']]
xgtrain = xgb.DMatrix(train_[cols_to_model], label=y_train_)
xgval = xgb.DMatrix(val_[cols_to_model], label=y_val_)
xgtest = xgb.DMatrix(test_[cols_to_model])
xgTEST = xgb.DMatrix(test[cols_to_model])

In [158]:
params = { 'learning_rate': 0.04, 'booster': 'dart'}

In [159]:
xgb_model = xgb.train( params, dtrain=xgtrain, num_boost_round=1000, evals=[(xgval, 'eval')], early_stopping_rounds=10, verbose_eval=10)

[0]	eval-rmse:0.04916
[10]	eval-rmse:0.03553
[20]	eval-rmse:0.02740
[30]	eval-rmse:0.02291
[40]	eval-rmse:0.02060
[50]	eval-rmse:0.01949
[60]	eval-rmse:0.01897
[70]	eval-rmse:0.01873
[80]	eval-rmse:0.01862
[90]	eval-rmse:0.01857
[100]	eval-rmse:0.01854
[110]	eval-rmse:0.01853
[120]	eval-rmse:0.01852
[130]	eval-rmse:0.01852
[140]	eval-rmse:0.01851
[150]	eval-rmse:0.01851
[160]	eval-rmse:0.01851
[170]	eval-rmse:0.01851
[180]	eval-rmse:0.01851
[190]	eval-rmse:0.01851
[200]	eval-rmse:0.01851
[210]	eval-rmse:0.01851
[220]	eval-rmse:0.01851
[230]	eval-rmse:0.01851
[240]	eval-rmse:0.01850
[250]	eval-rmse:0.01850
[260]	eval-rmse:0.01850
[269]	eval-rmse:0.01850


In [160]:
y_val_pred = xgb_model.predict(xgval)
y_test_pred = xgb_model.predict(xgtest)

In [161]:
metricas_tmp = pd.DataFrame({'Modelo': ['XGBoost'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})
metricas = pd.concat([metricas, metricas_tmp])
metricas

Unnamed: 0,Modelo,r2_val,r2_test
0,Linear Regression,0.853215,0.855201
0,XGBoost,0.867968,0.869283
0,XGBoost,0.867941,0.869284


In [157]:
pred_test = xgb_model.predict(xgTEST)
submissions = pd.DataFrame({'id': test.id, 'FloodProbability': pred_test}).to_csv('data/submissions/submissions_xgb.csv', index=False)

In [None]:
os.system('kaggle competitions submit -c playground-series-s4e5 -f data/submissions/submissions_xgb.csv -m "XGBoost"')

#### XgBoost with hyperparameter tuning

In [None]:
import optuna

In [None]:
def objective(trial):
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear"]),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1, log=True),
        "max_depth": trial.suggest_int("max_depth", 6, 14),
        "n_estimators": 1000,
        "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
        "seed": 42,
        "n_jobs": -1,
        "verbosity": 1
    }

    xgb_model = xgb.XGBRegressor(**params, early_stopping_rounds=2)
    xgb_model.fit(train_, y_train_, eval_set=[(val_, y_val_)], verbose=50)
    predictions = xgb_model.predict(test_)
    r2 = r2_score(y_test_, predictions)
    return r2

In [None]:
study = optuna.create_study(direction='maximize')

In [37]:
study.optimize(objective, n_trials=40)

NameError: name 'study' is not defined

In [None]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

In [38]:
params = study.best_params
xgb_model = xgb.XGBRegressor(**params)

NameError: name 'study' is not defined

In [39]:
xgb_model.fit(train_, y_train_, eval_set=[(val_, y_val_)])

NameError: name 'xgb_model' is not defined

In [40]:
y_val_pred = xgb_model.predict(val_)
y_test_pred = xgb_model.predict(test_)

NameError: name 'xgb_model' is not defined

In [41]:
tmp_metricas = pd.DataFrame({'Modelo': ['XGBoost GridSearch'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})
metricas = pd.concat([metricas, tmp_metricas])
metricas

NameError: name 'pd' is not defined

In [42]:
pred_test = xgb_model.predict(test.drop(columns='id'))
submissions = pd.DataFrame({'id': test.id, 'FloodProbability': pred_test}).to_csv('data/submissions/submissions_xgb_grid.csv', index=False)

NameError: name 'xgb_model' is not defined

In [43]:
os.system('kaggle competitions submit -c playground-series-s4e5 -f data/submissions/submissions_xgb_grid.csv -m "XGBoost GridSearch"')

NameError: name 'os' is not defined

#### LightGBM

In [44]:
import lightgbm as lgb

In [45]:
lgb_train = lgb.Dataset(train_, y_train_)
lgb_val = lgb.Dataset(val_, y_val_)
lgb_test = lgb.Dataset(test_, y_test_)

NameError: name 'train_' is not defined

In [46]:
params = {'objective': 'regression', 'metric': 'r2', 'learning_rate': 0.05, 'booster': 'dart', 'seed': 42, 'boosting_type': 'gbdt', 'n_jobs': -1, 'verbosity': 1, 'n_estimators': 1000}
lgb_model = lgb.train(params, lgb_train, num_boost_round=1000, valid_sets=[lgb_val])

NameError: name 'lgb_train' is not defined

In [47]:
y_val_pred = lgb_model.predict(val_)
y_test_pred = lgb_model.predict(test_)

NameError: name 'lgb_model' is not defined

In [48]:
tmp_metricas = pd.DataFrame({'Modelo': ['LightGBM'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})

NameError: name 'pd' is not defined

In [49]:
metricas = pd.concat([metricas, tmp_metricas])
metricas

NameError: name 'pd' is not defined

In [50]:
pred_test = lgb_model.predict(test.drop(columns='id'))
submissions = pd.DataFrame({'id': test.id, 'FloodProbability': pred_test}).to_csv('data/submissions/submissions_lgb.csv', index=False)

NameError: name 'lgb_model' is not defined

In [51]:
os.system('kaggle competitions submit -c playground-series-s4e5 -f data/submissions/submissions_lgb.csv -m "LightGBM"')

NameError: name 'os' is not defined

#### LightGBM with hyperparameter tuning

In [52]:
import optuna

In [53]:
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear"]),
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 256),
        "max_depth": trial.suggest_int("max_depth", 6, 18),
        "n_estimators": 1000,
        "seed": 42,
        "n_jobs": -1,
        "verbosity": 1
    }

    lgb_model = lgb.LGBMRegressor(**params)
    lgb_model.fit(train_, y_train_, eval_set=[(val_, y_val_)])
    predictions = lgb_model.predict(test_)
    r2 = r2_score(y_test_, predictions)
    return r2

In [54]:
study2 = optuna.create_study(direction='maximize')
study2.optimize(objective, n_trials=40)

[I 2024-05-28 07:48:33,219] A new study created in memory with name: no-name-fb03d722-2018-44e5-8e50-f417caa3c504
[W 2024-05-28 07:48:33,221] Trial 0 failed with parameters: {'booster': 'gblinear', 'learning_rate': 0.007538837502883052, 'num_leaves': 60, 'max_depth': 7} because of the following error: NameError("name 'train_' is not defined").
Traceback (most recent call last):
  File "/home/chidalgo/git/DS_Course/venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_20991/3399334388.py", line 17, in objective
    lgb_model.fit(train_, y_train_, eval_set=[(val_, y_val_)])
NameError: name 'train_' is not defined
[W 2024-05-28 07:48:33,224] Trial 0 failed with value None.


NameError: name 'train_' is not defined

In [None]:
print('Best hyperparameters:', study2.best_params)
print('Best RMSE:', study2.best_value)

In [55]:
params = study2.best_params

ValueError: No trials are completed yet.

In [56]:
lgb_model = lgb.LGBMRegressor(**params)

In [57]:
lgb_model.fit(train_, y_train_, eval_set=[(val_, y_val_)])

NameError: name 'train_' is not defined

In [58]:
y_val_pred = lgb_model.predict(val_)
y_test_pred = lgb_model.predict(test_)

NameError: name 'val_' is not defined

In [59]:
tmp_metricas = pd.DataFrame({'Modelo': ['LightGBM GridSearch'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})
metricas = pd.concat([metricas, tmp_metricas])
metricas

NameError: name 'pd' is not defined

In [60]:
pred_test = lgb_model.predict(test.drop(columns='id'))
submissions = pd.DataFrame({'id': test.id, 'FloodProbability': pred_test}).to_csv('data/submissions/submissions_lgb_grid.csv', index=False)

NameError: name 'test' is not defined

In [61]:
os.system('kaggle competitions submit -c playground-series-s4e5 -f data/submissions/submissions_lgb_grid.csv -m "LightGBM GridSearch"')

NameError: name 'os' is not defined

#### catboost

In [62]:
import catboost as cb

In [63]:
cb_model = cb.CatBoostRegressor(iterations=800, learning_rate=0.1, depth=7, loss_function='RMSE', verbose=10, random_state=42, 
                                eval_metric='R2', devices='0:1', early_stopping_rounds=10)

In [64]:
cb_model.fit(train_, y_train_, eval_set=(val_, y_val_), verbose=10)

NameError: name 'train_' is not defined

In [65]:
y_val_pred = cb_model.predict(val_)
y_test_pred = cb_model.predict(test_)

NameError: name 'val_' is not defined

In [66]:
tmp_metricas = pd.DataFrame({'Modelo': ['CatBoost'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})
tmp_metricas
metricas = pd.concat([metricas, tmp_metricas])
metricas

NameError: name 'pd' is not defined

In [67]:
pred_test = cb_model.predict(test)
submissions = pd.DataFrame({'id': test.id, 'FloodProbability': pred_test}).to_csv('data/submissions/submissions_cb.csv', index=False)

NameError: name 'test' is not defined

In [68]:
#os.system('kaggle competitions submit -c playground-series-s4e5 -f data/submissions/submissions_cb.csv -m "CatBoost"')

#### CatBoost with hyperparameter tuning

In [69]:
import optuna
from sklearn.metrics import mean_squared_error

In [70]:
def objective(trial):
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1, log=True),
        "depth": trial.suggest_int("depth", 6, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 1),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel",0.7, 0.9),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 50),
    }

    model = cb.CatBoostRegressor(**params, silent=True)
    model.fit(train_, y_train_, eval_set=(val_, y_val_), verbose=10)
    predictions = model.predict(test_)
    rmse = mean_squared_error(y_test_, predictions, squared=False)
    return rmse

In [71]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

[I 2024-05-28 07:48:35,199] A new study created in memory with name: no-name-039df4bb-e31a-4325-a641-31ac4f52152d
[W 2024-05-28 07:48:35,201] Trial 0 failed with parameters: {'learning_rate': 0.03372200446289337, 'depth': 11, 'subsample': 0.9742289558870686, 'colsample_bylevel': 0.7151225845084223, 'min_data_in_leaf': 5} because of the following error: NameError("name 'train_' is not defined").
Traceback (most recent call last):
  File "/home/chidalgo/git/DS_Course/venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_20991/4289174437.py", line 12, in objective
    model.fit(train_, y_train_, eval_set=(val_, y_val_), verbose=10)
NameError: name 'train_' is not defined
[W 2024-05-28 07:48:35,204] Trial 0 failed with value None.


NameError: name 'train_' is not defined

In [72]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

ValueError: No trials are completed yet.

In [73]:
cb_model = cb.CatBoostRegressor(iterations=1000, learning_rate=study.best_params['learning_rate'], depth=study.best_params['depth'],
                                subsample=study.best_params['subsample'], colsample_bylevel=study.best_params['colsample_bylevel'],
                                min_data_in_leaf=study.best_params['min_data_in_leaf'], loss_function='RMSE', verbose=10, random_state=42, 
                                eval_metric='RMSE' , devices='0:1', early_stopping_rounds=10)

ValueError: No trials are completed yet.

In [74]:
cb_model.fit(train_, y_train_, eval_set=(val_, y_val_), verbose=10)

NameError: name 'train_' is not defined

In [75]:
y_val_pred = cb_model.predict(val_)
y_test_pred = cb_model.predict(test_)

NameError: name 'val_' is not defined

In [76]:
tmp_metricas = pd.DataFrame({'Modelo': ['CatBoost GridSearch'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})
metricas = pd.concat([metricas, tmp_metricas])
metricas

NameError: name 'pd' is not defined

In [77]:
pred_test = cb_model.predict(test)
submissions = pd.DataFrame({'id': test.id, 'FloodProbability': pred_test}).to_csv('data/submissions/submissions_cb_grid.csv', index=False)

NameError: name 'test' is not defined

In [78]:
os.system('kaggle competitions submit -c playground-series-s4e5 -f data/submissions/submissions_cb_grid.csv -m "CatBoost GridSearch"')

NameError: name 'os' is not defined

### Adaboost

In [79]:
from sklearn.ensemble import AdaBoostRegressor

In [80]:
ada = AdaBoostRegressor(n_estimators=100, learning_rate=0.05)

In [81]:
ada.fit(train_, y_train_)

NameError: name 'train_' is not defined

In [82]:
y_val_pred = ada.predict(val_)
y_test_pred = ada.predict(test_)

NameError: name 'val_' is not defined

In [83]:
tmp_metricas = pd.DataFrame({'Modelo': ['AdaBoost'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})
metricas = pd.concat([metricas, tmp_metricas])
metricas

NameError: name 'pd' is not defined

#### Random Forest

In [84]:
from sklearn.ensemble import RandomForestRegressor

In [85]:
rf = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42)
rf.fit(train_, y_train_)

NameError: name 'train_' is not defined

### Modelos lineales

In [86]:
from sklearn.linear_model import Ridge, Lasso

In [87]:
ridge = Ridge()

In [88]:
ridge.fit(train_, y_train_)

NameError: name 'train_' is not defined

In [89]:
y_val_pred = ridge.predict(val_)

NameError: name 'val_' is not defined

In [90]:
y_test_pred = ridge.predict(test_)

NameError: name 'test_' is not defined

In [91]:
metricas_tmp = pd.DataFrame({'Modelo': ['Ridge'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})

NameError: name 'pd' is not defined

In [92]:
metricas = pd.concat([metricas, metricas_tmp])

NameError: name 'pd' is not defined

In [93]:
metricas

NameError: name 'metricas' is not defined

In [94]:
lasso = Lasso()

In [95]:
lasso.fit(train_, y_train_)

NameError: name 'train_' is not defined

In [96]:
y_val_pred = lasso.predict(val_)

NameError: name 'val_' is not defined

In [97]:
y_test_pred = lasso.predict(test_)

NameError: name 'test_' is not defined

In [98]:
metricas_tmp = pd.DataFrame({'Modelo': ['Lasso'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})

NameError: name 'pd' is not defined

In [99]:
metricas = pd.concat([metricas, metricas_tmp])

NameError: name 'pd' is not defined

In [100]:
metricas

NameError: name 'metricas' is not defined

In [101]:
pred_test_ridge = ridge.predict(test)

NameError: name 'test' is not defined

In [102]:
submissions = pd.DataFrame({'id': test.id, 'FloodProbability': pred_test_ridge}).to_csv('data/submissions/submissions_ridge.csv', index=False)

NameError: name 'pd' is not defined

In [103]:
# Elastic Net
from sklearn.linear_model import ElasticNet

In [104]:
en = ElasticNet()

In [105]:
en.fit(train_, y_train_)

NameError: name 'train_' is not defined

In [106]:
y_val_pred = en.predict(val_)

NameError: name 'val_' is not defined

In [107]:
y_test_pred = en.predict(test_)

NameError: name 'test_' is not defined

In [108]:
metricas_tmp = pd.DataFrame({'Modelo': ['Elastic Net'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})

NameError: name 'pd' is not defined

In [109]:
metricas = pd.concat([metricas, metricas_tmp])

NameError: name 'pd' is not defined

In [110]:
metricas

NameError: name 'metricas' is not defined

In [111]:
pred_test_en = en.predict(test)

NameError: name 'test' is not defined

In [112]:
submissions = pd.DataFrame({'id': test.id, 'FloodProbability': pred_test_en}).to_csv('data/submissions/submissions_en.csv', index=False)

NameError: name 'pd' is not defined

In [113]:
os.system('kaggle competitions submit -c playground-series-s4e5 -f data/submissions/submissions_en.csv -m "Elastic Net"')

NameError: name 'os' is not defined

### Redes Neuronales

In [114]:
# Redes neuronales
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


In [115]:
model = Sequential()
model.add(Dense(64, input_dim=train_.shape[1], activation='elu'))
model.add(Dense(32, activation='elu'))
model.add(Dense(16, activation='elu'))
model.add(Dense(8, activation='elu'))
model.add(Dense(4, activation='elu'))
model.add(Dense(3, activation='elu'))
model.add(Dense(2, activation='elu'))
model.add(Dense(1, activation='linear'))

NameError: name 'train_' is not defined

In [116]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])

In [117]:
model.fit(train_, y_train_, epochs=10, batch_size=32, validation_data=(val_, y_val_))

NameError: name 'train_' is not defined

In [118]:
y_val_pred = model.predict(val_)
y_test_pred = model.predict(test_)

NameError: name 'val_' is not defined

In [119]:
metricas_tmp = pd.DataFrame({'Modelo': ['Redes Neuronales'], 'r2_val': [r2_score(y_val_, y_val_pred)], 'r2_test': [r2_score(y_test_, y_test_pred)]})
metricas = pd.concat([metricas, metricas_tmp])
metricas

NameError: name 'pd' is not defined