In [1]:
RANDOM_STATE = 42
na_filling = "imputer"
scaling = False
overSample = True

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

import random
import numpy as np
import pandas as pd


from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [3]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./test.csv")

columns = list(train_df.columns)

features = [
    'Salinity_today', 
    'Temperature_today', 
    'Substrate', 
    'Depth', 
    'Exposure', 

    'Temperature_today_exp', 
    'Depth_log', 
    'Exposure_log', 
    'Salin_div_depth',
    'Temp_div_depth',
    
]

categoricals = [
    'Substrate',
]

numerical_features = [f for f in features if f not in categoricals]

target = 'Presence'


# init
for f in features:
    if f not in train_df:
        train_df[f] = 0.0
        test_df[f]  = 0.0

# Fill Na

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


if na_filling == "imputer":
    imputer = IterativeImputer(max_iter = 10, random_state = RANDOM_STATE)
    imputer.fit(train_df[features])
    train_df[features] = pd.DataFrame(imputer.transform(train_df[features]), columns = features)
    test_df[features] = pd.DataFrame(imputer.transform(test_df[features]), columns = features)

else:
    train_df[numerical_features] = train_df[numerical_features].fillna(train_df[numerical_features].median())
    test_df[numerical_features] = test_df[numerical_features].fillna(test_df[numerical_features].median())

    train_df[categoricals] = train_df[categoricals].fillna(train_df[categoricals].mode().iloc[0])
    test_df[categoricals] = test_df[categoricals].fillna(test_df[categoricals].mode().iloc[0])

# More features

In [5]:
train_df['Exposure_log'] = np.log(train_df['Exposure'])
test_df['Exposure_log']  = np.log(test_df['Exposure'])

train_df['Depth_log'] = np.log(np.abs(train_df['Depth']))
test_df['Depth_log'] = np.log(np.abs(test_df['Depth']))

train_df['Temperature_today_exp'] = np.exp(train_df['Temperature_today'])
test_df['Temperature_today_exp'] = np.exp(test_df['Temperature_today'])

train_df['Temp_div_depth'] = train_df['Temperature_today'] / train_df['Depth']
test_df['Temp_div_depth'] = test_df['Temperature_today']   / test_df['Depth']

train_df['Salin_div_depth'] = train_df['Salinity_today'] / train_df['Depth']
test_df['Salin_div_depth'] = test_df['Salinity_today']   / test_df['Depth']

In [6]:
features = set(features)
for df in [train_df, test_df]:
    for i in range(2, 7):
        new_feature = f'Temperature_today^{i}'
        df[new_feature] = df['Temperature_today'] ** i
        features.add(new_feature)
        
features = list(features)

In [7]:
train_df.head(5)

Unnamed: 0,pointid,Salinity_today,Temperature_today,Substrate,Depth,Exposure,Presence,Temperature_today_exp,Depth_log,Exposure_log,Salin_div_depth,Temp_div_depth,Temperature_today^2,Temperature_today^3,Temperature_today^4,Temperature_today^5,Temperature_today^6
0,1557521,30.467175,6.472158,1.0,-124.81,972065.25,0,646.878444,4.826793,13.787178,-0.244108,-0.051856,41.888834,271.111171,1754.674444,11356.530939,73501.267114
1,893106,5.571699,3.367225,0.0,-6.122131,19108.832,0,28.997952,1.81191,9.857906,-0.910091,-0.550009,11.338206,38.178291,128.554905,432.873316,1457.581937
2,1326854,6.657795,5.305255,1.0,-162.64,772179.3,0,201.392431,5.091539,13.556972,-0.040936,-0.03262,28.145735,149.320312,792.182391,4202.729906,22296.555531
3,196477,2.744422,1.934046,0.0,-35.44,407472.4,0,6.917444,3.567841,12.917728,-0.077439,-0.054572,3.740535,7.234369,13.991606,27.060415,52.336097
4,168448,2.797321,2.039138,1.0,-51.25,408049.12,0,7.683983,3.936716,12.919143,-0.054582,-0.039788,4.158084,8.478907,17.289661,35.256004,71.891858


# Scaler

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df = pd.concat([train_df[numerical_features], test_df[numerical_features]], ignore_index=True)
scaler.fit(df[numerical_features])

# x_test = test_df.copy()[features]
# x_test[numerical_features] = scaler.transform(x_test[numerical_features])

StandardScaler(copy=True, with_mean=True, with_std=True)

# LightGBM

In [16]:
import lightgbm
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, mean_squared_error, mean_absolute_error, f1_score

In [35]:
parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': -1
}

models = []

for turn_i in range(5):
    ros = RandomOverSampler()
    _, _ = ros.fit_resample(train_df[features], train_df.Presence)
    train_idx = ros.sample_indices_
        
    resampled_df = shuffle(train_df.loc[train_idx])
    x_train = resampled_df[features][10000:]
    x_val   = resampled_df[features][:10000]
    y_train = resampled_df[target][10000:]
    y_val   = resampled_df[target][:10000]
    
    train_data = lightgbm.Dataset(x_train, feature_name=features, label=y_train, categorical_feature=categoricals)
    val_data = lightgbm.Dataset(x_val, feature_name=features, label=y_val, categorical_feature=categoricals)
    
    model = lightgbm.train(parameters,
                   train_data,
                   valid_sets=val_data,
                   num_boost_round=5000,
                   early_stopping_rounds=100)

    
    val_pred = model.predict(x_val)
    score = roc_auc_score(y_val, val_pred)
    models.append([score, turn_i])
    model.save_model(f"model_{turn_i}.lgbm")
    print('Turn: {} score: {}'.format(turn_i, score))



[1]	valid_0's auc: 0.997799
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.999728
[3]	valid_0's auc: 0.9997
[4]	valid_0's auc: 0.99972
[5]	valid_0's auc: 0.99972
[6]	valid_0's auc: 0.99972
[7]	valid_0's auc: 0.999736
[8]	valid_0's auc: 0.999874
[9]	valid_0's auc: 0.999874
[10]	valid_0's auc: 0.999895
[11]	valid_0's auc: 0.999895
[12]	valid_0's auc: 0.999899
[13]	valid_0's auc: 0.999899
[14]	valid_0's auc: 0.99995
[15]	valid_0's auc: 0.99995
[16]	valid_0's auc: 0.999954
[17]	valid_0's auc: 0.999954
[18]	valid_0's auc: 0.999954
[19]	valid_0's auc: 1
[20]	valid_0's auc: 1
[21]	valid_0's auc: 1
[22]	valid_0's auc: 1
[23]	valid_0's auc: 1
[24]	valid_0's auc: 1
[25]	valid_0's auc: 1
[26]	valid_0's auc: 1
[27]	valid_0's auc: 1
[28]	valid_0's auc: 1
[29]	valid_0's auc: 1
[30]	valid_0's auc: 1
[31]	valid_0's auc: 1
[32]	valid_0's auc: 1
[33]	valid_0's auc: 1
[34]	valid_0's auc: 1
[35]	valid_0's auc: 1
[36]	valid_0's auc: 1
[37]	valid_0's auc: 1
[38]	valid_0's



[1]	valid_0's auc: 0.997728
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.999784
[3]	valid_0's auc: 0.999688
[4]	valid_0's auc: 0.999844
[5]	valid_0's auc: 0.999844
[6]	valid_0's auc: 1
[7]	valid_0's auc: 1
[8]	valid_0's auc: 1
[9]	valid_0's auc: 1
[10]	valid_0's auc: 1
[11]	valid_0's auc: 1
[12]	valid_0's auc: 1
[13]	valid_0's auc: 1
[14]	valid_0's auc: 1
[15]	valid_0's auc: 1
[16]	valid_0's auc: 1
[17]	valid_0's auc: 1
[18]	valid_0's auc: 1
[19]	valid_0's auc: 1
[20]	valid_0's auc: 1
[21]	valid_0's auc: 1
[22]	valid_0's auc: 1
[23]	valid_0's auc: 1
[24]	valid_0's auc: 1
[25]	valid_0's auc: 1
[26]	valid_0's auc: 1
[27]	valid_0's auc: 1
[28]	valid_0's auc: 1
[29]	valid_0's auc: 1
[30]	valid_0's auc: 1
[31]	valid_0's auc: 1
[32]	valid_0's auc: 1
[33]	valid_0's auc: 1
[34]	valid_0's auc: 1
[35]	valid_0's auc: 1
[36]	valid_0's auc: 1
[37]	valid_0's auc: 1
[38]	valid_0's auc: 1
[39]	valid_0's auc: 1
[40]	valid_0's auc: 1
[41]	valid_0's auc: 1
[42]	valid



[1]	valid_0's auc: 0.998012
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.999894
[3]	valid_0's auc: 0.999846
[4]	valid_0's auc: 0.999846
[5]	valid_0's auc: 0.999846
[6]	valid_0's auc: 0.999846
[7]	valid_0's auc: 0.999846
[8]	valid_0's auc: 0.999846
[9]	valid_0's auc: 0.999846
[10]	valid_0's auc: 0.999846
[11]	valid_0's auc: 0.999849
[12]	valid_0's auc: 0.999874
[13]	valid_0's auc: 0.999874
[14]	valid_0's auc: 0.999882
[15]	valid_0's auc: 0.999882
[16]	valid_0's auc: 0.999882
[17]	valid_0's auc: 0.999936
[18]	valid_0's auc: 0.999936
[19]	valid_0's auc: 0.999939
[20]	valid_0's auc: 0.999931
[21]	valid_0's auc: 0.999903
[22]	valid_0's auc: 0.999903
[23]	valid_0's auc: 0.999903
[24]	valid_0's auc: 0.999903
[25]	valid_0's auc: 0.999917
[26]	valid_0's auc: 1
[27]	valid_0's auc: 1
[28]	valid_0's auc: 1
[29]	valid_0's auc: 1
[30]	valid_0's auc: 1
[31]	valid_0's auc: 1
[32]	valid_0's auc: 1
[33]	valid_0's auc: 1
[34]	valid_0's auc: 1
[35]	valid_0's auc: 1
[3



[1]	valid_0's auc: 0.998545
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.999788
[3]	valid_0's auc: 0.999692
[4]	valid_0's auc: 0.999846
[5]	valid_0's auc: 0.999846
[6]	valid_0's auc: 0.999846
[7]	valid_0's auc: 0.999846
[8]	valid_0's auc: 0.999853
[9]	valid_0's auc: 0.999853
[10]	valid_0's auc: 0.999853
[11]	valid_0's auc: 0.999853
[12]	valid_0's auc: 0.999849
[13]	valid_0's auc: 0.999849
[14]	valid_0's auc: 0.999853
[15]	valid_0's auc: 0.999853
[16]	valid_0's auc: 0.999931
[17]	valid_0's auc: 0.999931
[18]	valid_0's auc: 0.999931
[19]	valid_0's auc: 0.999937
[20]	valid_0's auc: 0.99992
[21]	valid_0's auc: 0.9999
[22]	valid_0's auc: 0.999896
[23]	valid_0's auc: 0.999917
[24]	valid_0's auc: 0.999917
[25]	valid_0's auc: 0.999921
[26]	valid_0's auc: 0.999921
[27]	valid_0's auc: 0.999921
[28]	valid_0's auc: 0.999921
[29]	valid_0's auc: 0.999921
[30]	valid_0's auc: 0.999921
[31]	valid_0's auc: 0.999921
[32]	valid_0's auc: 0.999921
[33]	valid_0's auc: 0.



[1]	valid_0's auc: 0.998524
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.999841
[3]	valid_0's auc: 0.999979
[4]	valid_0's auc: 1
[5]	valid_0's auc: 1
[6]	valid_0's auc: 1
[7]	valid_0's auc: 1
[8]	valid_0's auc: 1
[9]	valid_0's auc: 1
[10]	valid_0's auc: 1
[11]	valid_0's auc: 1
[12]	valid_0's auc: 1
[13]	valid_0's auc: 1
[14]	valid_0's auc: 1
[15]	valid_0's auc: 1
[16]	valid_0's auc: 1
[17]	valid_0's auc: 1
[18]	valid_0's auc: 1
[19]	valid_0's auc: 1
[20]	valid_0's auc: 1
[21]	valid_0's auc: 1
[22]	valid_0's auc: 1
[23]	valid_0's auc: 1
[24]	valid_0's auc: 1
[25]	valid_0's auc: 1
[26]	valid_0's auc: 1
[27]	valid_0's auc: 1
[28]	valid_0's auc: 1
[29]	valid_0's auc: 1
[30]	valid_0's auc: 1
[31]	valid_0's auc: 1
[32]	valid_0's auc: 1
[33]	valid_0's auc: 1
[34]	valid_0's auc: 1
[35]	valid_0's auc: 1
[36]	valid_0's auc: 1
[37]	valid_0's auc: 1
[38]	valid_0's auc: 1
[39]	valid_0's auc: 1
[40]	valid_0's auc: 1
[41]	valid_0's auc: 1
[42]	valid_0's auc: 1
[4

# Predict proba by best model

In [41]:
models.sort(key=lambda x: x[0])
best_model_i = models[-1][1]
best_model = lightgbm.Booster(model_file=f"model_{best_model_i}.lgbm")

y_probas = best_model.predict(test_df[features])

result = pd.read_csv("temperature_submission.csv")
result.Presence = y_probas.round(3)
result.to_csv("submission.csv", index=False)

# Ensemble :)

In [43]:
result = pd.read_csv("temperature_submission.csv")
result.Presence = 0

for i in range(len(models)):
    model = lightgbm.Booster(model_file=f"model_{i}.lgbm")
    y_probas = model.predict(test_df[features])
    result.Presence += y_probas

result.Presence = (result.Presence / len(models)).round(3)
print(f"Found killers: {sum(result.Presence)}")
result.to_csv("submission.csv", index=False)

Found killers: 68553.26599961848


In [None]:
!kaggle competitions submit -c killer-shrimp-invasion -f submission.csv -m "Message"

https://catboost.ai/docs/concepts/python-reference_pool.html

https://catboost.ai/docs/concepts/python-reference_parameters-list.html