In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import pickle

pd.set_option("display.max_columns", 500)

random_seed = 2024
np.random.seed(random_seed)


In [61]:
# reading in data and dividing into x and y
train = pd.read_csv('../data/post_fs_train.csv')
test = pd.read_csv('../data/post_fs_test.csv')

y_train = train['R_SALINITY']
y_test = test['R_SALINITY']

x_train = train.drop('R_SALINITY', axis=1)
x_test = test.drop('R_SALINITY', axis=1)

In [62]:
data = pd.read_excel('../scores/mutual_info_score.xlsx')
data.head(10)

Unnamed: 0.1,Unnamed: 0,mutual_info_score
0,R_SIGMA,1.006977
1,R_O2Sat,1.006948
2,R_SVA,0.966844
3,R_O2,0.925508
4,R_O2_sqrt,0.923673
5,R_PO4,0.90174
6,R_SIO3,0.853149
7,R_PRES,0.813435
8,R_NO3,0.812659
9,R_Depth,0.808644


In [88]:
columns = data['Unnamed: 0']
#columns = columns[:10]
print(columns)

0                      R_SIGMA
1                      R_O2Sat
2                        R_SVA
3                         R_O2
4                    R_O2_sqrt
                ...           
63    R_PHAEO_cat_(1.395, inf]
64      R_Depth_cat_(0.0, 0.5]
65      R_NH4_cat_(0.0, 0.005]
66              R_PRES_missing
67             R_Depth_missing
Name: Unnamed: 0, Length: 68, dtype: object


In [64]:
#small data set
x_train_sm = x_train[columns]
x_train_sm.columns

Index(['R_SIGMA', 'R_O2Sat', 'R_SVA', 'R_O2', 'R_O2_sqrt', 'R_PO4', 'R_SIO3',
       'R_PRES', 'R_NO3', 'R_Depth'],
      dtype='object')

In [84]:
import lightgbm as lgb
from lightgbm import LGBMRegressor, LGBMClassifier
from lightgbm import cv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

train_set = lgb.Dataset(data = x_train, label = y_train, free_raw_data=False)
test_set = lgb.Dataset(data = x_test, label = y_test, free_raw_data=False)


In [87]:
model = LGBMRegressor()
default_params = model.get_params()
print(default_params)
print(x_train)
scores = cross_val_score(model, x_train_sm, y_train,scoring = "neg_mean_squared_error",cv =5,verbose=1)
#reg = model.fit(x_train, y_train)
print('MSE:', -np.mean(scores))


{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': None, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}
<lightgbm.basic.Dataset object at 0x000002223C3597D0>
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.022035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 326778, number of used features: 10
[LightGBM] [Info] Start training from score 33.820413
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

# Hyperparameter tunning using randomised grid search

In [78]:
from sklearn.model_selection import RandomizedSearchCV

params = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'n_estimators':[10, 20, 40, 70, 100],
    'max_depth': [3, 4, 6, 8, 10],
    'num_leaves': [5, 20, 50, 100],
    'learning_rate': [0.01, 0.05, 0.20, 0.03, 0.45, 0.85, 0.6, 0.75, 1],
    'min_child_samples': list(range(20, 500, 10)),
    'reg_alpha': [0.001, 0.0007, 0.1, 0.03, 0.06, 0.8, 0.65, 0.3, 1],
    'reg_lambda': [0.01, 0.1, 1, 0.6, 0.006, 0.3, 0.2],
}

reg = RandomizedSearchCV(model, params, scoring='neg_mean_squared_error', n_iter=15,random_state=42)
reg.fit(x_train_sm, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 326778, number of used features: 10
[LightGBM] [Info] Start training from score 33.820413
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 326778, number of used features: 10
[LightGBM] [Info] Start training from score 33.820380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010986 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 326778, number of used features: 10
[LightGBM] [Info] Star

In [80]:
print('MSE:',-reg.best_score_)

0.006755651364708852


In [81]:
reg.predict(x_train)

ValueError: Number of features of the model must match the input. Model n_features_ is 10 and input n_features is 35

# Test on full dataset

In [None]:
filename = "../models/lasso.sav"
pickle.dump(reg, open(filename, "wb"))

In [None]:
list(np.linspace(0, 1))

[0.0,
 0.02040816326530612,
 0.04081632653061224,
 0.061224489795918366,
 0.08163265306122448,
 0.1020408163265306,
 0.12244897959183673,
 0.14285714285714285,
 0.16326530612244897,
 0.18367346938775508,
 0.2040816326530612,
 0.22448979591836732,
 0.24489795918367346,
 0.26530612244897955,
 0.2857142857142857,
 0.3061224489795918,
 0.32653061224489793,
 0.3469387755102041,
 0.36734693877551017,
 0.3877551020408163,
 0.4081632653061224,
 0.42857142857142855,
 0.44897959183673464,
 0.4693877551020408,
 0.4897959183673469,
 0.5102040816326531,
 0.5306122448979591,
 0.5510204081632653,
 0.5714285714285714,
 0.5918367346938775,
 0.6122448979591836,
 0.6326530612244897,
 0.6530612244897959,
 0.673469387755102,
 0.6938775510204082,
 0.7142857142857142,
 0.7346938775510203,
 0.7551020408163265,
 0.7755102040816326,
 0.7959183673469387,
 0.8163265306122448,
 0.836734693877551,
 0.8571428571428571,
 0.8775510204081632,
 0.8979591836734693,
 0.9183673469387754,
 0.9387755102040816,
 0.95918367346