In [30]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, auc
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xlrd
import random
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt

In [3]:
data = pd.read_csv("./data/updated_compare_all.csv")

In [4]:
data.info()
data.describe()
data.shape
print(data.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2110 entries, 0 to 2109
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TYPE                 2090 non-null   object 
 1   STATION_NAME         2110 non-null   object 
 2   BODY_LEVEL           2110 non-null   object 
 3   SAMPLE_DATE          2110 non-null   object 
 4   PERIOD               2110 non-null   object 
 5   TIME                 2110 non-null   object 
 6   STATION_ID           2110 non-null   int64  
 7   CHL_a                2110 non-null   object 
 8   CHL_a NetCDF (μg/L)  2110 non-null   object 
 9   albedo_01            2110 non-null   float64
 10  albedo_02            2110 non-null   float64
 11  albedo_03            2110 non-null   float64
 12  albedo_04            2110 non-null   float64
 13  2km_albedo_01        2110 non-null   float64
 14  2km_albedo_02        2110 non-null   float64
 15  2km_albedo_03        2110 non-null   f

### output filtered csv

In [12]:
# 選擇所需的列
selected_columns = ['2km_albedo_01', '2km_albedo_02', '2km_albedo_03', '2km_albedo_04', 'CHL_a']

# 過濾數據
filtered_data = data[selected_columns]

# 重命名列，去掉 '2km_'
filtered_data.columns = ['albedo_01', 'albedo_02', 'albedo_03', 'albedo_04', 'CHL_a']

# 將過濾後的數據寫入新的CSV文件
filtered_data.to_csv("filtered_data.csv", index=False)

print("新的 CSV 文件已成功輸出，文件名為 filtered_data.csv")

新的 CSV 文件已成功輸出，文件名為 filtered_data.csv


In [13]:
data = pd.read_csv("./filtered_data.csv")
data.info()
data.describe()
data.shape
print(data.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2110 entries, 0 to 2109
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   albedo_01  2110 non-null   float64
 1   albedo_02  2110 non-null   float64
 2   albedo_03  2110 non-null   float64
 3   albedo_04  2110 non-null   float64
 4   CHL_a      2110 non-null   object 
dtypes: float64(4), object(1)
memory usage: 82.5+ KB
Index(['albedo_01', 'albedo_02', 'albedo_03', 'albedo_04', 'CHL_a'], dtype='object')


In [16]:
print(data.CHL_a.count())
print(data.albedo_01.count())
print(data.albedo_02.count())
print(data.albedo_03.count())
print(data.albedo_04.count())

2110
2110
2110
2110
2110


In [18]:
train = data
train.info()
train.CHL_a.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2110 entries, 0 to 2109
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   albedo_01  2110 non-null   float64
 1   albedo_02  2110 non-null   float64
 2   albedo_03  2110 non-null   float64
 3   albedo_04  2110 non-null   float64
 4   CHL_a      2110 non-null   object 
dtypes: float64(4), object(1)
memory usage: 82.5+ KB


CHL_a
<2.4    166
<0.1    151
0.3     146
0.2     146
0.4     137
       ... 
10.8      1
12.1      1
8         1
16.3      1
16.4      1
Name: count, Length: 127, dtype: int64

In [22]:
X = train.drop('CHL_a', axis=1)
y = train.CHL_a
y.value_counts()

CHL_a
<2.4    166
<0.1    151
0.3     146
0.2     146
0.4     137
       ... 
10.8      1
12.1      1
8         1
16.3      1
16.4      1
Name: count, Length: 127, dtype: int64

### Scalar

In [23]:
scaler = StandardScaler()
# scale the data to make it easier for the model to learn
X = scaler.fit_transform(X)

### Train

In [35]:
test_ratio = 0.2

random_seed = random.randint(1, 100)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_ratio, random_state=42)

param_grid = {
    'hidden_layer_sizes': [(100,), (150, 100, 50), (200, 100)],
    'alpha': [0.0001, 0.001, 0.01, 0.05],
    'learning_rate_init': [0.001, 0.01, 0.1],
    'activation': ['tanh', 'relu']
}

In [36]:
# Create an MLPRegressor instance
mlp = MLPRegressor(solver='sgd', max_iter=200,
                   n_iter_no_change=10, tol=0.0001, verbose=1)

# Create a GridSearchCV instance
grid_mlp = GridSearchCV(
    mlp, param_grid, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1, cv=5)

# Fit the grid search to the data
grid_mlp.fit(X_train, y_train)

# Get the best estimator
optimized_mlp = grid_mlp.best_estimator_

Fitting 5 folds for each of 72 candidates, totalling 360 fits


ValueError: 
All the 360 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 753, in fit
    return self._fit(X, y, incremental=False)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 442, in _fit
    X, y = self._validate_input(X, y, incremental, reset=first_pass)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 1615, in _validate_input
    X, y = self._validate_data(
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1162, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1187, in _check_y
    y = y.astype(np.float64)
ValueError: could not convert string to float: '<0.1'

--------------------------------------------------------------------------------
288 fits failed with the following error:
Traceback (most recent call last):
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 753, in fit
    return self._fit(X, y, incremental=False)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 442, in _fit
    X, y = self._validate_input(X, y, incremental, reset=first_pass)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 1615, in _validate_input
    X, y = self._validate_data(
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1162, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "/home/louis/anaconda3/envs/big_data/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1187, in _check_y
    y = y.astype(np.float64)
ValueError: could not convert string to float: '<2.4'


In [31]:
print("MLP best parameters:", grid_mlp.best_params_)

# Make predictions
MLP_train_pred = optimized_mlp.predict(X_train)
MLP_test_pred = optimized_mlp.predict(X_test)

# Calculate the desired metrics
train_mae = mean_absolute_error(y_train, MLP_train_pred)
test_mae = mean_absolute_error(y_test, MLP_test_pred)
train_rmse = sqrt(mean_squared_error(y_train, MLP_train_pred))
test_rmse = sqrt(mean_squared_error(y_test, MLP_test_pred))
train_r2 = r2_score(y_train, MLP_train_pred)
test_r2 = r2_score(y_test, MLP_test_pred)



MLP best parameters: {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.1}


ValueError: could not convert string to float: '<2.4'