In [1]:
#!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier , IsolationForest , BaggingClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score , RandomizedSearchCV , StratifiedKFold, cross_val_score, train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import log_loss, make_scorer, accuracy_score

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
train = pd.read_csv("https://raw.githubusercontent.com/Yousifshaheen/INEGI-gcim-vegetation-mapping-/refs/heads/main/Train.csv")
test = pd.read_csv("https://raw.githubusercontent.com/Yousifshaheen/INEGI-gcim-vegetation-mapping-/refs/heads/main/Test.csv")
ss = pd.read_csv("https://raw.githubusercontent.com/Yousifshaheen/INEGI-gcim-vegetation-mapping-/refs/heads/main/SampleSubmission.csv")

train.drop(columns = ['id'], inplace=True)
id = test['id']
test.drop(columns = ['id'], inplace=True)

train['Target'].mod(1)

# replace any missing value with the previos values
train = train.fillna(method='ffill')
test = test.fillna(method='ffill')

### Feature Engineering

In [8]:
def feature_engineering(df):
# PCAs
  df['PCAs_mean'] = (df['PCA1'] + df['PCA2'] + df['PCA3']) / 3
  df['REFLEC_mean'] = (df['REFLEC1'] + df['REFLEC2'] + df['REFLEC3'] + df['REFLEC4'] + df['REFLEC6']) / 5
  df['TEMP_mean'] = (df['TMPMIN1'] + df['TMPMAX1'] ) / 2
  df['NDVIs_mean'] = (df['NDVI1'] + df['NDVI2'] + df['NDVI3'] + df['NDVI4']) / 4
  df['PCA_with_REFLEC2_mean'] = df['PCA2'] + df['REFLEC2']
  df['TRI1_with_ROUGH1_mean'] = df['TRI1'] + df['ROUGH1']
  df['PEND1_with_ROUGH1_mean'] = df['PEND1'] + df['ROUGH1']
  df['PCA_with_REFLEC2_mean_and_TRI1_with_ROUGH1_mean_and_PEND1_with_ROUGH1_mean_mean'] = (df['PCA_with_REFLEC2_mean'] + df['TRI1_with_ROUGH1_mean'] + df['PEND1_with_ROUGH1_mean']) / 3

feature_engineering(train)
feature_engineering(test)

# Drop useless features
def drop_features(df,feature):
  df.drop(columns = f'{feature}', inplace=True)
drop_features(train, 'PCA1')
drop_features(train, 'PCA3')
drop_features(train, 'REFLEC1')
drop_features(train, 'REFLEC2')
drop_features(train, 'REFLEC3')
drop_features(train, 'REFLEC4')
drop_features(train, 'REFLEC6')
drop_features(train, 'TRI1')
drop_features(train, 'ROUGH1')
# for test
drop_features(test, 'PCA1')
drop_features(test, 'PCA3')
drop_features(test, 'REFLEC1')
drop_features(test, 'REFLEC2')
drop_features(test, 'REFLEC3')
drop_features(test, 'REFLEC4')
drop_features(test, 'REFLEC6')
drop_features(test, 'TRI1')
drop_features(test, 'ROUGH1')

## Modeling

In [10]:
X_train = train.copy()
X_test = test.copy()
X_train = X_train.drop('Target', axis=1)

y_train = train['Target']
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
is_inlier = isolation_forest.fit_predict(X_train)


X_train_cleaned = X_train[is_inlier == 1]
y_train_cleaned = y_train[is_inlier == 1]

le = LabelEncoder()
y_train = le.fit_transform(y_train)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_cleaned , y_train_cleaned, test_size=0.2, random_state=42 )


logreg = LogisticRegression(tol = 0.0000005, max_iter=500)
logreg_model = logreg.fit(X_train_cleaned, y_train_cleaned)

In [13]:
skf = StratifiedKFold(n_splits=5,shuffle=True ,random_state=42)

X_train_cleaned = X_train_cleaned.reset_index(drop=True)
y_train_cleaned = y_train_cleaned.reset_index(drop=True)


for train_index, valid_index in skf.split(X_train_cleaned, y_train_cleaned):

    X_train_fold, X_valid_fold = X_train_cleaned.iloc[train_index], X_train_cleaned.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train_cleaned.iloc[train_index], y_train_cleaned.iloc[valid_index]


logreg_model = logreg.fit(X_train_fold, y_train_fold)

train_score = logreg_model.score(X_train_fold, y_train_fold)
valid_score = logreg_model.score(X_valid_fold, y_valid_fold)
train_results = []
valid_results = []
train_results.append(train_score)
valid_results.append(valid_score)

print(f'Average training score: {np.mean(train_results):.4f}')
print(f'Average validation score: {np.mean(valid_results):.4f}')

Average training score: 0.1250
Average validation score: 0.1073


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
train_results = []
valid_results = []

X_train_cleaned = X_train_cleaned.reset_index(drop=True)
y_train_cleaned = y_train_cleaned.reset_index(drop=True)

for train_index, valid_index in skf.split(X_train_cleaned, y_train_cleaned):
    X_train_fold, X_valid_fold = X_train_cleaned.iloc[train_index], X_train_cleaned.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train_cleaned.iloc[train_index], y_train_cleaned.iloc[valid_index]

    num_classes = len(np.unique(y_train_fold))
    logreg_model = LogisticRegression(tol = 0.0000005)
    logreg_model = logreg.fit(X_train_fold, y_train_fold)
    train_score = logreg_model.score(X_train_fold, y_train_fold)
    valid_score = logreg_model.score(X_valid_fold, y_valid_fold)
    train_results.append(train_score)
    valid_results.append(valid_score)
    pred_v = logreg_model.predict_proba(X_valid_fold)

pred = logreg_model.predict_proba(X_train_cleaned)
log_loss_value = log_loss(y_train_cleaned, pred)
print(f"Log Loss: {log_loss_value}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [17]:
submission = pd.DataFrame({'id': id})
columns = [f'Target_{i}' for i in range(125)]
submission[columns] = 0.001
y_pred_proba = logreg_model.predict_proba(X_test)
for i, class_label in enumerate(logreg_model.classes_):
    submission['Target_' + str(class_label)] = y_pred_proba[:, i]

  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001
  submission[columns] = 0.001


In [19]:
logistic_sub = submission
logisticReg = y_pred_proba

X_train = train.copy()
X_test = test.copy()
X_train = X_train.drop('Target', axis=1)
y_train = train['Target']
isolation_forest = IsolationForest(contamination=0.1, random_state=42)
is_inlier = isolation_forest.fit_predict(X_train)

X_train_cleaned = X_train[is_inlier == 1]
y_train_cleaned = y_train[is_inlier == 1]

le = LabelEncoder()
y_train = le.fit_transform(y_train)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_cleaned , y_train_cleaned, test_size=0.2, random_state=42 )

In [20]:

catBoost = CatBoostClassifier(iterations=500,
                           learning_rate=0.1,
                           depth=6,
                           class_weights=[1] * num_classes,  # Use num_classes
                           random_seed=42)

catBoost_model = catBoost.fit(X_train_cleaned, y_train_cleaned)

0:	learn: 3.7408519	total: 538ms	remaining: 4m 28s
1:	learn: 3.6893664	total: 961ms	remaining: 3m 59s
2:	learn: 3.6464575	total: 1.38s	remaining: 3m 49s
3:	learn: 3.5944004	total: 1.8s	remaining: 3m 42s
4:	learn: 3.5528012	total: 2.26s	remaining: 3m 43s
5:	learn: 3.5146424	total: 2.74s	remaining: 3m 45s
6:	learn: 3.4776141	total: 3.12s	remaining: 3m 39s
7:	learn: 3.4427571	total: 3.37s	remaining: 3m 27s
8:	learn: 3.4080657	total: 3.64s	remaining: 3m 18s
9:	learn: 3.3769834	total: 3.89s	remaining: 3m 10s
10:	learn: 3.3467252	total: 4.15s	remaining: 3m 4s
11:	learn: 3.3159538	total: 4.39s	remaining: 2m 58s
12:	learn: 3.2880456	total: 4.7s	remaining: 2m 56s
13:	learn: 3.2611559	total: 4.96s	remaining: 2m 52s
14:	learn: 3.2374750	total: 5.21s	remaining: 2m 48s
15:	learn: 3.2142641	total: 5.49s	remaining: 2m 45s
16:	learn: 3.1906214	total: 5.75s	remaining: 2m 43s
17:	learn: 3.1689789	total: 6.01s	remaining: 2m 40s
18:	learn: 3.1506230	total: 6.25s	remaining: 2m 38s
19:	learn: 3.1294412	tota

In [21]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

X_train_cleaned = X_train_cleaned.reset_index(drop=True)
y_train_cleaned = y_train_cleaned.reset_index(drop=True)

for train_index, valid_index in skf.split(X_train_cleaned, y_train_cleaned):

    X_train_fold, X_valid_fold = X_train_cleaned.iloc[train_index], X_train_cleaned.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train_cleaned.iloc[train_index], y_train_cleaned.iloc[valid_index]



In [None]:
catBoost_model = catBoost.fit(X_train_fold, y_train_fold)

train_score = catBoost_model.score(X_train_fold, y_train_fold)
valid_score = catBoost_model.score(X_valid_fold, y_valid_fold)
train_results = []
valid_results = []
train_results.append(train_score)
valid_results.append(valid_score)

print(f'Average training score: {np.mean(train_results):.4f}')
print(f'Average validation score: {np.mean(valid_results):.4f}')

0:	learn: 3.7482767	total: 265ms	remaining: 2m 12s
1:	learn: 3.7004191	total: 509ms	remaining: 2m 6s
2:	learn: 3.6568182	total: 756ms	remaining: 2m 5s
3:	learn: 3.6077235	total: 1.02s	remaining: 2m 6s
4:	learn: 3.5636605	total: 1.39s	remaining: 2m 17s
5:	learn: 3.5233059	total: 1.79s	remaining: 2m 27s
6:	learn: 3.4872915	total: 2.24s	remaining: 2m 37s
7:	learn: 3.4516214	total: 2.65s	remaining: 2m 42s
8:	learn: 3.4155686	total: 3.04s	remaining: 2m 45s
9:	learn: 3.3841220	total: 3.4s	remaining: 2m 46s
10:	learn: 3.3527059	total: 3.83s	remaining: 2m 50s
11:	learn: 3.3245858	total: 4.28s	remaining: 2m 53s
12:	learn: 3.2952577	total: 4.58s	remaining: 2m 51s
13:	learn: 3.2670315	total: 4.83s	remaining: 2m 47s
14:	learn: 3.2405174	total: 5.07s	remaining: 2m 44s
15:	learn: 3.2168213	total: 5.33s	remaining: 2m 41s
16:	learn: 3.1920542	total: 5.58s	remaining: 2m 38s
17:	learn: 3.1701051	total: 5.82s	remaining: 2m 35s
18:	learn: 3.1469074	total: 6.06s	remaining: 2m 33s
19:	learn: 3.1240742	total

In [None]:
train_results = []
valid_results = []

X_train_cleaned = X_train_cleaned.reset_index(drop=True)
y_train_cleaned = y_train_cleaned.reset_index(drop=True)

for train_index, valid_index in skf.split(X_train_cleaned, y_train_cleaned):
    X_train_fold, X_valid_fold = X_train_cleaned.iloc[train_index], X_train_cleaned.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train_cleaned.iloc[train_index], y_train_cleaned.iloc[valid_index]

    num_classes = len(np.unique(y_train_fold))

    catBoost_model = CatBoostClassifier(iterations=500,
                           learning_rate=0.1,
                           depth=6,
                           class_weights=[1] * num_classes,
                           random_seed=42)

    catBoost_model = catBoost_model.fit(X_train_fold, y_train_fold)

    train_score = catBoost_model.score(X_train_fold, y_train_fold)
    valid_score = catBoost_model.score(X_valid_fold, y_valid_fold)

    train_results.append(train_score)
    valid_results.append(valid_score)

    pred_v = catBoost_model.predict_proba(X_valid_fold)

In [None]:
all_labels = np.unique(y_train_cleaned)

log_loss_value = log_loss(y_valid_fold, pred_v, labels=all_labels)
print(f"Log Loss: {log_loss_value}")
# Log Loss: 3.4167199472665493
# Log Loss: 3.4607471766892473 = intercept_scaling=5,max_iter=1000
# Log Loss: 3.4607471766892473 = intercept_scaling=1,max_iter=1000
# log loss : LogisticRegression(tol = 0.003) : Log Loss: 3.361522194282241
# log loss : LogisticRegression(tol = 0.00001) : Log Loss: 3.4020034701587254
# log loss : LogisticRegression(tol = 0.0000001) : Log Loss: 3.3888398096979957
# log loss : LogisticRegression(tol = 0.0000004) : Log Loss: 3.37370679509091
# log loss : LogisticRegression(tol = 0.0000005) : Log Loss: 3.3721897021216805
# log loss : LogisticRegression(tol = 0.0000005,max_iter=500) : Log Loss: 3.3416681438764555

In [None]:
pred = catBoost_model.predict_proba(X_train_cleaned)
catBoost_value = log_loss(y_train_cleaned, pred)
print(f"Log Loss: {log_loss_value}")

cat_submission = pd.DataFrame({'id': id})
columns = [f'Target_{i}' for i in range(125)]
cat_submission[columns] = 0.001


y_pred_proba_cat = catBoost_model.predict_proba(X_test)

for i, class_label in enumerate(catBoost_model.classes_):
    cat_submission['Target_' + str(class_label)] = y_pred_proba_cat[:, i]

catBoost_sub = cat_submission
y_pred_proba = y_pred_proba_cat
final = (logistic_sub.drop(columns = ['id']) + catBoost_sub.drop(columns = ['id'])) / 2
final.insert(0, 'id', logistic_sub['id'])
final.to_csv('../submissions/submission_part3.csv', index=False)