### Import required libraries

In [None]:
import os
import gc
import cv2
import math
import time
import shutil
import random
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, RobustScaler

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Apply seaborn's default plot theme to subsequent matplotlib figures.
sns.set()
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

import warnings
# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation notices emitted by the ML
# libraries imported above, so API changes can go unnoticed.
warnings.filterwarnings("ignore")

In [None]:
# Candidate stacking classifiers as (display name, estimator) pairs.
# The two "Standard ..." entries wrap their estimator in a StandardScaler
# pipeline; every estimator is seeded with 42.
_named_estimators = [
    ('Standard LogReg',
     Pipeline([('scaling', StandardScaler()),
               ('log-reg', LogisticRegression(random_state=42, max_iter=1000))])),
    ('LogReg',
     LogisticRegression(random_state=42, max_iter=1000)),
    ('Standard SVC',
     Pipeline([('scaling', StandardScaler()),
               ('svr', SVC(probability=True, random_state=42))])),
    ('Random Forest',
     RandomForestClassifier(random_state=42)),
    ('CB',
     CatBoostClassifier(random_seed=42, verbose=0)),
    ('XGB',
     XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ('LGB',
     LGBMClassifier(random_state=42)),
]

# Registry keyed 'model_1' .. 'model_7'. Each entry exposes the display name
# under 'name' and the unfitted estimator (or pipeline) under 'pl'.
model_dict = {
    f'model_{position}': {'name': label, 'pl': estimator}
    for position, (label, estimator) in enumerate(_named_estimators, start=1)
}

## Majority voting

In [None]:
# Maps an integer class index (0-5) to its landscape category name.
INT_TO_LABEL = dict(enumerate(
    ("buildings", "forest", "glacier", "mountain", "sea", "street")
))

In [None]:
# Load the per-model submission files (one predicted label per image).
swin = pd.read_csv('../input/landscape-classification-ensembledata/submission_swin_094423.csv')
conv = pd.read_csv('../input/landscape-classification-ensembledata/submission_convnext_093930.csv')
vl =   pd.read_csv('../input/landscape-classification-ensembledata/submission_volod2_093966.csv')
reg =  pd.read_csv('../input/landscape-classification-ensembledata/submission_regnet_093857.csv')
eff =  pd.read_csv('../input/landscape-classification-ensembledata/submission_effnetb3_093107.csv')

# Row-by-row agreement of every other model with the swin predictions.
print((swin['label'] == conv['label']).value_counts())
print((swin['label'] == vl['label']).value_counts())
print((swin['label'] == reg['label']).value_counts())
print((swin['label'] == eff['label']).value_counts())

# Combine all predictions into one frame: 'image' followed by one label
# column per model, in the order swin, conv, vl, reg, eff.
# Each 'label' column is renamed to its model name *before* merging: merging
# several frames that all carry a 'label' column makes pandas generate the
# suffixed names 'label_x'/'label_y' more than once, which yields duplicate
# column names (and a MergeError on recent pandas versions).
df = swin.rename(columns={'label': 'swin'})
for _model_name, _model_frame in (('conv', conv), ('vl', vl), ('reg', reg), ('eff', eff)):
    df = df.merge(_model_frame.rename(columns={'label': _model_name}), on='image')
df.head()

In [None]:
# Majority vote across the merged model predictions.
# The submission is built from df's own 'image' column so that every voted
# label stays attached to the image it was computed for; previously the votes
# were written by row position into a frame re-read from the convnext CSV,
# which silently assumed that file has the same row order as df.
votes = df.iloc[:, 1:]  # every column after 'image' is one model's label
sub = df[['image']].copy()
# mode(axis=1) returns the most frequent value(s) per row in sorted order, so
# column 0 is the majority label (the lowest-sorted one when there is a tie).
sub['label'] = votes.mode(axis=1)[0]

# How often the ensemble agrees with the first model column (swin).
print((df.iloc[:, 1] == sub['label']).value_counts())

sub.to_csv('submission_all_majority.csv', index=False)

sub.head()

## Stacking

In [None]:
# Level-1 data used as input for the stacking models.
l1_test = pd.read_csv('../input/landscape-classification-ensembledata/l1_test.csv')
l1_train = pd.read_csv('../input/landscape-classification-ensembledata/l1_train.csv')

# Every test column except the first one is used as a feature
# (the first column is presumably the image identifier - TODO confirm).
features = list(l1_test.columns[1:])

# Target labels for the training rows.
y = l1_train['label']

l1_train.loc[:, features].head()

In [None]:
# Train every registered model with stratified K-fold cross-validation on the
# level-1 features. For each model this produces:
#   * out-of-fold class probabilities for the training rows -> "<name>_oof.npy"
#   * fold-averaged class probabilities for the test rows   -> "<name>_pred.npy"
n_splits = 10
# Number of probability columns, derived from the labels instead of a literal 6.
n_classes = y.nunique()
# Models whose fit() call receives a validation set for early stopping.
early_stopping_models = {'CB', 'XGB', 'LGB'}
# With a fixed random_state the splitter yields the same folds on every
# split() call, so a single instance is shared by all models.
kf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)

for value in model_dict.values():

    scores = []

    train_preds = np.zeros(shape=(len(l1_train.index), n_classes))
    test_preds = np.zeros(shape=(len(l1_test.index), n_classes))

    for train_idx, test_idx in kf.split(l1_train, y):

        x_train, x_val = l1_train[features].iloc[train_idx].copy(), l1_train[features].iloc[test_idx].copy()
        y_train, y_val = y.iloc[train_idx].copy(), y.iloc[test_idx].copy()

        # The same estimator object is refitted on every fold.
        model = value['pl']

        if value['name'] in early_stopping_models:
            # NOTE(review): passing early_stopping_rounds / verbose to fit()
            # depends on the installed xgboost / lightgbm / catboost versions
            # still accepting them - confirm against the environment.
            model.fit(x_train, y_train,
                      eval_set=[(x_val, y_val)],
                      early_stopping_rounds=200,
                      verbose=0)
        else:
            model.fit(x_train, y_train)

        # Out-of-fold probabilities fill only the rows held out in this fold.
        train_oof_preds = model.predict_proba(x_val)
        train_preds[test_idx] = train_oof_preds

        # Fold accuracy is computed from the model's hard predictions.
        train_oof_preds_2 = model.predict(x_val)
        score = accuracy_score(y_val, train_oof_preds_2)
        scores.append(score)

        print(f"{value['name']}: ACC = {score}")

        # Each fold contributes 1/n_splits of the final test probabilities.
        test_oof_preds = model.predict_proba(l1_test[features])
        test_preds += test_oof_preds / n_splits

    print(f"\n--> Overall metrics for {value['name']}")
    print(f": ACC = {np.array(scores).mean()} +/- {np.array(scores).std()}\n")

    np.save(f"{value['name']}_oof.npy", train_preds)
    np.save(f"{value['name']}_pred.npy", test_preds)

In [None]:
# Build a submission from the fold-averaged test probabilities of the plain
# 'LogReg' stacking model.
cols = ["buildings", "forest", "glacier", "mountain", "sea", "street"]
lr_preds = np.load('./LogReg_pred.npy')

# One probability column per class; the predicted label is the name of the
# column holding the highest probability in each row.
class_probabilities = pd.DataFrame(lr_preds, columns=cols)
sub = class_probabilities.idxmax(axis=1).to_frame('label')
sub.insert(0, 'image', l1_test['image'])

# Row-by-row agreement with the convnext submission.
print((conv['label'] == sub['label']).value_counts())

sub.to_csv("submission_lr_all_stacking.csv", index=False)

sub.head()