<div class="alert alert-success">  
    <h1 align="center" style="color:darkcyan;">Explore Multi-Label Classification with an Enzyme Substrate Dataset</h1>  
    <h1 align="center" style="color:darkred;">Via GaussianNB</h1>  
    <h3 align="center" style="color:gray;">Playground Series - Season 3, Episode 18</h3>    
</div>

<div class="alert alert-success">  
</div>

In [None]:
import warnings # suppress warnings
warnings.filterwarnings('ignore')
#########################################
import os
import gc
import glob
import random
import numpy as np 
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from scipy import stats
from pathlib import Path
#########################################
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.express as px
%matplotlib inline
!ls ../input/*
#########################################
pd.set_option('display.max_columns', 500)

In [None]:
# Load the competition files; train and test use the 'id' column as their index.
df_train  = pd.read_csv('../input/playground-series-s3e18/train.csv', index_col='id')
df_test   = pd.read_csv('../input/playground-series-s3e18/test.csv', index_col='id')
# The sample submission is read without index_col, so it keeps a default positional index.
df_sample = pd.read_csv('../input/playground-series-s3e18/sample_submission.csv')

# df_train.describe().transpose()
# display(df_train , df_test, df_sample)
# Cell output: (rows, columns) of each frame.
df_train.shape , df_test.shape, df_sample.shape

## <span style="color:darkred;">Original Data & Concatenate</span>

In [None]:
# Read the original dataset and report its raw shape.
original = pd.read_csv('/kaggle/input/ec-mixed-class/mixed_desc.csv')
print(original.shape)

# The packed label column carries one digit per class at every second character
# (offsets 0, 2, ..., 10); unpack each digit into its own integer column.
packed_labels = original['EC1_EC2_EC3_EC4_EC5_EC6']
for position, label in enumerate(['EC1', 'EC2', 'EC3', 'EC4', 'EC5', 'EC6']):
    original[label] = [int(code[2 * position]) for code in packed_labels]
original.shape

In [None]:
# Keep only the columns that df_train has, in df_train's column order.
df_org = original[df_train.columns]
df_org.shape

In [None]:
# Stack the original rows under the competition rows; reset_index(drop=True) discards
# the previous index and renumbers the rows from 0.
# NOTE(review): df_train is reassigned here, so re-running this cell appends df_org again.
df_train = pd.concat([df_train, df_org], axis = 0).reset_index(drop=True)
df_train.shape

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">Check Null Values</span>

In [None]:
# Per-column count of missing values in each frame.
MV1 = df_train.isnull().sum()
MV2 = df_test.isnull().sum()

# Report only the columns that contain missing values, framed by separator lines.
separator = ':' * 25
reports = [
    f'Missing Value df_train:\n{MV1[MV1 > 0]}',
    f'Missing Value df_test:\n{MV2[MV2 > 0]}',
]
print(separator)
for report in reports:
    print(report)
    print(separator)

## <span style="color:darkred;">Check Duplicates</span>

In [None]:
# Count training rows flagged as a repeat of an earlier row (True) versus the rest (False).
df_train.duplicated().value_counts()

In [None]:
# Count test rows flagged as a repeat of an earlier row (True) versus the rest (False).
df_test.duplicated().value_counts()

In [None]:
# Drop repeated training rows, keeping the first occurrence. The index is not reset,
# so it has gaps wherever a row was removed.
df_train = df_train.drop_duplicates()
df_train.shape

## <span style="color:darkred;">Train | Test | Target</span>

In [None]:
# Columns that exist in train but not in test, in train's column order.
test_column_names = set(df_test.columns)
col = [name for name in df_train.columns if name not in test_column_names]
col

In [None]:
# Columns expected in a submission file.
df_sample.columns.tolist()

In [None]:
# Work on a copy so df_test itself is left untouched.
test = df_test.copy()
# Training features: df_train without the columns listed in `col`.
train = df_train.drop(columns=col)
# Only EC1 and EC2 are modelled in this notebook.
target = df_train[['EC1','EC2']]

train.shape , test.shape, target.shape

## <span style="color:darkred;">Target graph</span>

In [None]:
# Tally each (EC1, EC2) combination once and reuse it for the chart and the table.
target_counts = target.value_counts()
target_axes = target_counts.plot(kind='barh', figsize=(12,2), title='Target Count', color=['darkcyan','red'])
target_axes.set_facecolor('lightcyan')

pd.DataFrame(data= {'Number': target_counts, 'Percent': target.value_counts(normalize=True)})

## <span style="color:darkred;">Target graph (Only Original)</span>

In [None]:
# Same target tally as for the full training data, restricted to the original rows.
target_org = df_org[['EC1','EC2']]
target_org_counts = target_org.value_counts()

target_org_axes = target_org_counts.plot(kind='barh', figsize=(12,2), title='Target Count', color=['darkcyan','red'])
target_org_axes.set_facecolor('lightyellow')

pd.DataFrame(data= {'Number': target_org_counts, 'Percent': target_org.value_counts(normalize=True)})

#### So the synthetic data and the original data have very similar target distributions.

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">All Features</span>

In [None]:
# Names of all model input columns (everything left in `train`).
features = train.columns.tolist()
len(features)

In [None]:
# Column dtypes and non-null counts for the training features.
train.info()

In [None]:
# Column dtypes and non-null counts for the test features.
test.info()

In [None]:
# Number of distinct values per feature, train and test side by side.
pd.DataFrame(data= {'Train': train.nunique(), 'Test': test.nunique()})

## <span style="color:darkred;">Categorical Features</span>

In [None]:
# Treat every integer-typed column as categorical.
# `is_integer_dtype` accepts any integer width, whereas comparing the dtype with the
# builtin `int` only matches the platform-default integer and can miss int64 columns
# on platforms where that default is 32-bit.
cat_features = [f for f in features if pd.api.types.is_integer_dtype(train[f])]
cat_features

In [None]:
# Distinct NumHeteroatoms values in each split, and their union.
s_train1 = set(train['NumHeteroatoms'])
s_test1 = set(test['NumHeteroatoms'])
up1 = list(s_train1 | s_test1)

# True when the union is no larger than the train set, i.e. test adds no new value.
print(len(up1) == len(s_train1))
print(len(up1))

In [None]:
# Distinct fr_COO values in each split, and their union.
s_train2 = set(train['fr_COO'])
s_test2 = set(test['fr_COO'])
up2 = list(s_train2 | s_test2)

# True when the union is no larger than the train set, i.e. test adds no new value.
print(len(up2) == len(s_train2))
print(len(up2))

In [None]:
# Distinct fr_COO2 values in each split, and their union.
s_train3 = set(train['fr_COO2'])
s_test3 = set(test['fr_COO2'])
up3 = list(s_train3 | s_test3)

# True when the union is no larger than the train set, i.e. test adds no new value.
print(len(up3) == len(s_train3))
print(len(up3))

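#### So all three "Categorical Features" have the same problem: train and test do not contain exactly the same set of values. We will solve this after the "OneHotEncoder" step by keeping only the encoded columns that exist in both train and test.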

## <span style="color:darkred;">Numerical Features</span>

In [None]:
# Every feature that was not classified as categorical is treated as numerical.
num_features = [f for f in features if f not in cat_features]
len(num_features)

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">Histograms of the Features</span>

In [None]:
sns.set()
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old
# names no longer exist in recent releases; use whichever name this install provides.
grid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'
plt.style.use(grid_style)

# Size the grid from the number of features instead of hard-coding 11 rows, so no
# feature is silently dropped by zip() when there are more than 33 of them.
n_cols = 3
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division
fig_hist, axs = plt.subplots(
    n_rows, n_cols,
    figsize=(15, n_rows * 45 / 11),  # same height per row as the original 11-row layout
    facecolor='lightblue',
    squeeze=False,  # always a 2-D array, even for a single row
)
flat_axes = axs.ravel()

for f, ax in zip(features, flat_axes):
    ax.set_facecolor('lightcyan')
    ax.hist(train[f], bins=30, color='red')
    ax.set_title(f'Feature: {f}', fontsize=10)

# Hide the unused panels in the last row.
for ax in flat_axes[len(features):]:
    ax.set_visible(False)

plt.suptitle('Histograms of the Features', y=0.90, fontsize=32, color='lightcyan')
plt.show()

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">Correlation Matrix</span>

In [None]:
# Pairwise correlations between all features and the two modelled targets.
cor_matrix = df_train[[*features, 'EC1', 'EC2']].corr()
fig = plt.figure(figsize=(12,12))

# Six-step diverging palette (hues 240 and 10) with a light centre, anchored at zero.
cmap = sns.diverging_palette(h_neg=240, h_pos=10, s=75, l=50, sep=1, n=6, center='light', as_cmap=False)
sns.heatmap(data=cor_matrix, cmap=cmap, center=0, annot=False, linewidths=2)
plt.show()

In [None]:
# The same correlations as a table, shaded with the 'Reds' colormap.
corr = df_train[features + ['EC1','EC2']].corr(numeric_only=True)
corr.style.background_gradient(cmap='Reds')

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">Convert Categorical Features (OneHotEncoder)</span>

In [None]:
# One-hot encode the categorical features. Train and test are encoded independently,
# so each frame only gets dummy columns for the values it actually contains.
train_code = pd.get_dummies(train, columns=cat_features)
test_code = pd.get_dummies(test, columns=cat_features)

train_code.shape, test_code.shape

## <span style="color:darkred;">Overlap for features</span>

In [None]:
# Encoded column names of each frame.
f_train = train_code.columns.tolist()
f_test = test_code.columns.tolist()

# Keep the columns present in both frames, in test's column order.
train_column_names = set(f_train)
f_overlap = [name for name in f_test if name in train_column_names]

train_code, test_code = train_code[f_overlap], test_code[f_overlap]

train_code.shape, test_code.shape

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">Let's scale the numerical features</span>

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Scaling is currently disabled. If it is re-enabled, fit the scaler on the training
# data only and reuse the fitted scaler for the test data (transform, not fit_transform).
# scaler = StandardScaler()

# train_code[num_features] = scaler.fit_transform(train_code[num_features])
# test_code[num_features] = scaler.transform(test_code[num_features])

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">Evaluation Metric (AUC)</span>

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, auc

def roc_auc(true_list, pred_list, a, b):
    """Print the area under the ROC curve and plot the curve.

    Parameters
    ----------
    true_list : array-like
        Ground-truth labels, passed to ``roc_curve`` as ``y_true``.
    pred_list : array-like
        Predicted scores, passed to ``roc_curve`` as ``y_score``.
    a, b : float
        Figure width and height, used as ``figsize=(a, b)``.

    Returns
    -------
    None
        The AUC is printed and the figure is shown; nothing is returned.
    """
    fpr, tpr, _ = roc_curve(true_list, pred_list)
    # Named `auc_value` so the local does not shadow this function's own name.
    auc_value = auc(fpr, tpr)

    print(f'\n>>>>> ROC_AUC: {auc_value:.6f} <<<<<\n')

    sns.set()
    # Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old
    # names no longer exist in recent releases; use whichever name this install provides.
    grid_style = 'seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available else 'seaborn-whitegrid'
    plt.style.use(grid_style)
    plt.figure(figsize=(a, b), facecolor='lightblue')
    plt.gca().set_facecolor('lightcyan')
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([-0.01, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('The area under the ROC curve\n')
    plt.legend(loc="lower right")
    plt.show()

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">Gaussian Naive Bayes (GaussianNB)</span>

## <span style="color:darkred;">||||||||||||||||||||||||||||||||||||||||||||||||||||||||||</span>

In [None]:
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import QuantileTransformer

## <span style="color:darkred;">Prediction (EC1)</span>

In [None]:
# Quantile-map every feature to a normal distribution, then fit Gaussian Naive Bayes on EC1.
# random_state pins the transformer's random subsampling so the fit is reproducible.
# (The previous standalone `transformed` frame was never used; the pipeline already
# applies the same transform internally.)
pipeline = make_pipeline(QuantileTransformer(output_distribution='normal', random_state=42), GaussianNB())

pipeline.fit(train_code, target['EC1'])

In [None]:
# Mean ROC AUC over 10 cross-validation folds for the EC1 pipeline.
cross_val_score(pipeline, train_code, target['EC1'], scoring='roc_auc' , cv=10).mean()

In [None]:
# Column 1 of predict_proba: the probability of the second class in pipeline.classes_
# (presumably EC1 == 1 — confirm the labels are 0/1).
preds_bayes1 = pipeline.predict_proba(test_code)[:,1]
preds_bayes1

## <span style="color:darkred;">Prediction Histogram (EC1)</span>

In [None]:
sns.set()
# Distribution of the predicted EC1 probabilities, drawn on the current axes.
hist_axes = plt.gca()
hist_axes.hist(preds_bayes1, bins=20)
hist_axes.set_facecolor('lightblue')
# Cell output: smallest and largest predicted probability.
min(preds_bayes1), max(preds_bayes1)

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">Prediction (EC2)</span>

In [None]:
# Quantile-map every feature to a normal distribution, then fit Gaussian Naive Bayes on EC2.
# random_state pins the transformer's random subsampling so the fit is reproducible.
# (The previous standalone `transformed` frame was never used; the pipeline already
# applies the same transform internally.)
pipeline = make_pipeline(QuantileTransformer(output_distribution='normal', random_state=42), GaussianNB())

pipeline.fit(train_code, target['EC2'])

In [None]:
# Mean ROC AUC over 7 cross-validation folds for the EC2 pipeline.
cross_val_score(pipeline, train_code, target['EC2'], scoring='roc_auc' , cv=7).mean()

In [None]:
# Column 1 of predict_proba: the probability of the second class in pipeline.classes_
# (presumably EC2 == 1 — confirm the labels are 0/1).
preds_bayes2 = pipeline.predict_proba(test_code)[:,1]
preds_bayes2

## <span style="color:darkred;">Prediction Histogram (EC2)</span>

In [None]:
sns.set()
# Distribution of the predicted EC2 probabilities, drawn on the current axes.
hist_axes = plt.gca()
hist_axes.hist(preds_bayes2, bins=20)
hist_axes.set_facecolor('lightblue')
# Cell output: smallest and largest predicted probability.
min(preds_bayes2), max(preds_bayes2)

<div class="alert alert-success">  
</div>

## <span style="color:darkred;">Submission - BAYES </span>

In [None]:
# Start from the sample submission and overwrite both target columns with the
# GaussianNB probabilities. The arrays are assigned by position.
# NOTE(review): assumes df_sample rows are in the same order as df_test — confirm.
sub_bayes = df_sample.copy()
sub_bayes['EC1'] = preds_bayes1
sub_bayes['EC2'] = preds_bayes2
sub_bayes

In [None]:
# Write the GaussianNB-only submission, then list the working directory.
sub_bayes.to_csv('submission_bayes.csv',index=False)
!ls

<div class="alert alert-success">  
</div>


## <span style="color:darkred;">Ensembling</span>

Thanks to: **@tetsutani**  
https://www.kaggle.com/code/tetsutani/ps3e18-eda-ensemble-ml-pipeline-binarypredictict/notebook

Thanks to: **@onurkoc83**  
https://www.kaggle.com/code/onurkoc83/overfit-champion

Thanks to: **@meisa0**  
https://www.kaggle.com/code/meisa0/s3e18-target-encoding-lb-0-65947

In [None]:
# Submissions produced by other public notebooks, used as blend inputs.
# sub_import2 is loaded here but is not used in the blend cell that follows.
sub_import1 = pd.read_csv('../input/ps3e18s65540/submission.csv')
sub_import2 = pd.read_csv('../input/ps3e18s65810/submission.csv')
sub_import3 = pd.read_csv('../input/ps3e18s65947/submission.csv')

In [None]:
sub = df_sample.copy()

# EC1 comes from the third imported submission alone.
sub['EC1'] = sub_import3['EC1'] * 1.00

# EC2 is a weighted blend of two imported submissions and the GaussianNB predictions.
ec2_terms = [
    sub_import3['EC2'] * 0.50,
    sub_bayes['EC2'] * 0.10,
    sub_import1['EC2'] * 0.40,
]
sub['EC2'] = ec2_terms[0] + ec2_terms[1] + ec2_terms[2]
sub

In [None]:
# Write the blended submission, then list the working directory.
sub.to_csv('submission.csv',index=False)
!ls

<div class="alert alert-success">  
    <h1 align="center" style="color:darkred;">Target - Four Classes</h1>    
</div>

### If the predictions for all target columns were almost equally good, we could even recast the challenge as a single four-class problem. In this challenge that approach does not give good results, but I implement it below anyway.

<div class="alert alert-success">  
</div>

In [None]:
# Encode each (EC1, EC2) pair as one class id:
#   0 = (0, 0), 1 = (0, 1), 2 = (1, 0), 3 = (1, 1)   i.e. class = 2 * EC1 + EC2.
# The encoding is vectorised instead of looping over rows with .iloc.
if not target[['EC1', 'EC2']].isin([0, 1]).all().all():
    # Anything outside {0, 1} has no defined class id, so fail loudly.
    raise ValueError('EC1 and EC2 must contain only 0 and 1')

target1 = (2 * target['EC1'] + target['EC2']).astype(int).to_frame(name='EC1_EC2')
# Kept as a plain list for any later cell that still reads `ec1_ec2`.
ec1_ec2 = target1['EC1_EC2'].tolist()
target1.nunique(), len(target1)

In [None]:
# Tally each combined class once and reuse it for the chart and the table.
class_counts = target1.value_counts()
class_axes = class_counts.plot(kind='barh', figsize=(12,2), title='Target Count', color=['darkcyan','red'])
class_axes.set_facecolor('lightgreen')

pd.DataFrame(data= {'Number': class_counts, 'Percent': target1.value_counts(normalize=True)})

In [None]:
# Quantile-map every feature to a normal distribution, then fit Gaussian Naive Bayes
# on the combined four-class target.
# random_state pins the transformer's random subsampling so the fit is reproducible.
# (The previous standalone `transformed` frame was never used; the pipeline already
# applies the same transform internally.)
pipeline = make_pipeline(QuantileTransformer(output_distribution='normal', random_state=42), GaussianNB())

# Pass the labels as a 1-D Series rather than a one-column DataFrame, which is the
# shape scikit-learn classifiers expect for y.
pipeline.fit(train_code, target1['EC1_EC2'])

In [None]:
# Mean weighted one-vs-rest ROC AUC over 7 cross-validation folds for the four-class pipeline.
# NOTE(review): target1 is passed as a one-column DataFrame rather than a 1-D Series.
cross_val_score(pipeline, train_code, target1, scoring='roc_auc_ovr_weighted' , cv=7).mean()

In [None]:
# Hard class predictions (combined class ids), not probabilities.
preds_bayes_4c = pipeline.predict(test_code)
preds_bayes_4c

## <span style="color:darkred;">Prediction Histogram (4 Classes)</span>

In [None]:
sns.set()
# Distribution of the predicted combined class ids, drawn on the current axes.
hist_axes = plt.gca()
hist_axes.hist(preds_bayes_4c, bins=20)
hist_axes.set_facecolor('lightblue')
# Cell output: smallest and largest predicted class id.
min(preds_bayes_4c), max(preds_bayes_4c)

## <span style="color:darkred;">Submission - BAYES (4 Classes)</span>

In [None]:
sub_bayes_4c = df_sample.copy()

# Decode the combined class id back into the two binary labels (class = 2 * EC1 + EC2):
#   0 -> (0, 0), 1 -> (0, 1), 2 -> (1, 0), 3 -> (1, 1)
# Whole columns are assigned at once. The previous chained form
# `sub_bayes_4c['EC1'].iloc[n] = ...` writes to a temporary object under pandas
# copy-on-write, leaving the frame unchanged, and is a slow per-row loop.
class_ids = np.asarray(preds_bayes_4c)
# Cast to the sample's column dtype so the written CSV keeps the same number format.
sub_bayes_4c['EC1'] = (class_ids // 2).astype(sub_bayes_4c['EC1'].dtype)
sub_bayes_4c['EC2'] = (class_ids % 2).astype(sub_bayes_4c['EC2'].dtype)

sub_bayes_4c

In [None]:
# Write the four-class submission, then list the working directory.
sub_bayes_4c.to_csv('submission_bayes_4c.csv',index=False)
!ls

<div class="alert alert-success">  
</div>

<div class="alert alert-success">  
</div>