<h1 style="border:orange; border-width:3px; border-style:solid;padding:20px;text-align:center;border-radius:50px"> 🧪 Enzyme Multi Label Classification 🧬</h1>  

# Importing Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from sklearn.metrics import roc_auc_score
warnings.filterwarnings("ignore")

In [None]:
# Read the training CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=r'/kaggle/input/playground-series-s3e18/train.csv')
df.head(5)

**Dropping the columns that are not used below: EC3, EC4, EC5, EC6 and id**

In [None]:
# Rebind instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell does not raise a KeyError.
df = df.drop(columns=['EC3', 'EC4', 'EC6', 'EC5', 'id'], errors='ignore')

## Checking for null values

In [None]:
# Count of missing values in each remaining column.
df.isna().sum()

# Correlation Matrix

In [None]:
# Pairwise Pearson correlation of every column, targets included.
corr = df.corr(method='pearson')
corr

In [None]:
# Large canvas so the annotated coefficients stay readable.
plt.figure(figsize=(30,30))
sns.heatmap(corr,annot=True)
# Call show(); a bare `plt.show` only echoes the function object as the cell output.
plt.show()

# Mutual Info

## Mutual info between the features and both the target variables

In [None]:
from functools import partial
from sklearn.feature_selection import mutual_info_classif,SelectKBest

# Pin the estimator's random_state so the ranking and the selected columns
# are the same on every run.
mi_score=partial(mutual_info_classif,random_state=0)

dic={}
for i in ['EC1','EC2']:
    # Every column except the current target is a candidate feature
    # (so the other target is a candidate too).
    features=df.drop([i],axis=1)
    select_cols=SelectKBest(mi_score,k=10)
    select_cols.fit(features,df[i])
    # Plot the scores computed during the fit, so the bar chart and the
    # selected columns come from one and the same estimate.
    mutual_info=pd.Series(select_cols.scores_,index=features.columns)
    mutual_info.sort_values(ascending=False).plot.bar(title=i,figsize=(20,8))
    plt.show()
    dic[i]=features.columns[select_cols.get_support()]

**I used the plots above to pick the 10 most relevant features for each target (EC1 and EC2); the chosen columns are listed in the next cell.**

As can be seen, EC1 is an important feature for predicting EC2. So, I decided to use the predicted value of EC1 on the test dataset as a feature for predicting EC2. 

This helped me improve my score.

In [None]:
# Hard-coded feature list per target; this replaces whatever `dic` held before.
# The EC2 list ends with the EC1 label itself.
dic = dict(
    EC1=[
        'BertzCT', 'EState_VSA1', 'ExactMolWt', 'HeavyAtomMolWt', 'MinEStateIndex',
        'NumHeteroatoms', 'PEOE_VSA14', 'SMR_VSA10', 'SMR_VSA5', 'SlogP_VSA3',
    ],
    EC2=[
        'BertzCT', 'Chi1', 'Chi2n', 'ExactMolWt', 'FpDensityMorgan3',
        'MinEStateIndex', 'PEOE_VSA14', 'SMR_VSA10', 'fr_COO', 'EC1',
    ],
)

In [None]:
# Feature frames built from the per-target column lists.
x1=df[dic['EC1']]
x2=df[dic['EC2']]
# Select the targets as 1-D Series rather than one-column DataFrames:
# scikit-learn estimators expect a 1-D y and warn about column vectors.
y1=df['EC1']
y2=df['EC2']

In [None]:
from sklearn.preprocessing import StandardScaler
# One scaler per feature set; both are reused later to transform the test file.
sc_x1=StandardScaler()
sc_x2=StandardScaler()
# Fit on the raw columns taken from df rather than on x1/x2. Re-running this cell
# would otherwise refit the scalers on already-standardised data, and they would
# then mis-scale the test file.
x1=pd.DataFrame(sc_x1.fit_transform(df[dic['EC1']]),columns=dic['EC1'])
x2=pd.DataFrame(sc_x2.fit_transform(df[dic['EC2']]),columns=dic['EC2'])
# NOTE(review): the scalers are fitted on all rows before the train/test split,
# so the hold-out rows contribute to the scaling statistics.

In [None]:
# Display the standardised EC2 feature frame.
x2

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Split all four objects in one call: 75 % train / 25 % hold-out, same seed,
# so the EC1 and EC2 sets share the same rows.
(x1_train,x1_test,
 x2_train,x2_test,
 y1_train,y1_test,
 y2_train,y2_test)=train_test_split(x1,x2,y1,y2,test_size=0.25,random_state=1)

In [None]:
# Display the EC1 training features produced by the split.
x1_train

# Model Selection

## Using Simple Models

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Baseline classifiers for EC1; the tree and the forest get a fixed seed.
LR1=LogisticRegression()
NB1=GaussianNB()
KNN1=KNeighborsClassifier(p=2,metric='minkowski',n_neighbors=5)
DTC1=DecisionTreeClassifier(random_state=0)
RFC1=RandomForestClassifier(random_state=0,n_estimators=23)

In [None]:
# Train every EC1 baseline on the same training split.
for clf in (DTC1,LR1,RFC1,KNN1):
    clf.fit(x1_train,y1_train)
# Kept as the last expression so the cell still echoes the fitted estimator.
NB1.fit(x1_train,y1_train)

In [None]:
# Keep only column 1 of predict_proba (the positive class) for each baseline.
pred_DTC1=list(DTC1.predict_proba(x1_test)[:,1])
pred_LR1=list(LR1.predict_proba(x1_test)[:,1])
pred_RFC1=list(RFC1.predict_proba(x1_test)[:,1])
pred_KNN1=list(KNN1.predict_proba(x1_test)[:,1])
pred_NB1=list(NB1.predict_proba(x1_test)[:,1])

In [None]:
# Hold-out ROC AUC of each EC1 baseline, with a divider line between models.
ec1_baselines=(
    ("Decision Tree Classification ROC score =",pred_DTC1),
    ("Logistic Regression ROC score =",pred_LR1),
    ("Random Forest Classification ROC score =",pred_RFC1),
    ("K Nearest Neighbors ROC score =",pred_KNN1),
    ("Naive Bayes ROC score =",pred_NB1),
)
for position,(label,probs) in enumerate(ec1_baselines):
    if position>0:
        print("****************************************************************")
    print(label,roc_auc_score(y1_test,probs))

## Using Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Fixed seed, as for the decision tree and random forest, so the score is repeatable.
GBC1=GradientBoostingClassifier(random_state=0)
GBC1.fit(x1_train,y1_train)

In [None]:
# Positive-class probability (column 1) on the EC1 hold-out set.
pred_GBC1=list(GBC1.predict_proba(x1_test)[:,1])

In [None]:
# ROC AUC of gradient boosting on the EC1 hold-out set.
print("Gradient Boosting ROC score =",roc_auc_score(y_true=y1_test,y_score=pred_GBC1))

## Using XGBoost

In [None]:
from xgboost import XGBClassifier
# Default-parameter XGBoost model for EC1; fit() returns the estimator itself.
XGB1=XGBClassifier().fit(x1_train,y1_train)
XGB1

In [None]:
# Positive-class probability (column 1) on the EC1 hold-out set.
pred_XGB1=list(XGB1.predict_proba(x1_test)[:,1])

In [None]:
# ROC AUC of XGBoost on the EC1 hold-out set.
print("XGB ROC score =",roc_auc_score(y_true=y1_test,y_score=pred_XGB1))

## Using AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Fixed seed, as for the decision tree and random forest, so the score is repeatable.
ABC1=AdaBoostClassifier(random_state=0)
ABC1.fit(x1_train,y1_train)

In [None]:
# Positive-class probability (column 1) on the EC1 hold-out set.
pred_ABC1=list(ABC1.predict_proba(x1_test)[:,1])

In [None]:
# ROC AUC of AdaBoost on the EC1 hold-out set.
print("AdaBoost ROC score =",roc_auc_score(y_true=y1_test,y_score=pred_ABC1))

In [None]:
from catboost import CatBoostClassifier
# verbose=0 switches off CatBoost's per-iteration training log, which would
# otherwise fill the cell output; the fitted model is unaffected.
CBC1=CatBoostClassifier(verbose=0)
CBC1.fit(x1_train,y1_train)

In [None]:
# Positive-class probability (column 1) on the EC1 hold-out set.
pred_CBC1=list(CBC1.predict_proba(x1_test)[:,1])

In [None]:
# ROC AUC of CatBoost on the EC1 hold-out set.
print("CatBoost ROC score =",roc_auc_score(y_true=y1_test,y_score=pred_CBC1))

In [None]:
from lightgbm import LGBMClassifier
# Default-parameter LightGBM model for EC1; fit() returns the estimator itself.
LGB1=LGBMClassifier().fit(x1_train,y1_train)
LGB1

In [None]:
# Positive-class probability (column 1) on the EC1 hold-out set.
pred_LGB1=list(LGB1.predict_proba(x1_test)[:,1])

In [None]:
# ROC AUC of LightGBM on the EC1 hold-out set.
print("LightGBM ROC score =",roc_auc_score(y_true=y1_test,y_score=pred_LGB1))

#### Repeating above steps for EC2

In [None]:
# Same baseline line-up as for EC1, as fresh instances for the EC2 target.
LR2=LogisticRegression()
NB2=GaussianNB()
KNN2=KNeighborsClassifier(p=2,metric='minkowski',n_neighbors=5)
DTC2=DecisionTreeClassifier(random_state=0)
RFC2=RandomForestClassifier(random_state=0,n_estimators=23)

In [None]:
# Train every EC2 baseline on the same training split.
for clf in (DTC2,LR2,RFC2,KNN2):
    clf.fit(x2_train,y2_train)
# Kept as the last expression so the cell still echoes the fitted estimator.
NB2.fit(x2_train,y2_train)

In [None]:
# Keep only column 1 of predict_proba (the positive class) for each baseline.
pred_DTC2=list(DTC2.predict_proba(x2_test)[:,1])
pred_LR2=list(LR2.predict_proba(x2_test)[:,1])
pred_RFC2=list(RFC2.predict_proba(x2_test)[:,1])
pred_KNN2=list(KNN2.predict_proba(x2_test)[:,1])
pred_NB2=list(NB2.predict_proba(x2_test)[:,1])

In [None]:
# Hold-out ROC AUC of each EC2 baseline, with a divider line between models.
ec2_baselines=(
    ("Decision Tree Classification ROC score =",pred_DTC2),
    ("Logistic Regression ROC score =",pred_LR2),
    ("Random Forest Classification ROC score =",pred_RFC2),
    ("K Nearest Neighbors ROC score =",pred_KNN2),
    ("Naive Bayes ROC score =",pred_NB2),
)
for position,(label,probs) in enumerate(ec2_baselines):
    if position>0:
        print("****************************************************************")
    print(label,roc_auc_score(y2_test,probs))

In [None]:
# Fixed seed so this model, which produces the submitted EC2 probabilities,
# is repeatable from run to run.
GBC2=GradientBoostingClassifier(random_state=0)
GBC2.fit(x2_train,y2_train)

In [None]:
# Positive-class probability (column 1) on the EC2 hold-out set.
pred_GBC2=list(GBC2.predict_proba(x2_test)[:,1])

In [None]:
# ROC AUC of gradient boosting on the EC2 hold-out set.
print("Gradient Boosting ROC score =",roc_auc_score(y_true=y2_test,y_score=pred_GBC2))

In [None]:
# Default-parameter XGBoost model for EC2; fit() returns the estimator itself.
XGB2=XGBClassifier().fit(x2_train,y2_train)
XGB2

In [None]:
# Positive-class probability (column 1) on the EC2 hold-out set.
pred_XGB2=list(XGB2.predict_proba(x2_test)[:,1])

In [None]:
# ROC AUC of XGBoost on the EC2 hold-out set.
print("XGB ROC score =",roc_auc_score(y_true=y2_test,y_score=pred_XGB2))

In [None]:
# Fixed seed, as for the decision tree and random forest, so the score is repeatable.
ABC2=AdaBoostClassifier(random_state=0)
ABC2.fit(x2_train,y2_train)

In [None]:
# Positive-class probability (column 1) on the EC2 hold-out set.
pred_ABC2=list(ABC2.predict_proba(x2_test)[:,1])

In [None]:
# ROC AUC of AdaBoost on the EC2 hold-out set.
print("AdaBoost ROC score =",roc_auc_score(y_true=y2_test,y_score=pred_ABC2))

In [None]:
# verbose=0 switches off CatBoost's per-iteration training log, which would
# otherwise fill the cell output; the fitted model is unaffected.
CBC2=CatBoostClassifier(verbose=0)
CBC2.fit(x2_train,y2_train)

In [None]:
# Positive-class probability (column 1) on the EC2 hold-out set.
pred_CBC2=list(CBC2.predict_proba(x2_test)[:,1])

In [None]:
# ROC AUC of CatBoost on the EC2 hold-out set.
print("CatBoost ROC score =",roc_auc_score(y_true=y2_test,y_score=pred_CBC2))

In [None]:
# Default-parameter LightGBM model for EC2; fit() returns the estimator itself.
LGB2=LGBMClassifier().fit(x2_train,y2_train)
LGB2

In [None]:
# Positive-class probability (column 1) on the EC2 hold-out set.
pred_LGB2=list(LGB2.predict_proba(x2_test)[:,1])

In [None]:
# ROC AUC of LightGBM on the EC2 hold-out set.
print("LightGBM ROC score =",roc_auc_score(y_true=y2_test,y_score=pred_LGB2))

# Making Prediction

In [None]:
# Read the test CSV and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer=r'/kaggle/input/playground-series-s3e18/test.csv')
df_test.head(5)

In [None]:
# Row identifiers for the submission file.
# NOTE(review): this name shadows the builtin `id`; a later cell reads it, so it is kept.
id = df_test['id'].to_numpy()

In [None]:
# Test-set columns for the EC1 model, in the EC1 feature-list order.
xtest1 = df_test.loc[:, dic['EC1']]

In [None]:
# Scale with the scaler fitted on the training features. Reading the raw columns
# from df_test (not from xtest1) keeps this cell idempotent: re-running it no
# longer scales already-scaled values a second time.
xtest1=pd.DataFrame(sc_x1.transform(df_test[dic['EC1']]),columns=dic['EC1'])

Adding the predicted value of EC1 back to the test dataset so it can be used for predicting EC2

In [None]:
# Store CBC1's predicted EC1 labels in df_test so the EC2 feature list,
# which contains 'EC1', can be selected from it.
# NOTE(review): the EC2 models were fitted with the EC1 column from the training
# data, whereas here the column holds predicted labels — confirm this is intended.
df_test['EC1']=CBC1.predict(xtest1)
# Select the EC2 columns and scale them with the EC2 scaler in one step.
xtest2=pd.DataFrame(sc_x2.transform(df_test[dic['EC2']]),columns=dic['EC2'])

In [None]:
# Submitted probabilities: CatBoost for EC1, gradient boosting for EC2
# (column 1 of predict_proba is the positive class).
final_pred1=list(CBC1.predict_proba(xtest1)[:,1])
final_pred2=list(GBC2.predict_proba(xtest2)[:,1])

In [None]:
# Columns of the submission, in output order.
data = dict(id=id, EC1=final_pred1, EC2=final_pred2)

In [None]:
# Assemble the submission table from the column dict.
final_df = pd.DataFrame(data=data)

In [None]:
# Preview the first five submission rows.
final_df.head(5)

In [None]:
# Write the submission file without the DataFrame index column.
final_df.to_csv(path_or_buf='submission.csv', index=False)