In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

pd.set_option("display.max_columns",100)
warnings.filterwarnings("ignore")
sns.set_style(style = "darkgrid")

%matplotlib inline

In [None]:
data = pd.read_csv("/kaggle/input/playground-series-s4e6/train.csv")

In [None]:
data.sample(5)

In [None]:
def insights(data):
    """Print a quick exploratory summary of a DataFrame.

    Prints the column index, per-column and total null counts, the columns
    split into numerical (int/float dtypes) and categorical (everything
    else), and then for every column: the percentage of rows that are null,
    basic central-tendency statistics, and the dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    n_rows = len(data)
    null_counts = data.isna().sum()

    print("Columns :",data.columns,'\n',
         "Total :",len(data.columns))
    print("Null values :-",null_counts,'\n')
    print("Total null values :",null_counts.sum(),'\n')

    categorical_col = data.select_dtypes(exclude = ['int','float']).columns
    numerical_col = data.select_dtypes(include = ['int','float']).columns
    print("Categorical columns :",categorical_col,'\n')
    print("Numerical columns :",numerical_col,'\n')

    for col in data.columns:
        # Share of this column's rows that are null. Dividing by the total
        # number of nulls in the frame (as before) yields NaN on a frame
        # without nulls and is not a per-column percentage.
        pct_null = (data[col].isna().sum() / n_rows) * 100 if n_rows else 0.0
        print(f"percentage of null values in {col} :", pct_null,'\n')

        # mode() is empty for an all-null column; avoid indexing into it.
        modes = data[col].mode()
        mode_value = modes.iloc[0] if not modes.empty else None

        if col in numerical_col:
            # Series.mean/median both skip NaN, so the two statistics agree
            # on which rows they describe.
            print("mean :",round(data[col].mean(), 2),'\n',
                  "median :",data[col].median(),'\n',
                  "mode :",mode_value,'\n')
        if col in categorical_col:
            uniques = data[col].unique()
            print("mode :",mode_value,'\n')
            print(f"Unique values in {col} :", uniques, "count :",len(uniques))
        print(f"Datatype of {col} :",data[col].dtype,'\n')

In [None]:
insights(data)

In [None]:
numerical_col = data.select_dtypes(include = ['int','float']).columns
categorical_col = data.select_dtypes(exclude = ['int','float']).columns

In [None]:
numerical_col

In [None]:
categorical_col

In [None]:
df = data.copy()

In [None]:
corr_matrix = df[numerical_col].corr()

In [None]:
# Create the figure before drawing: calling plt.figure() after the heatmap
# opened a second, empty figure and left the heatmap at the default size.
fig, ax = plt.subplots(figsize = [6,6])
sns.heatmap(corr_matrix , annot = True, cmap = 'Reds', ax = ax)
ax.set_title("Correlation Matrix",loc = 'center')
plt.show()

In [None]:
corr_matrix

In [None]:
df.sample(5)

In [None]:
df = df.drop(['id'],axis = 1)

In [None]:
# Names of columns that hold fewer than three distinct values (binary flags).
bool_col = []
def bool_unique(data):
    """Record the columns of ``data`` that have fewer than three unique values.

    Each qualifying column is printed together with its unique values and
    its name is appended to the module-level ``bool_col`` list. A name that
    is already in ``bool_col`` is not appended again, so calling the
    function repeatedly does not create duplicates.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
    """
    for col in data.columns:
        values = data[col].unique()
        if len(values) < 3:
            print(col , "values :",values)
            if col not in bool_col:
                bool_col.append(col)

In [None]:
bool_unique(df)

In [None]:
bool_col

In [None]:
# Numerical column names with the low-cardinality columns in bool_col left out.
numerics = [name for name in numerical_col if name not in bool_col]

In [None]:
numerics

In [None]:
# 'id' is a row identifier, not a feature. Guard the removal so re-running
# this cell does not raise ValueError once 'id' is already gone.
if 'id' in numerics:
    numerics.remove('id')

In [None]:
df.Target.unique()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [None]:
# One box plot per numeric feature. The axes is created explicitly and the
# data is passed by keyword: the meaning of seaborn's first positional
# argument differs between versions (x vs. data).
for col in numerics:
    fig, ax = plt.subplots()
    sns.boxplot(x = df[col], ax = ax)
    ax.set_title(col,loc = 'center')
    fig.tight_layout()
    plt.show()

In [None]:
# One density plot per numeric feature. Previously the title was set before
# plt.subplot(1,2,1) created the plotting axes, so it was attached to a
# different axes, and the 1x2 grid left half of the figure empty.
for col in numerics:
    fig, ax = plt.subplots()
    sns.kdeplot(x = df[col], fill = True, ax = ax)
    ax.set_title(col)
    plt.show()

In [None]:
def outliers(df,cols):
    """Report how many values in each column fall outside the 1.5*IQR fences.

    For every column in ``cols`` the fences are ``q1 - 1.5*iqr`` and
    ``q3 + 1.5*iqr``. The count and the percentage of values outside the
    fences are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to inspect. It is not modified.
    cols : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    None
    """
    for col in cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1

        upper_limit = q3 + 1.5 * iqr
        # The lower fence is measured from the first quartile (it was q3 before).
        lower_limit = q1 - 1.5 * iqr

        # Vectorised comparison; NaN compares False on both sides, as in the
        # element-wise loop it replaces.
        is_outlier = (df[col] < lower_limit) | (df[col] > upper_limit)
        n_outliers = int(is_outlier.sum())
        n_rows = len(df[col])

        print(f"Outliers in {col} :",n_outliers)
        print(f"Per of Outliers in {col}",(n_outliers / n_rows * 100.0) if n_rows else 0.0)

In [None]:
outliers(df,numerics)

In [None]:
def remove_outliers(df,cols):
    """Cap the values of each column at its 1.5*IQR fences, in place.

    For every column in ``cols`` values below ``q1 - 1.5*iqr`` are replaced
    by that lower fence and values above ``q3 + 1.5*iqr`` by the upper
    fence. The quartiles are computed from ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    cols : iterable of str
        Names of the numeric columns to cap.

    Returns
    -------
    None
        ``df`` is mutated; nothing is returned.
    """
    for col in cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1

        upper_limit = q3 + 1.5 * iqr
        # The lower fence is measured from the first quartile (it was q3 before).
        lower_limit = q1 - 1.5 * iqr

        df[col] = df[col].clip(lower = lower_limit, upper = upper_limit)

In [None]:
remove_outliers(df,numerics)

In [None]:
outliers(df,numerics)

In [None]:
df[numerics].sample()

In [None]:
df['Target'].value_counts()

In [None]:
for col in numerics:
    print(f"Skewness of {col} :",df[col].skew())

In [None]:
for col in bool_col:
    print(df[col].value_counts())

In [None]:
df.Target.unique()

In [None]:
categorical_col

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
sd = StandardScaler()
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# keyword has since been removed; a dense array is still requested.
ohe = OneHotEncoder(sparse_output = False)

In [None]:
# Column-wise preprocessing:
#   "cate" - one-hot encodes the columns listed in categorical_col
#   "num"  - standardises the columns listed in numerics
# Columns in neither list are passed through unchanged.
# NOTE(review): categorical_col appears to include 'Target' (cate__Target_*
# columns are dropped from the feature frame later) - confirm the label is
# meant to go through the encoder.
transformer = ColumnTransformer(transformers = [
    ("cate",ohe,categorical_col),
    ("num",sd,numerics),
], remainder = 'passthrough')

In [None]:
df_trans = transformer.fit_transform(df)

In [None]:
transformer.get_feature_names_out()

In [None]:
df1 = pd.DataFrame(df_trans, columns= transformer.get_feature_names_out(df.columns))

In [None]:
len(df1.columns)

In [None]:
len(df.columns)

In [None]:
df1.head()

In [None]:
df1 = df1.drop(['cate__Target_Dropout','cate__Target_Enrolled','cate__Target_Graduate'],axis = 1)

In [None]:
X = df1.copy()

In [None]:
y = df['Target']

In [None]:
y.unique()

In [None]:
y = y.map({
    'Graduate':0,
    'Dropout':1,
    'Enrolled':2
})

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn import metrics
from xgboost import XGBClassifier
from skopt import BayesSearchCV

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X,y,random_state = 42,test_size = 0.2)

In [None]:
x_train.shape

In [None]:
y.shape

In [None]:
def fit_model(model):
    """Fit ``model`` on a train split of the global X / y and predict the held-out part.

    The split is 80/20 with ``random_state=42``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    Predictions of the fitted model for the held-out 20% of ``X``.
    """
    train_features, test_features, train_labels, _ = train_test_split(
        X, y, test_size = 0.2, random_state = 42
    )
    model.fit(train_features, train_labels)
    return model.predict(test_features)

In [None]:
rfc = RandomForestClassifier(n_jobs = -1)
lr = LogisticRegression(multi_class = 'multinomial')
xgb = XGBClassifier()
svc = SVC()

In [None]:
pred1 = fit_model(rfc)

In [None]:
pred2 = fit_model(lr)

In [None]:
pred4 = fit_model(xgb)

In [None]:
pred5 = fit_model(svc)

In [None]:
# Accuracy and micro-averaged F1 on the held-out split for each baseline.
# The SVC predictions (pred5) are intentionally not reported here.
for label, preds in (("rfc", pred1), ("lr", pred2), ("xgb", pred4)):
    acc = metrics.accuracy_score(y_test, preds)
    f1 = metrics.f1_score(y_test, preds, average = 'micro')
    print(f"accuracy score for {label} : {acc}", "f1 score :", f1)

In [None]:
rfc_cv = RandomForestClassifier()

In [None]:
# Search space for BayesSearchCV over the random forest.
# 'auto' was dropped from max_features: for a classifier it was an alias of
# 'sqrt', and recent scikit-learn releases reject it.
param_space = {
    'n_estimators': [10, 500],
    'max_depth': [1, 50],
    'min_samples_split': [2, 20],
    'min_samples_leaf': [1, 20],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

In [None]:
bayes_search = BayesSearchCV(
    estimator=rfc_cv,
    search_spaces=param_space,
    n_iter=32,  # Number of iterations for optimization
    cv=5,  # Cross-validation splits
    random_state=42,
    n_jobs=-1  # Use all available cores
)

In [None]:
bayes_search.fit(x_train,y_train)

In [None]:
best_rfc = bayes_search.best_estimator_

In [None]:
bayes_search.best_params_

In [None]:
import pickle

with open('model_rfc.pkl', 'wb') as file:
    pickle.dump(best_rfc, file)

In [None]:
print(f"accuracy score for rfc_bscv : {metrics.accuracy_score(y_test,best_rfc.predict(x_test))}","f1 score :",metrics.f1_score(y_test,best_rfc.predict(x_test),average = 'micro'))

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca = PCA(n_components = None)

pca.fit_transform(x_train)

In [None]:
pca.explained_variance_ratio_

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.show()

In [None]:
pca_13 = PCA(n_components = 13)

# Fit the projection on the training split only and reuse it for the test
# split; refitting on x_test would put the two sets in different component spaces.
x_train_13 = pca_13.fit_transform(x_train)
x_test_13 = pca_13.transform(x_test)

In [None]:
pca_25 = PCA(n_components = 25)

# Fit the projection on the training split only and reuse it for the test
# split; refitting on x_test would put the two sets in different component spaces.
x_train_25 = pca_25.fit_transform(x_train)
x_test_25 = pca_25.transform(x_test)

In [None]:
x_train_25.shape

In [None]:
# `xgb` is an XGBClassifier instance in this notebook, not the xgboost
# module, so the class is instantiated directly.
xgb13 = XGBClassifier()

xgb13.fit(x_train_13,y_train)

output = xgb13.predict(x_test_13)

metrics.accuracy_score(y_test , output)

In [None]:
rfc13 = RandomForestClassifier(n_jobs = -1)

rfc13.fit(x_train_13,y_train)

output = rfc13.predict(x_test_13)

metrics.accuracy_score(output , y_test)

In [None]:
rfc25 = RandomForestClassifier(n_jobs = -1)

rfc25.fit(x_train_25,y_train)

output = rfc25.predict(x_test_25)

metrics.accuracy_score(output , y_test)

In [None]:
# `xgb` is an XGBClassifier instance in this notebook, not the xgboost
# module, so the class is instantiated directly.
xgb25 = XGBClassifier()

xgb25.fit(x_train_25,y_train)

output = xgb25.predict(x_test_25)

metrics.accuracy_score(y_test , output)

In [None]:
cv = cross_val_score(rfc25,x_train_25,y_train, cv=5, scoring='accuracy')

In [None]:
print(f"Average accuracy: {cv.mean():.4f}")

**By far the best model is XGB with all features and no tuned hyperparameters.**

# Submission prep

In [None]:
test = pd.read_csv('/kaggle/input/playground-series-s4e6/test.csv')

In [None]:
test.head()

In [None]:
Id = test['id']
test = test.drop(['id'],axis = 1)

In [None]:
numerics

In [None]:
outliers(test,numerics)

In [None]:
remove_outliers(test,numerics)

In [None]:
outliers(test,numerics)

In [None]:
# Scale the test features with the scaler fitted on the training data.
# Fitting a fresh scaler on the test set would standardise it with different
# means/variances than the ones the model was trained on.
fitted_scaler = transformer.named_transformers_["num"]
passthrough_cols = [col for col in test.columns if col not in numerics]

test_scaled = pd.DataFrame(
    fitted_scaler.transform(test[numerics]),
    columns = [f"num__{col}" for col in numerics],
    index = test.index,
)
test_trans = pd.concat(
    [test_scaled, test[passthrough_cols].add_prefix("remainder__")],
    axis = 1,
)

# Put the columns in the training feature order; this raises KeyError if the
# test frame does not provide every feature the model was trained on.
test_trans = test_trans[X.columns]

In [None]:
test_trans.head()

In [None]:
sub_pred = best_rfc.predict(test_trans)

In [None]:
sub_pred

In [None]:
predicted1 = pd.DataFrame(index = None).reset_index(drop = True)

In [None]:
predicted1.head()

In [None]:
predicted1['id'] = Id

In [None]:
predicted1['Target'] = sub_pred

In [None]:
predicted1['Target'] = predicted1['Target'].map({
    1 : 'Dropout',
    0 : 'Graduate',
    2 : 'Enrolled'
})

In [None]:
predicted1.head()

In [None]:
# The submission frame is `predicted1`; `predicted` only exists inside fit_model.
predicted1.shape

In [None]:
test.shape

In [None]:
predicted1 = predicted1.reset_index(drop=True)

In [None]:
# index=False keeps the file to the 'id' and 'Target' columns; otherwise the
# row index is written as an extra unnamed first column.
predicted1.to_csv('submission1.csv', index = False)