In [32]:
# Importing the required libraries
import numpy as np 
import pandas as pd 
from sklearn import preprocessing,model_selection,metrics
from matplotlib import pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

### Loading the dataset

In [2]:
# Read the train and test CSV files of the tabular-playground-series-may-2022
# dataset into DataFrames.
train_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/train.csv")
test_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/test.csv")

### Understanding the training data 

In [3]:
# Preview the first few rows of the training data.
train_df.head()

In [4]:
# Print a concise summary (columns, dtypes, non-null counts) of the training data.
train_df.info()

In [5]:
# Descriptive statistics of the training data.
train_df.describe()

In [6]:
# Bar chart of how many rows fall into each target class.
plt.figure(figsize=(8, 8))
sns.countplot(x=train_df["target"])

### The dataset is balanced. Checking if there are any missing values.

In [7]:
# Count missing values per column.
train_df.isnull().sum()

### There are no missing values in the dataset.

#### Plotting a histogram for each numerical column

In [27]:
# Split the columns by dtype: non-object columns (excluding the 'id' and
# 'target' columns) are treated as numerical, object columns as categorical.
numerical_columns = [
    name
    for name, dtype in train_df.dtypes.items()
    if dtype != 'object' and name not in ('id', 'target')
]
cat_column = [name for name, dtype in train_df.dtypes.items() if dtype == 'object']

In [28]:
# Number of numerical columns and number of object-dtype columns.
len(numerical_columns),len(cat_column)

In [29]:
# Draw one histogram per numerical column.
train_df[numerical_columns].hist(figsize=(25,25))

## Here are some observations from the above plot:<br>
*     ### It can be noticed that the numerical features are not uniformly scaled.
     * #### <b>For instance, "f_28" ranges between -1000 to 1000 whereas many other features lie either between -4 to 4 or -10 to 10.</b>
     * #### <b>Also, features "f_07" to "f_18" are left skewed while other features appear to be normally distributed.</b>
*    ### The next steps involve scaling some features and tending to the skewed features.

### Before proceeding further, it is also important to check the outliers and choose the appropriate technique for scaling. Some methods are sensitive to outliers while others remain unaffected.

In [31]:
# Box plot of every numerical column side by side; melt() reshapes the frame
# to long form with one ('variable', 'value') row per cell.
plt.figure(figsize=(20, 20))
sns.boxplot(
    data=train_df[numerical_columns].melt(),
    x='variable',
    y='value',
)

### From the boxplot it is evident that "f_28" contains a large number of outliers. There are two options:<br>
#### 1. To use a scaling technique that is insensitive to outliers, such as a robust scaler or a quantile transformer, and scale the data directly. Or,  
#### 2. To remove outliers and then standardize the data. 

In [None]:
# Frequency of each distinct value (combination) in the object-dtype columns.
train_df[cat_column].value_counts()

In [None]:
# Add a placeholder fold label (-1 = not yet assigned) and shuffle the rows.
# A fixed random_state makes the shuffle, and therefore the row order the fold
# assignment is based on, reproducible across kernel restarts.
train_df["kfold"] = -1
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# Features (every column except the target) and the target itself.
X, y = train_df.drop('target', axis=1), train_df.target

In [None]:
# Shapes of the feature frame and of the target series.
X.shape,y.shape

### Performing Stratified K-Fold validation with 10 splits

In [None]:
# 10-fold stratified splitter. shuffle=True without a seed yields different
# folds on every run, so pin random_state to keep the folds reproducible.
kfold = model_selection.StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Label each row with the number of the fold in which it is held out for
# validation, then save the labelled frame to CSV.
for fold_number, (_, held_out_rows) in enumerate(kfold.split(X=X, y=y)):
    train_df.loc[held_out_rows, 'kfold'] = fold_number
train_df.to_csv("train_folds.csv", index=False)

In [None]:
# Columns used as model features: everything except the 'id', 'kfold',
# 'f_27' and 'target' columns.
useful_cols = [
    column
    for column in train_df.columns
    if column not in ('id', 'kfold', 'f_27', 'target')
]
useful_cols

In [None]:
# Load the fold-labelled training data from the train_folds.csv file.
df_folds = pd.read_csv("./train_folds.csv")

#### Building the baseline model using each fold created.

In [None]:
# Shared accumulator: every run_fold() call appends one array of test-set
# predictions to this list.
test_predictions = []


def run_fold(fold, model):
    """Train one classifier with `fold` held out and predict on the test set.

    Rows of `df_folds` whose `kfold` differs from `fold` are used for
    training; rows whose `kfold` equals `fold` are used for validation. The
    validation accuracy is printed, and the predictions for `test_df` are
    appended to the module-level `test_predictions` list.

    Parameters
    ----------
    fold : int
        Value of the `kfold` column to hold out for validation.
    model : str
        Either 'XGBoost' or 'LightGBM'.

    Returns
    -------
    The predictions of the fitted classifier on `test_df[useful_cols]`
    (the same object that is appended to `test_predictions`).

    Raises
    ------
    ValueError
        If `model` is not one of the supported names. Previously this
        surfaced later as an unbound-variable error on `clf`.
    """
    supported_models = ('XGBoost', 'LightGBM')
    if model not in supported_models:
        raise ValueError(
            f'Unknown model {model!r}; expected one of {supported_models}'
        )

    print(f'Processing fold:{fold}')
    is_validation = df_folds.kfold == fold
    df_train = df_folds[~is_validation].reset_index(drop=True)
    df_validation = df_folds[is_validation].reset_index(drop=True)

    # Selecting useful_cols already leaves out the target, so no drop is needed.
    X_train, Y_train = df_train[useful_cols], df_train.target
    X_valid, Y_valid = df_validation[useful_cols], df_validation.target
    # Column selection returns a new frame; test_df itself is not modified.
    X_test = test_df[useful_cols]

    # Build the model with library-default hyperparameters.
    clf = XGBClassifier() if model == 'XGBoost' else LGBMClassifier()

    clf.fit(X_train, Y_train)
    Y_pred_valid = clf.predict(X_valid)
    print(f'Accuracy on validation data: {metrics.accuracy_score(Y_valid,Y_pred_valid)}')
    # NOTE(review): predict() is kept from the original code; confirm whether
    # the downstream averaging should use predict_proba() instead.
    Y_pred_test = clf.predict(X_test)
    test_predictions.append(Y_pred_test)
    return Y_pred_test

In [None]:
# Fit and evaluate an XGBoost model once per fold (folds 0 through 9).
print('Building an XGB classifier model')
for fold_id in range(10):
    run_fold(fold_id, 'XGBoost')

In [None]:
# Snapshot the XGBoost fold predictions and empty the shared accumulator.
# run_fold() appends to the module-level `test_predictions` list, so a plain
# alias would also receive every prediction appended by later runs.
xgb_predictions = list(test_predictions)
test_predictions.clear()

In [None]:
# Fit and evaluate a LightGBM model once per fold (folds 0 through 9).
print('Building a LGBM classifier model')
for fold_id in range(10):
    run_fold(fold_id, 'LightGBM')

In [None]:
# Take a copy rather than an alias, so later appends to the shared
# `test_predictions` list do not change lgbm_predictions.
# NOTE(review): this captures everything currently in `test_predictions`; if
# the list was not emptied after an earlier model's run, it also contains
# that model's predictions.
lgbm_predictions = list(test_predictions)

#### The XGBoost model has performed better than the LGBM model. 

In [None]:
# Put the per-fold prediction arrays side by side (one column per fold) and
# average across folds for each test row.
stacked_by_fold = np.stack(xgb_predictions, axis=1)
final_predictions = stacked_by_fold.mean(axis=1).tolist()

In [None]:
# Load the sample submission and fill in the averaged predictions.
# The absolute /kaggle/input path matches the train/test loading cell and
# does not depend on the current working directory.
submissions_df = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/sample_submission.csv")
# Bracket assignment always targets the column; attribute assignment would
# set a plain attribute instead if no 'target' column existed.
submissions_df["target"] = final_predictions

In [None]:
# Write the submission file without the index column.
submissions_df.to_csv("submission1.csv",index=False)