<a href="https://www.kaggle.com/code/aneeshgrover/thapar-summer-school-competition-2?scriptVersionId=185625393" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Thapar Summer School 2024 Kaggle Competition

## Overview

1. Import Libraries
2. Data Understanding and Exploration
3. Data Preprocessing
4. Model Building
5. Model Evaluation
6. Generate Predictions for Submission


## Importing the Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the exact dataset locations are visible in the notebook output.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


## Data Understanding and Exploration

In [None]:
# Load the competition's training and test tables into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/thapar-summer-school-2024-competition-2/train.csv/train.csv",
        "/kaggle/input/thapar-summer-school-2024-competition-2/test.csv/test.csv",
    )
)

In [None]:
# Preview the first rows of the training data. `head` must be *called*;
# a bare `train.head` only displays the bound-method repr.
train.head()

In [None]:
# Report the dimensions of both tables.
for label, frame in (("Train Shape: ", train), ("Test Shape: ", test)):
    print(label, frame.shape)

# Report the per-column count of missing values for both tables.
for header, frame in (
    ("\nMissing values in train: \n", train),
    ("\nMissing values in test: \n", test),
):
    print(header)
    print(frame.isnull().sum())

# Descriptive statistics of the training table; left as the last expression
# so the notebook renders it as a table.
print("Train Data Summary: ")
train.describe()


In [None]:
# Bar chart of how many training rows fall into each 'NObeyesdad' class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='NObeyesdad', data=train, ax=ax)
ax.set_title('Distribution of NObeyesdad')
# Tilt the class names so the tick labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_xlabel('NObeyesdad')
ax.set_ylabel('Count')
plt.show()

## Data Preprocessing

In [None]:
# Columns that are label-encoded below.
categoricalColumns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

# Remember where the training rows end before `train` is rebound below.
n_train_rows = len(train)

# Stack train and test so each encoder is fitted on the values of both files;
# a value that occurs only in the test file still receives a code.
combined = pd.concat([train, test], axis = 0, ignore_index = True)

# Fitted encoder per column, kept so the codes can be mapped back later.
labelEncoders = {}

for column in categoricalColumns:
    le = LabelEncoder()
    combined[column] = le.fit_transform(combined[column])
    labelEncoders[column] = le

# Split back into the two original tables. Explicit copies are taken so that
# later in-place assignments (e.g. `test.loc[:, cols] = ...`) operate on
# independent frames instead of slices of `combined`, which would otherwise
# trigger pandas' SettingWithCopyWarning.
train = combined.iloc[:n_train_rows].copy()
test = combined.iloc[n_train_rows:].copy()


In [None]:
# Separate the target column from the feature matrix.
# NOTE(review): only 'NObeyesdad' is removed, so every other column of `train`
# (including an identifier column such as 'id', if present) stays in X as a
# model feature — confirm this is intended.
Y = train['NObeyesdad']
X = train.drop('NObeyesdad', axis=1)

In [None]:
# Preview the first target values. `head` must be *called*; a bare `Y.head`
# only displays the bound-method repr.
Y.head()

In [None]:
# Convert the target labels to integer codes. The fitted encoder is kept so
# predictions can be mapped back to the original labels with inverse_transform.
label_encoder = LabelEncoder().fit(Y)
Y_encoded = label_encoder.transform(Y)


In [None]:
# Columns that are standardised below.
numericalColumns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Fit the scaler on the training features only, then apply the same fitted
# transform to the test features.
scaler = StandardScaler()
scaled_train_values = scaler.fit_transform(X.loc[:,numericalColumns])
scaled_test_values = scaler.transform(test.loc[:,numericalColumns])

# Write the scaled values back into the existing frames.
X.loc[:,numericalColumns] = scaled_train_values
test.loc[:,numericalColumns] = scaled_test_values

## Model Building

### Training 

In [None]:
# Hold out 20% of the training rows for validation.
# The split is done on the integer-encoded labels (the representation the
# classifiers are fitted on) rather than the raw strings, and is stratified so
# every class appears in both parts in roughly the same proportion.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y_encoded, test_size=0.2, random_state=42, stratify=Y_encoded
)

In [None]:
# Gradient-boosted tree classifier that outputs per-class probabilities.
xgb_model = XGBClassifier(
    objective='multi:softprob',
    # Count classes from the encoder fitted on the full target column, not from
    # a random subset of it, so the count always matches the labels used in fit.
    num_class=len(label_encoder.classes_),
    n_estimators=1000,
    max_depth=10,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Fit on the complete training set, then predict class probabilities for the
# test rows. The target column is dropped from `test` before predicting.
xgb_model.fit(X, Y_encoded)
PredictionsXGB = xgb_model.predict_proba(test.drop(columns=['NObeyesdad']))

In [None]:
# Random forest fitted on the complete training set; `fit` returns the
# estimator itself, so construction and fitting are chained.
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=100,
    random_state=1,
).fit(X, Y_encoded)

# Class probabilities for the test rows (target column removed first).
PredictionsRF = rf_model.predict_proba(test.drop(columns=['NObeyesdad']))

In [None]:
# Average the two models' class probabilities with equal weight.
ensemblePredictions = 0.5 * (PredictionsXGB + PredictionsRF)

# Pick the most probable class per row and map the integer code back to the
# original label.
most_likely_class = ensemblePredictions.argmax(axis=1)
predicted_labels = label_encoder.inverse_transform(most_likely_class)

In [None]:


# Re-read the raw test file to obtain the untouched 'id' column.
smth = pd.read_csv("/kaggle/input/thapar-summer-school-2024-competition-2/test.csv/test.csv")

# Two-column submission table: the row id and its predicted label.
Submission = smth[['id']].assign(NObeyesdad=predicted_labels)

# Write without the DataFrame index so the file holds only the two columns.
Submission.to_csv('Submission.csv', index = False)