### Import the Libraries, Data and Have a Look at the Data!

In [24]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import tensorflow as tf
from tensorflow.keras import regularizers
from pandas_profiling import ProfileReport
import warnings

In [25]:
train_df = pd.read_csv(r"data\train.csv")
test_df = pd.read_csv(r"data\test.csv")

### Data Cleaning

In [26]:
def missing_values(df):
    """ This function takes a dataframe as input and returns a dataframe with missing values filled with median/mode and a list of dropped columns. Also, it plots the missing values in a bar chart."""

    ### Plotting missing values
    missing_vals = df.isnull().sum()
    missing_vals = missing_vals[missing_vals > 0]
    missing_vals.sort_values(inplace=True)
    missing_vals.plot.bar()

    data_filtered = df.dropna(thresh=len(df) * (1-.05), axis=1).copy()
    dropped_columns = set(df.columns.tolist()) - set(data_filtered.columns.tolist())

    # Fill missing numeric values with median
    numeric_features = data_filtered.select_dtypes(include=[np.number])
    for col in numeric_features.columns:
        if data_filtered[col].isnull().any():
            data_filtered.loc[:, col] = data_filtered[col].fillna(data_filtered[col].median())

    # Fill missing categorical values with mode
    categorical_features = data_filtered.select_dtypes(exclude=[np.number])
    for col in categorical_features.columns:
        if data_filtered[col].isnull().any():
            data_filtered.loc[:, col] = data_filtered[col].fillna(data_filtered[col].mode()[0])

    # Check for missing values and report how many columns have been dropped
    print(f"The following columns have been dropped: {dropped_columns} and the remaining missing data is filled using median/mode.")

    return data_filtered, dropped_columns

# Call the function
train_df, dropped_columns = missing_values(train_df)
print(f"The Shape of the train_df is: {train_df.shape}")

The following columns have been dropped: {'BQ', 'EL'} and the remaining missing data is filled using median/mode.
The Shape of the train_df is: (617, 56)


### Data Exploration

In [27]:
### splitting the data into numerical and categorical features and target
train_target = train_df['Class']
train_cat_features = train_df['EJ']
train_num_features = train_df.drop(['Id', 'Class', 'EJ'], axis=1)

### Standardizing the numerical features
num_cols = train_num_features.columns.tolist()
scaler = StandardScaler()
train_num_features = scaler.fit_transform(train_num_features)
train_num_features = pd.DataFrame(train_num_features, columns=num_cols)

### One-hot encoding the categorical features
train_cat_features = pd.get_dummies(train_cat_features)

### Concatenating the numerical and categorical features
train_features = pd.concat([train_num_features, train_cat_features], axis=1)

### Concatenating the features and target
train_data = pd.concat([train_features, train_target], axis=1)

#### Dealing with Outliers

In [28]:
### stopping the warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import IsolationForest

def run_isolation_forest(df, contamination=0.025, max_features=2):
    # Fit the model
    isof = IsolationForest(random_state=42, contamination=contamination, max_features=max_features).fit(df)
    results = isof.predict(df)
    df['isolation scores'] = results
    df['is_outlier'] = np.where(df['isolation scores'] == -1, 1, 0)
    print(f"{len(df[df['is_outlier'] == 1])} Outliers Detected")
    df = df.drop(['isolation scores'], axis=1)
    
    return df

### Running the isolation forest
### Specify the contamination and max_features
train_data = run_isolation_forest(train_data, contamination=0.05, max_features=5)

### Plotting the outliers
sns.pairplot(train_data[["is_outlier", "AB", 'AH', 'AM', 'CL', 'CR', 'CS']], hue="is_outlier")
plt.show()

### Dropping the outliers
train_data = train_data[train_data['is_outlier'] == 0]
train_data = train_data.drop(['is_outlier'], axis=1)
print(f"Shape of the dataframe after dropping outliers: {train_data.shape}")

31 Outliers Detected
Shape of the dataframe after dropping outliers: (586, 56)


In [30]:
# Get all the columns except 'Class'
columns = train_data.columns.tolist()

# Calculate the number of rows needed for subplots
n = len(columns)
ncols = 2
nrows = n // ncols + (n % ncols > 0)

# Create a figure and axes for subplots
fig, axs = plt.subplots(nrows, ncols, figsize=(10, 4*nrows))  # Adjust as per your requirement
axs = axs.flatten()  # to make it easier to iterate over

# For each column and its corresponding subplot, create a boxplot
for ax, col in zip(axs, columns):
    sns.boxplot(data=train_data, x='Class', y=col, ax=ax)
    ax.set_title(f'Class vs {col}')

# If there are less columns than subplots, remove the extra ones
for i in range(n, nrows*ncols):
    fig.delaxes(axs[i])

# Show the plot
plt.tight_layout()
plt.show()

After data visualization, here are some takeaways:
1. There are no strong relations of any variable with the output. So, it would be difficult to correctly classify output, especially when there is so small dataset.
2. There are many outliers in the dataset that do no follow the general patterns of the data. So, we are going to cap the data to 0.05 and 0.95 percentile.

### Data Pre-processing

In [7]:
train_features = train_data.drop(['Class'], axis=1)
train_target = train_data['Class']

In [8]:
### Winsorizing the data
import pandas as pd

def winsorize_df(df):
    for col in df.columns:
        if df[col].dtype.kind in 'biufc':
            lower = df[col].quantile(0.02)
            upper = df[col].quantile(0.98)
            df[col] = df[col].clip(lower, upper)
    return df

train_features_capped = train_features.copy()
train_features_capped = winsorize_df(train_features_capped)


## Data Modeling

In [9]:
# Split the data into train and test sets
y = train_data['Class']
X = train_features

# Split the data into train and validation sets

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=22)

# Convert the data to TensorFlow tensors
X_train = tf.convert_to_tensor(X_train, dtype=tf.float32)
X_val = tf.convert_to_tensor(X_val, dtype=tf.float32)

### Random Forest

After applying Gradient Boosing Classifier, Random Forest Classifier and Ada Boost Classifier, it was found that Random Forest outperformed other trees and thus, only RFs predictions are shown here.

In [10]:
# Applying LGBM Classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Create a Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=22, max_depth=5)

# Train the model using the training sets
rf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = rf.predict(X_val)

# Plotting the probability distribution of the predictions
plt.figure(figsize=(6, 3))
plt.subplot(1, 2, 1)
sns.distplot(y_pred, bins=20, kde=False)
plt.title('Probability Distribution of Predictions', size=6)
plt.show()

### Plotting a confusion matrix
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()

### Getting the precision and recall scores
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       101
           1       0.62      0.29      0.40        17

    accuracy                           0.87       118
   macro avg       0.76      0.63      0.66       118
weighted avg       0.85      0.87      0.85       118



In [11]:
### Printing the feature importance of 10 most important features
feature_importances = rf.feature_importances_
feature_importances = pd.DataFrame({'feature': list(train_features.columns), 'importance': feature_importances}).sort_values('importance', ascending=False)
feature_importances.head()

Unnamed: 0,feature,importance
31,DU,0.109032
44,FL,0.090272
52,GL,0.076823
20,CR,0.069184
37,EH,0.047829


### MLP Neural Network

In [31]:
### Creating a MLP neural network model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow import keras


# Create a Sequential model
model = Sequential()

# Add regularization (l2) to the layers
model.add(Dense(100, activation='relu', input_dim=X_train.shape[1], kernel_regularizer=regularizers.l2(0.015)))
model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.05)))
model.add(Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.05)))
model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.05)))
model.add(Dense(50, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# Use sigmoid activation for binary classification
model.add(Dense(1, activation='sigmoid'))

# Add gradient clipping to Adam optimizer
adam = Adam(learning_rate=.0001, clipvalue=0.5)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

# Add early stopping to prevent overfitting
early_stopping = keras.callbacks.EarlyStopping(monitor='loss', patience=3)

# Train the model
history = model.fit(X_train, y_train, epochs=300, batch_size=5, validation_data=(X_val, y_val), callbacks=[early_stopping], verbose=0, class_weight={0:1, 1:1})

In [18]:
# Plot training & validation accuracy values
plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')

# Plot training & validation loss values
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')

plt.tight_layout()
plt.show()

In [19]:
# Predicting the test set results
y_pred = model.predict(X_val)

# Plotting the probability distribution of the predictions
plt.figure(figsize=(6, 3))
plt.subplot(1, 2, 1)
sns.distplot(y_pred, bins=20, kde=False)
plt.title('Probability Distribution of Predictions', size=6)

y_pred = (y_pred > 0.5)

In [20]:
### Plotting a confusion matrix
cm = confusion_matrix(y_val, y_pred)
sns.heatmap(cm, annot=True, fmt='g', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()

### Getting the precision and recall scores
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       101
           1       0.92      0.71      0.80        17

    accuracy                           0.95       118
   macro avg       0.94      0.85      0.89       118
weighted avg       0.95      0.95      0.95       118



In [21]:
### Prepare the test data
test_num_features = test_df.drop(['Id', 'EJ', 'BQ', 'EL'], axis=1)
test_cat_features = test_df['EJ']

# Scale the numerical features
num_cols = test_num_features.columns
test_num_features = scaler.transform(test_num_features)
test_num_features = pd.DataFrame(test_num_features, columns=num_cols)

# Convert the categorical features to one-hot encoded features
test_cat_features = pd.get_dummies(test_cat_features)
test_cat_features['B'] = 0

# Concatenate the numerical and categorical features
test_features = np.concatenate((test_num_features, test_cat_features), axis=1)

# Convert the data to TensorFlow tensors
test_features = tf.convert_to_tensor(test_features, dtype=tf.float32)

# Predict the test set results
y_pred = model.predict(test_features)

# Plotting the probability distribution of the predictions
plt.figure(figsize=(6, 3))
plt.subplot(1, 2, 1)
sns.distplot(y_pred, bins=20, kde=False)
plt.title('Probability Distribution of Predictions', size=6)
plt.show()

In [22]:
### Creating a submission file with column names 'Id', 'Class0', 'Class1'
submission_df = pd.DataFrame(columns=['Id', 'class_0', 'class_1'])
submission_df['Id'] = test_df['Id']
submission_df['class_0'] = 1 - y_pred
submission_df['class_1'] = y_pred

# Save the submission file
submission_df.to_csv('submission.csv', index=False)

### Results

1. It was surprising to see Neural Network outperform the decision trees in this problem.
2. A precision of 97% was achieved with the current neural network and data.