In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the download stream per read.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<url-encoded download URL>" entries consumed by the
# download loop further down in this cell.
# NOTE(review): the URL carries signing parameters (X-Goog-Date=20240502T160442Z,
# X-Goog-Expires=259200), so it presumably stops working after that window —
# re-export the notebook to obtain a fresh link if the download fails.
DATA_SOURCE_MAPPING = 'playground-series-s4e1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F65711%2F7405009%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240502%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240502T160442Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D05eeea776e858a464fd73fb4884ab89cc17e0bb028d004f108791a11900a60327408eae5b25f48fdc1e14817799775b515e24629d764ee254d1160f97132f954b9a4047cc382cbf252597d4039cfc9baf4797c7d6af624a8faa90216049b69ae3f90c6f4499527a59d58308b2678bc6ad8253751ace71b6a8d156c9265c30cb6a9d60056d5811abfa30becbb9ee17f2e05b0bdff1cee0897eb1d040a3714c6db34ce87300018cf62994bb54d9932e23541ae0a245b451c7898d93e8fec9d0c26c749de19cfa6b968707d6c81f0b3bddf97643deca56e196bba4baa12d3094541c0cb0f49c1db9a32bc7157971d162c2a14d7c783ff0c115a5540046b7ba9534f'

# Directories created by this cell to hold the downloaded inputs and the outputs.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. A source that cannot be
# fetched or unpacked is reported and skipped; the remaining sources still run.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a ':' left in the URL part cannot break
    # the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    # Only the URL path is inspected for the extension; the query string is ignored.
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header can be missing (None); fall back to 0 so the progress
            # bar simply stays empty instead of int(None) raising TypeError.
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar, clamped so it never overflows.
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile.open() below reopens the file by name, so buffered bytes
            # must reach the disk before the archive is read.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored inside
            # the downloaded archive.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the module and break the next non-zip source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must be handled before OSError, which HTTPError derives from.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer,make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import statsmodels.api as sm
from statsmodels.formula.api import ols

import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's training and test tables from the Kaggle input folder.
input_dir = '/kaggle/input/playground-series-s4e1'
df = pd.read_csv(f'{input_dir}/train.csv')
test_df = pd.read_csv(f'{input_dir}/test.csv')
# Preview the first rows of the training table.
df.head()

# Exploratory Data Analysis (EDA)

## Check for missing values and duplicate rows.

In [None]:
# Boolean mask with one entry per row: True where the row repeats an earlier one.
duplicates = df.duplicated()

# The mask is as long as df, so `.empty` would only tell us whether df has any
# rows at all; `.any()` is what reports whether at least one duplicate exists.
if duplicates.any():
    print("We have duplicates so we drop them.")
    # Keep the first occurrence of every repeated row.
    df = df.drop_duplicates()
else:
    print("We don't have duplicates.")

# Report whether any cell in the frame is missing.
if df.isna().any().any():
    print("We have missing values")
else:
    print("We don't have missing values!!")

## Check the data types of the dataset.

In [None]:
# Show the dtype of every column in df.
df.dtypes

## Change the data type of some columns

In [None]:
# Columns that are handled as categories rather than as numbers.
categorical_columns = ['Geography','Gender','NumOfProducts','HasCrCard','IsActiveMember','Tenure']

# Name of the label column.
target = 'Exited'

# Cast the label and every categorical column to str in a single pass.
df = df.astype({name: 'str' for name in [target, *categorical_columns]})
df.info()

## Summary statistics for the numerical columns.
### The statistics below show some unexpected values that we must check during our analysis.
### We must check the data distribution of the following columns:
- Age
- Balance
- EstimatedSalary

In [None]:
# Summary statistics of df.
df.describe()

## Visualize Data

### Age Distribution.

In [None]:
# Histogram of Age with a KDE overlay; keep the Axes so that the markers and
# labels below are attached to this plot explicitly.
age_ax = sns.histplot(data=df, x='Age', bins=20, color='green', edgecolor='black', kde=True)

# Central tendency of the column.
mean_value = df["Age"].mean()
median_value = df["Age"].median()

# Dashed vertical markers: red for the mean, blue for the median.
for marker_x, marker_color, marker_name in ((mean_value, 'red', 'Mean'), (median_value, 'blue', 'Median')):
    age_ax.axvline(marker_x, color=marker_color, linestyle='dashed', linewidth=2,
                   label=f'{marker_name}: {marker_x:.2f}')

# Title, axis labels and legend.
age_ax.set_title("Distribution of Age.")
age_ax.set_xlabel('Age')
age_ax.set_ylabel("Count")
age_ax.legend()

## Distribution of the Balance column.

In [None]:
# Histogram of the Balance column with a KDE overlay.
sns.histplot(data = df, x ='Balance',bins = 20,color='green',edgecolor='#26090b',kde=True)

# Mean and median of Balance, drawn as reference lines below.
mean_value = df["Balance"].mean()
median_value = df["Balance"].median()

# Dashed vertical markers: red for the mean, blue for the median.
plt.axvline(mean_value, color='red', linestyle='dashed', linewidth=2, label=f'Mean: {mean_value:.2f}')
plt.axvline(median_value, color='blue', linestyle='dashed', linewidth=2, label=f'Median: {median_value:.2f}')

# Title (previously rendered as "BalanceBalance"), axis labels and legend.
plt.title("Distribution of Balance.")
plt.xlabel('Balance')
plt.ylabel("Count")
plt.legend()

## Distribution of the Estimated Salary

In [None]:
# Histogram of EstimatedSalary with a KDE overlay, drawn on an explicit Axes.
salary_ax = sns.histplot(
    data=df,
    x='EstimatedSalary',
    bins=20,
    color='green',
    edgecolor='#26090b',
    kde=True,
)

# Mean and median of the column, used for the two reference lines.
salary = df["EstimatedSalary"]
mean_value = salary.mean()
median_value = salary.median()

# Shared look of both reference lines; only colour and label differ.
reference_line = dict(linestyle='dashed', linewidth=2)
salary_ax.axvline(mean_value, color='red', label=f'Mean: {mean_value:.2f}', **reference_line)
salary_ax.axvline(median_value, color='blue', label=f'Median: {median_value:.2f}', **reference_line)

# Title, axis labels and legend.
salary_ax.set_title("Distribution of EstimatedSalary.")
salary_ax.set_xlabel('EstimatedSalary')
salary_ax.set_ylabel("Count")
salary_ax.legend()

In [None]:
# Grid with two plots per row; an odd number of columns gets one extra row.
n_plot_rows = len(categorical_columns) // 2 + len(categorical_columns) % 2
fig = plt.figure(figsize=(14, len(categorical_columns)*3))

# One count plot per categorical column, bars split by the target value.
for position, feature in enumerate(categorical_columns, start=1):
    panel = fig.add_subplot(n_plot_rows, 2, position)
    sns.countplot(x=feature, hue=target, data=df, ax=panel)
    panel.set_title(f"{feature} countplot by target", fontweight='bold')
    # Upper limit: size of the most frequent category plus a small margin.
    panel.set_ylim(0, df[feature].value_counts().max() + 10)

plt.tight_layout()
plt.show()

## Create a new dataset and drop id, CustomerId and Surname

In [None]:
# Working copy without the identifier-like columns; df itself is left unchanged.
dataset = df.drop(columns=['id','CustomerId','Surname'])
dataset.head()

In [None]:
# NOTE(review): this inspects `df`, not the `dataset` copy built in the previous
# cell — confirm which frame was meant.
df.info()

## Correlation Heatmap.
### From the following heatmap we can easily see that the columns most correlated with the target are Gender, Age, Balance and IsActiveMember.
### For the other columns, we must apply logistic regression or a correlation test to see whether they are statistically significant.

In [None]:
# Integer-encode every object-dtype column of `dataset` so that corr() can
# include it; the same encoder object is refitted for each column.
object_dtypes = dataset.dtypes[dataset.dtypes == "object"]
catcol = list(object_dtypes.index)
le = LabelEncoder()
for name in catcol:
    dataset[name] = le.fit_transform(dataset[name])

# Annotated heatmap of the pairwise correlations, colour scale fixed to [-1, 1].
plt.subplots(figsize=(10, 10))
correlations = dataset.corr()
heatmap_options = dict(
    square=True,
    cbar_kws={'shrink': .82},
    annot=True,
    vmin=-1,
    vmax=1,
    linewidths=3,
    linecolor='#e0b583',
    annot_kws={'fontsize': 12},
)
sns.heatmap(correlations, **heatmap_options)
plt.title("Pearson Correlation Of Features\n", fontsize=25)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

## We have to apply a test to see whether some columns are statistically significant
### You can use an ANOVA test or logistic regression.

In [None]:
# Columns whose relationship with the target is tested. Each column is listed
# once so that no test is run (and printed) twice.
columns_to_check = ['CreditScore','EstimatedSalary','HasCrCard','Tenure','Geography']

# One-way ANOVA of Exited against each column.
for column in columns_to_check:
    # Columns declared in `categorical_columns` hold integer codes that are
    # labels, not magnitudes, so they enter the formula as factors via C(...);
    # the remaining columns stay numeric predictors.
    term = f'C({column})' if column in categorical_columns else column

    # Fit an Ordinary Least Squares (OLS) model
    formula = f'Exited ~ {term}'
    model = ols(formula, data=dataset).fit()

    # Type-II ANOVA table for the fitted model
    anova_table = sm.stats.anova_lm(model, typ=2)

    print(f"ANOVA results for {column}:")
    print(anova_table)
    print("\n")


In [None]:
# Univariate logistic regression of Exited on each candidate column.
for column in columns_to_check:
    # Design matrix: an intercept plus the single predictor.
    design = sm.add_constant(dataset[column])

    # Fit the model and keep the fitted result for the report below.
    result = sm.Logit(dataset['Exited'], design).fit()

    # Coefficient table and fit statistics for this column.
    print(f"Logistic regression results for {column}:")
    print(result.summary())
    print('\n')


### All the p-values are close to zero, so these columns are statistically significant with respect to the target variable.

In [None]:
# Remove the identifier columns from the modelling frame. Rebinding with
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and the call is a no-op instead of raising KeyError.
df = df.drop(columns=['id','CustomerId','Surname'], errors='ignore')
# The label was cast to str in an earlier cell; turn it back into integers.
df[target] = df[target].astype('int')


In [None]:
# Column names, non-null counts and dtypes of df at this point of the notebook.
df.info()

# Prepare for modeling


In [None]:
# Features (every column except the label) and the label itself.
X = df.drop(['Exited'],axis = 1)
y = df['Exited']

# Hold out 25% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.75, random_state = 42)

# Dense one-hot encoder. scikit-learn 1.2 renamed the `sparse` keyword to
# `sparse_output` and later releases removed the old name, so the new keyword is
# tried first and the old one is only used when the installed version rejects it.
try:
    categorical_encoder = OneHotEncoder(sparse_output = False, drop ='if_binary')
except TypeError:
    categorical_encoder = OneHotEncoder(sparse = False, drop ='if_binary')

# Object-dtype columns are one-hot encoded; all other columns are standardised.
transformer = ColumnTransformer([
    ('categorical_encoding', categorical_encoder, make_column_selector(dtype_include='object')),
    ('numerical_transformation', StandardScaler(), make_column_selector(dtype_exclude='object'))
])

# Fit on the training split only, then apply the fitted transformer to the
# held-out split.
X_train_scaled = transformer.fit_transform(X_train)
X_test_scaled = transformer.transform(X_test)

# Names of the columns produced by the one-hot encoder.
new_categorical_column_names = transformer.named_transformers_['categorical_encoding'] \
                                 .get_feature_names_out()

# Columns handled by the second (numerical) transformer; scaling keeps their names.
numerical_column_names = transformer.transformers_[1][2]

# Output order follows the transformer order: encoded categoricals, then numericals.
new_column_names = list(new_categorical_column_names) + list(numerical_column_names)

# Labelled view of the transformed training matrix.
transformed_df = pd.DataFrame(X_train_scaled, columns=new_column_names)

In [None]:
# Baseline: a 100-tree random forest trained on the transformed training split.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_scaled, y_train)

# Class predictions for the held-out split.
y_pred = random_forest.predict(X_test_scaled)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class report followed by the confusion matrix, each under its own heading.
for heading, metric in (("Classification Report:", classification_report),
                        ("Confusion Matrix:", confusion_matrix)):
    print(heading)
    print(metric(y_test, y_pred))


# Building an Artificial Neural Network

In [None]:
# Fully connected binary classifier: four ReLU hidden layers (6, 12, 12, 6 units)
# followed by a single sigmoid output unit.
hidden_layers = [
    tf.keras.layers.Dense(units=width, activation='relu')
    for width in (6, 12, 12, 6)
]
output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid')
ann = tf.keras.models.Sequential(hidden_layers + [output_layer])

# Adam optimiser with binary cross-entropy loss; accuracy is tracked per epoch.
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the transformed training split.
ann.fit(X_train_scaled, y_train, batch_size=128, epochs=30)

In [None]:
# Give the categorical columns of the submission frame the same str dtype that
# was applied to the training frame.
test_df = test_df.astype({name: 'str' for name in categorical_columns})
test_df.info()

# Apply the already-fitted transformer, then score the rows with the network.
X_submission_scaled = transformer.transform(test_df)
predictions = ann.predict(X_submission_scaled)


In [None]:
# Start from the sample submission file and overwrite its Exited column with the
# network's predictions.
submission_template = pd.read_csv('/kaggle/input/playground-series-s4e1/sample_submission.csv')
df_submission = submission_template.assign(Exited=predictions)
# Write the file without the index column, then preview the first ten rows.
df_submission.to_csv("submission.csv", index=False)
df_submission.head(10)