In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
# !pip install missingno
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif


In [35]:
# Load the datasets
def load_application_train():
    data = pd.read_csv("application_train.csv")
    return data

def load():
    data = pd.read_csv("titanic.csv")
    return data

In [36]:
# See the shape of the datasets
df_application = load_application_train()
print(df_application.shape)  # (307511, 122)

df_titanic = load()
print(df_titanic.shape)  # (891, 12)

(9692, 122)
(891, 12)


In [37]:
# Data Preprocessing for application_train dataset

# Handle Missing Values
# For numerical features
num_imputer = SimpleImputer(strategy='mean')
numerical_columns = df_application.select_dtypes(include=[np.number]).columns
df_application[numerical_columns] = num_imputer.fit_transform(df_application[numerical_columns])

# For categorical features
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_columns = df_application.select_dtypes(include=[object]).columns
df_application[categorical_columns] = cat_imputer.fit_transform(df_application[categorical_columns])

# Encode Categorical Variables
label_encoder = LabelEncoder()
for column in categorical_columns:
    df_application[column] = label_encoder.fit_transform(df_application[column].astype(str))

# Detect and Remove Outliers using LOF
lof = LocalOutlierFactor()
outliers = lof.fit_predict(df_application[numerical_columns])
mask = outliers != -1
df_application = df_application[mask]

# Scale Numerical Features
scaler = StandardScaler()
df_application[numerical_columns] = scaler.fit_transform(df_application[numerical_columns])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_application[numerical_columns] = scaler.fit_transform(df_application[numerical_columns])


In [38]:
# Feature Selection (if needed)
X_application = df_application.drop('TARGET', axis=1)
y_application = df_application['TARGET']

best_features_application = SelectKBest(score_func=f_classif, k='all')
fit_application = best_features_application.fit(X_application, y_application)
df_scores_application = pd.DataFrame(fit_application.scores_)
df_columns_application = pd.DataFrame(X_application.columns)

feature_scores_application = pd.concat([df_columns_application, df_scores_application], axis=1)
feature_scores_application.columns = ['Feature', 'Score']
print(feature_scores_application)

                        Feature      Score
0                    SK_ID_CURR   0.221707
1            NAME_CONTRACT_TYPE  14.834037
2                   CODE_GENDER  32.747388
3                  FLAG_OWN_CAR   1.789539
4               FLAG_OWN_REALTY   2.509070
..                          ...        ...
116   AMT_REQ_CREDIT_BUREAU_DAY   0.554292
117  AMT_REQ_CREDIT_BUREAU_WEEK   0.274282
118   AMT_REQ_CREDIT_BUREAU_MON   2.136112
119   AMT_REQ_CREDIT_BUREAU_QRT   0.162848
120  AMT_REQ_CREDIT_BUREAU_YEAR   0.660268

[121 rows x 2 columns]


  f = msb / msw


In [39]:
# Final Data Preparation
X_application = df_application.drop('TARGET', axis=1)
y_application = df_application['TARGET']


In [40]:
# Data Preprocessing for titanic dataset

# Handle Missing Values
# For numerical features
numerical_columns_titanic = df_titanic.select_dtypes(include=[np.number]).columns
df_titanic[numerical_columns_titanic] = num_imputer.fit_transform(df_titanic[numerical_columns_titanic])

# For categorical features
categorical_columns_titanic = df_titanic.select_dtypes(include=[object]).columns
df_titanic[categorical_columns_titanic] = cat_imputer.fit_transform(df_titanic[categorical_columns_titanic])

# Encode Categorical Variables
for column in categorical_columns_titanic:
    df_titanic[column] = label_encoder.fit_transform(df_titanic[column].astype(str))

# Detect and Remove Outliers using LOF
outliers_titanic = lof.fit_predict(df_titanic[numerical_columns_titanic])
mask_titanic = outliers_titanic != -1
df_titanic = df_titanic[mask_titanic]

# Scale Numerical Features
df_titanic[numerical_columns_titanic] = scaler.fit_transform(df_titanic[numerical_columns_titanic])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_titanic[numerical_columns_titanic] = scaler.fit_transform(df_titanic[numerical_columns_titanic])


In [41]:
# Feature Selection (if needed)
X_titanic = df_titanic.drop('Survived', axis=1)
y_titanic = df_titanic['Survived']

best_features_titanic = SelectKBest(score_func=f_classif, k='all')
fit_titanic = best_features_titanic.fit(X_titanic, y_titanic)
df_scores_titanic = pd.DataFrame(fit_titanic.scores_)
df_columns_titanic = pd.DataFrame(X_titanic.columns)

feature_scores_titanic = pd.concat([df_columns_titanic, df_scores_titanic], axis=1)
feature_scores_titanic.columns = ['Feature', 'Score']
print(feature_scores_titanic)

        Feature       Score
0   PassengerId    0.289820
1        Pclass   86.048540
2          Name    2.787366
3           Sex  324.103107
4           Age    6.483032
5         SibSp    2.180161
6         Parch    5.746599
7        Ticket   22.830404
8          Fare   55.558396
9         Cabin   31.375437
10     Embarked   19.793207


In [42]:
# Final Data Preparation
X_titanic = df_titanic.drop('Survived', axis=1)
y_titanic = df_titanic['Survived']