In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import seaborn as sns
from scipy import stats
import missingno as msno

## 1. Load Dataset

In [None]:
# --- Configuration: every path the notebook reads from or writes to ---

# Directory that receives the saved figures (created by the load cell if missing).
plots_save_dir = 'plots_removed_outliers/'

# Input files: the pickled attribute-name lookup and the raw CSV dataset.
attribute_names_file_path = 'data/Polish_Banks/attr_names.pickle'
dataset_filepath = 'data/Polish_Banks/polish_banks_dataset.csv'

In [None]:
# Load the raw dataset from the configured CSV path.
df = pd.read_csv(dataset_filepath)

# Load the pickled lookup that is later indexed by attribute id (e.g. 'Attr37').
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file when it comes from a trusted source.
with open(attribute_names_file_path, 'rb') as f:
    attr_names = pickle.load(f)

# exist_ok=True keeps the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(plots_save_dir, exist_ok=True)

## 2. Plots

In [None]:
# Drop the 'Unnamed: 0' column (presumably a row index written out when the CSV
# was saved -- TODO confirm). errors='ignore' makes this cell idempotent: since
# it reassigns `df`, re-running it without reloading would otherwise raise
# KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Look up the entry stored under the key 'Attr37' in the loaded attribute-name
# lookup; as the cell's last expression, the value is rendered as the cell output.
attr_names['Attr37']

In [None]:
# Summary statistics, transposed so that each dataset column is one row.
display(df.describe().T)

# One row per dataset column ("index") together with its dtype ("type").
datatypes = pd.DataFrame(df.dtypes, columns=["type"]).reset_index()

# Bar chart of how many columns there are per dtype.
datatypes.groupby("type").count().plot(kind='bar', legend=False)

# Split the column names into object-typed and non-object-typed columns;
# `non_object_values` is what the outlier filtering below operates on.
is_object_column = datatypes["type"] == "object"
object_values = datatypes.loc[is_object_column, "index"].values
non_object_values = datatypes.loc[~is_object_column, "index"].values

# Only the last expression of a cell is rendered automatically, so these
# intermediate results must be displayed explicitly or they are silently lost.
display(df[object_values].head())
display(df.nunique().sort_values())

# Visualise where values are missing (per row and per column).
msno.matrix(df)
msno.bar(df)

# Count NaNs per column and keep only the columns that have at least one.
attributes_with_missing_data = df.isnull().sum(axis=0)
attributes_with_missing_data = attributes_with_missing_data[attributes_with_missing_data > 0]
attributes_with_missing_data = pd.DataFrame(attributes_with_missing_data, columns=["num_nans"])

# sort_values returns a new frame; keep it and display it explicitly, otherwise
# the sorted table is computed and immediately thrown away.
attributes_with_missing_data = attributes_with_missing_data.sort_values("num_nans", ascending=False)
display(attributes_with_missing_data)

print('=========')
# Attempt 1: z-score filtering. This works poorly on this data because the mean
# and standard deviation are themselves heavily skewed by the extreme values.
df_no_obj_val = df[non_object_values]

# Absolute z-score of every value using the population std (ddof=0).
z_score_abs = np.abs((df_no_obj_val - df_no_obj_val.mean())/df_no_obj_val.std(ddof=0))
# Persist the scores for offline inspection (written to the working directory).
z_score_abs.to_csv('z_score')

# Keep the rows in which no feature is more than 3 standard deviations from its
# mean. The 'class' column (skipped as a non-feature by the plotting loops) is
# left out of the mask: presumably it is the target label, and a rare label
# value would otherwise exceed the threshold and drop every row of that class.
# NOTE(review): confirm that 'class' is the label column.
feature_z_scores = z_score_abs.drop(columns='class', errors='ignore')
df_new = df_no_obj_val[~((feature_z_scores > 3).any(axis=1))]
display(df_new)
display(df_new.describe().T)


# Attempt 2: quantile-based filtering.
# NOTE: despite the names, Q1/Q3 are the 1st and 99th percentiles (not the
# quartiles), so "IQR" is the 1%-99% spread rather than the interquartile range.
Q1 = df_no_obj_val.quantile(0.01) # Same as np.percentile but maps (0,1) and not (0,100)
Q3 = df_no_obj_val.quantile(0.99)
IQR = Q3 - Q1

# True where a value lies outside the fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
outside_fences = (df_no_obj_val < (Q1 - 1.5 * IQR)) | (df_no_obj_val > (Q3 + 1.5 * IQR))

# Boolean Series that is True for rows with NO out-of-fence value in any feature.
# The 'class' column is excluded from the check: presumably it is the target
# label, and a rare label value must not cause its rows to be discarded.
# NOTE(review): confirm that 'class' is the label column.
condition = ~outside_fences.drop(columns='class', errors='ignore').any(axis=1)

# Keep only the rows that passed the check; later cells plot from `df_new`.
df_new = df_no_obj_val[condition]
display(df_new)
display(df_new.describe().T)



In [None]:
# Compare the largest Attr5 value before and after outlier filtering.
attr5_max = df.Attr5.max()
print(attr5_max)

# Show the row(s) holding that maximum. The value is looked up from the data
# instead of being hard-coded, so the cell keeps working if the dataset changes
# and does not depend on an exact float literal matching.
display(df[df.Attr5 == attr5_max])

print(df_new.Attr5.max())

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only. Restricting
# the frame explicitly avoids the error that DataFrame.corr raises on
# object-typed columns in pandas >= 2.0.
corr = df.select_dtypes(include=["number", "bool"]).corr()

# Generate a mask for the upper triangle (the matrix is symmetric, so only the
# lower triangle is drawn).
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Pairwise correlation of numeric attributes");

In [None]:
colnames = list(df.columns)

# Disabled: full pairwise scatter/KDE grid over all columns (takes 10 min to run).
# for colname_x in colnames:
#     for colname_y in colnames:
#         plt.figure()
#         if colname_x == colname_y and colname_x != 'class':
#             plot = sns.kdeplot(data=df, x=colname_x)
#             fig = plot.get_figure()
#             fig.savefig(f"{plots_save_dir}/kdeplot_{colname_x}")
#         else:
#             plot = sns.scatterplot(data=df, x=colname_x, y=colname_y)
#             fig = plot.get_figure()
#             fig.savefig(f"{plots_save_dir}/scatterplot_{colname_x}_{colname_y}")
#         plt.close()

# Save one KDE plot per column of the outlier-filtered frame.
for colname in colnames:
    # Skip 'class', and skip columns that exist in `df` but not in `df_new`
    # (object-typed columns are removed before filtering). The check happens
    # before a figure is created so that skipped columns do not leak figures.
    if colname == 'class' or colname not in df_new.columns:
        continue
    fig, ax = plt.subplots()
    sns.kdeplot(data=df_new, x=colname, ax=ax)
    # os.path.join avoids the doubled slash, since plots_save_dir already ends in '/'.
    fig.savefig(os.path.join(plots_save_dir, f"kdeplot_{colname}"))
    plt.close(fig)

In [None]:
# Save one violin plot per column of the outlier-filtered frame.
for colname in colnames:
    # Skip 'class', and skip columns that exist in `df` but not in `df_new`
    # (object-typed columns are removed before filtering).
    if colname == 'class' or colname not in df_new.columns:
        continue
    # Draw on an explicit, fresh figure so each plot is independent of whatever
    # figure happened to be current, and close exactly that figure afterwards.
    fig, ax = plt.subplots()
    sns.violinplot(data=df_new, x=colname, ax=ax)
    # os.path.join avoids the doubled slash, since plots_save_dir already ends in '/'.
    fig.savefig(os.path.join(plots_save_dir, f"violinplot_{colname}"))
    plt.close(fig)