importing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from scipy.stats import kurtosis, skew
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score

Reading data

In [None]:
data = pd.read_csv(r"C:\Users\dimas\OneDrive\Studying Data\archive\credit_risk_dataset.csv")
data0 = data

taking a peek on it

In [None]:
print(data.head())
print(data.dtypes)
data.shape

deleting duplicates or data with missing values

In [None]:
data=data.drop_duplicates().dropna()

figuring out if number of both values of response variable is equal

In [None]:
num_st = data['loan_status'].value_counts()
plt.pie(num_st)

changing type to boolean for two variables

In [None]:
mapping = {"Y": True, "N": False}
data['cb_person_default_on_file']=data['cb_person_default_on_file'].replace(mapping).astype('bool')
mapping1 = {"1": True, "0": False}
data['loan_status'] = data['loan_status'].replace(mapping1).astype('bool')

dropping negative values(they are not expected due to the nature of the dataset

In [None]:
data_numerical = data.select_dtypes(include=['int','float'])
data_non_numerical = data.select_dtypes(exclude=['int', 'float'])
negative_mask = data_numerical < 0
data_numerical = data_numerical[~negative_mask]
data = data_numerical.join(data_non_numerical)

In [None]:
data.dtypes

In [None]:
print(data_numerical.columns)
print(data.columns)
print(data_non_numerical.columns)

removing non-variant features

In [None]:
selector = VarianceThreshold()
selector.fit(data_numerical)
mask = selector.get_support()
data_numerical = data_numerical[data_numerical.columns[mask]]
data = pd.concat([data_numerical, data_non_numerical], axis=1)

figuring out if there is dominantly correlated with variable variable(turns out there isn't)

In [None]:
plt.figure(figsize = (14,14))
plt.title('Credit Risk correlation plot (Pearson)')
corr = data.corr()
sns.heatmap(corr,xticklabels=corr.columns,yticklabels=corr.columns,linewidths=.1,cmap="Greens")
plt.show()

getting curtosis and skewness of uncleaned from outliers data(it's mostly close to being normal with some exceptions)

In [None]:
numerical_columns = data.select_dtypes(include=['int', 'float','bool']).columns
for col in numerical_columns:
    print(f"Column: {col}")
    print(f"Kurtosis: {kurtosis(data[col])}")
    print(f"Skewness: {skew(data[col])}")

During the exploration of the data,was found out that the most suitable for being response variable would be loan_status
Were noticed outliers in practically every numerical columns.Also,most of them are right-skewed,which possibly may mean that outliers tend to be bigger than the
mean.Using boxplot has supported that hypothesis.There doesn't seem to be any dominate independent feature(in terms of correlation with response variable)and
there are not any non-variate which would need to be removed.Now it is considerable to move into data cleaning(which I have already started by dropping empty values and duplicates(which was the more acceptable decision than imputation/prediction in my case)

In [None]:
data.dtypes

defining function to remove outliers

In [None]:
def remove_outliers_zscore(df, threshold):
    df_numerical = df.select_dtypes(include=['int', 'float', 'bool'])
    df_non_numerical = df.drop(df_numerical.columns, axis=1)
    z_scores = abs(df_numerical - df_numerical.mean()) / abs(df_numerical.std())
    df_numerical = df_numerical[(z_scores < threshold) & (z_scores > -threshold)]
    df = pd.concat([df_numerical, df_non_numerical], axis=1)
    return df

removing outliers

In [None]:
data = remove_outliers_zscore(data,3)

getting kurtosis and skew for the cleaned data

In [None]:
numerical_columns = data.select_dtypes(include=['int', 'float','bool']).columns
for col in numerical_columns:
    print(f"Column: {col}")
    print(f"Kurtosis: {kurtosis(data[col])}")
    print(f"Skewness: {skew(data[col])}")

In [None]:
data.dtypes

scaling data

In [None]:
def scale_data(df):
    df_scaled = df.copy()
    numerical_columns = df.select_dtypes(include=['int', 'float', 'bool']).columns
    scaler = StandardScaler()
    df_scaled[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    original_df_types = df.dtypes
    df_scaled = df_scaled.astype(original_df_types)
    return df_scaled

In [None]:
scale_data(data)

In [None]:
data.dtypes

getting current distributions of cleaned data

In [None]:
plt.figure(figsize=(20,3))
sns.boxplot(data=data)
plt.show()

In [None]:
print(data.shape)
print(data.shape)
print([data.describe(),data.describe()])

getting another look at distributions

In [None]:
def histplot_every_column(data):
 for column in data.columns:
    plt.figure()
    plt.hist(data[column])
    plt.axvline(data[column].mean(), color='r', linestyle='dashed', linewidth=2)
    plt.axvline(data[column].median(), color='g', linestyle='dashed', linewidth=2)
    plt.title(column + " Histogram")
    plt.show()

In [None]:
histplot_every_column(data=data)

In [None]:
print(data.shape)

one-hot encoding dataset

In [None]:
encoded_data = data.copy()
object_columns = data.select_dtypes(include='object').columns
encoded_data = pd.get_dummies(encoded_data, columns=object_columns, prefix=object_columns, prefix_sep='_')
print(encoded_data)

In [None]:
data = encoded_data

In [None]:
def convert_uint8_to_boolean(data):
    uint8_columns = data.select_dtypes(include='uint8').columns
    def convert_uint8_to_boolean(x):
        if x == 0:
            return False
        else:
            return True
    data[uint8_columns] = data[uint8_columns].applymap(convert_uint8_to_boolean)
    data[uint8_columns] = data[uint8_columns].astype(bool)
    return data
data = convert_uint8_to_boolean(data)

In [None]:
data.dtypes

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

creating,training,testing and evaluating model

In [80]:
rf = RandomForestClassifier(n_estimators=19)
rf.fit(X_train, y_train)
y_pred_proba = rf.predict_proba(X_test)
y_pred = (y_pred_proba[:, 1] >= 0.3).astype(int)
cm = confusion_matrix(y_test, y_pred)
print(cm)
f1 = f1_score(y_test, y_pred)
print(f1)

[[4190  251]
 [ 295  965]]
0.7794830371567044


In [73]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
f1 = f1_score(y_test, y_pred)
print(f1)

[[4378   63]
 [1084  176]]
0.23482321547698465


In [None]:
git config