In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    if full_paths:  # skip directories without files so no blank line is printed
        print("\n".join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data importing

In [None]:
# Locations of the two competition CSV files inside the Kaggle input directory.
TRAIN_CSV_PATH = "/kaggle/input/credit-card-behaviour-score/Dev_data_to_be_shared.csv"
VALIDATION_CSV_PATH = "/kaggle/input/credit-card-behaviour-score/validation_data_to_be_shared.csv"

# `train` holds the development data, `test_val` the validation data.
train = pd.read_csv(TRAIN_CSV_PATH)
test_val = pd.read_csv(VALIDATION_CSV_PATH)

In [None]:
# Preview the first rows of the development data.
train.head()

In [None]:
# Number of rows per distinct value of the target column `bad_flag`.
train["bad_flag"].value_counts()

In [None]:
# Print the dtype of every column of the development data.
print(train.dtypes)


In [None]:
# Keep only the integer-typed columns of `train`.
# NOTE(review): select_dtypes takes (include, exclude) in that order, so passing
# `int, float` positionally means "include int, exclude float" -- float columns
# are NOT part of df_num. The keywords below make that explicit without changing
# the result. If float features were meant to be kept, switch to
# include=[int, float] -- TODO confirm intent (downstream models would then
# presumably need missing-value handling).
df_num = train.select_dtypes(include=int, exclude=float)
df_num

In [None]:
# Columns of df_num that hold exactly one distinct (non-null) value in `train`.
distinct_counts = train[df_num.columns].nunique()
columns_to_drop = distinct_counts.index[distinct_counts == 1].tolist()

columns_to_drop

In [None]:
# Inspect the value distribution of `onus_attribute_28`.
train["onus_attribute_28"].value_counts()

In [None]:
# Remove `onus_attribute_28` from the numeric feature frame (presumably the
# column flagged by the single-unique-value check -- verify).
# Rebinding instead of `inplace=True`, together with errors="ignore", keeps this
# cell idempotent: running it a second time is a no-op rather than a KeyError.
df_num = df_num.drop(columns="onus_attribute_28", errors="ignore")

In [None]:
# Separate the feature matrix X from the target vector y.
TARGET_COL = "bad_flag"
X = df_num.drop(columns=TARGET_COL)
y = df_num[TARGET_COL]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Split the dataset into training (70%) and testing (30%) sets.
# stratify=y keeps the proportion of each `bad_flag` class the same in both
# parts, so the minority class is not under- or over-represented in the
# hold-out set by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Initialize the Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)

In [None]:
# Fit the baseline logistic regression on the training split.
log_reg.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out split.
y_pred = log_reg.predict(X_test)

In [None]:
# Evaluation artefacts for the baseline model on the held-out split.
report = classification_report(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Print the baseline results: header, accuracy, confusion matrix, full report.
print(
    "Logistic Regression Results",
    "----------------------------",
    f"Accuracy: {accuracy:.2f}",
    sep="\n",
)
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", report, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE

In [None]:
# Apply SMOTE to balance the training dataset

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X, y)

print("Class Distribution After SMOTE:")
print(pd.Series(y_train_smote).value_counts())

In [None]:
# Split the dataset into training and testing sets
Xtrain, Xtest, ytrain, ytest = train_test_split(X_train_smote, y_train_smote, test_size=0.3, random_state=42)

In [None]:
# Logistic regression with balanced class weights, fitted on Xtrain / ytrain.
logreg_params = dict(class_weight='balanced', max_iter=1000, random_state=42)
model = LogisticRegression(**logreg_params)
model.fit(Xtrain, ytrain)

In [None]:
# Positive-class probabilities (second column of predict_proba) and hard labels
# from the logistic regression for the Xtest rows.
y_pred_proba = model.predict_proba(Xtest)[:, 1]
y_pred_new = model.predict(Xtest)

# Evaluate the model
print("\nClassification Report:", classification_report(ytest, y_pred_new), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(ytest, y_pred_new), sep="\n")

# Compute ROC-AUC score from the predicted probabilities
roc_auc = roc_auc_score(ytest, y_pred_proba)
print(f"\nROC-AUC Score: {roc_auc:.2f}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest fitted on Xtrain / ytrain.
# random_state is fixed (same seed as the other estimators in this notebook) so
# that a Restart & Run All reproduces the same forest and the same metrics.
rf = RandomForestClassifier(random_state=42)
rf.fit(Xtrain, ytrain)

In [None]:
# Random-forest outputs for Xtest: per-class probabilities, of which the second
# column (positive class) is kept, and the hard class labels.
rf_class_proba = rf.predict_proba(Xtest)
y_pred_proba = rf_class_proba[:, 1]
y_pred_new = rf.predict(Xtest)

In [None]:
# Evaluate the random forest: per-class report followed by the confusion matrix.
print("\nClassification Report:", classification_report(ytest, y_pred_new), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(ytest, y_pred_new), sep="\n")

In [None]:
 #Compute ROC-AUC score
roc_auc = roc_auc_score(ytest, y_pred_proba)
print(f"\nROC-AUC Score: {roc_auc:.2f}")

In [None]:
from sklearn.metrics import precision_recall_curve

# Adjust decision threshold
precision, recall, thresholds = precision_recall_curve(ytest, y_pred_proba)

# precision / recall contain one more entry than thresholds: the final point
# (precision=1, recall=0) has no associated threshold, so it is excluded here to
# keep every F1 index valid for `thresholds`.
candidate_precision = precision[:-1]
candidate_recall = recall[:-1]

# Find the threshold with the best F1. Where precision + recall == 0 the F1 is
# defined as 0; a plain division would yield NaN there and np.argmax would then
# pick the NaN entry instead of the true maximum.
pr_sum = candidate_precision + candidate_recall
f1_scores = np.divide(
    2 * candidate_precision * candidate_recall,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
optimal_idx = int(np.argmax(f1_scores))
optimal_threshold = thresholds[optimal_idx]

print(f"\nOptimal Threshold for F1-Score: {optimal_threshold:.2f}")

In [None]:
# Label a row positive when its predicted probability reaches the tuned threshold.
meets_threshold = y_pred_proba >= optimal_threshold
y_pred_new_threshold = meets_threshold.astype(int)

# Re-evaluate with the thresholded labels
print(
    "\nClassification Report with Tuned Threshold:",
    classification_report(ytest, y_pred_new_threshold),
    sep="\n",
)
print(
    "\nConfusion Matrix with Tuned Threshold:",
    confusion_matrix(ytest, y_pred_new_threshold),
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# Plot the ROC curve of the current probabilities against the chance diagonal.
fpr, tpr, _ = roc_curve(ytest, y_pred_proba)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='blue', label=f"ROC Curve (AUC = {roc_auc:.2f})")
ax.plot([0, 1], [0, 1], 'k--', label="Random Classifier")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()


In [None]:
# Plot precision against recall using the arrays from precision_recall_curve.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.', label="Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend()
plt.show()
