In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Use the "ggplot" style sheet for every matplotlib figure drawn below.
plt.style.use("ggplot")
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings emitted by the
# modelling cells further down; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


In [None]:
# Read the credit-card CSV from the working directory into `df`,
# the frame that every later cell works from.
df = pd.read_csv(filepath_or_buffer="UCI_Credit_Card.csv")

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()


In [None]:
# Descriptive statistics for the columns of df.
df.describe()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Pairwise correlations between the columns of df, drawn as an annotated heatmap.
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    ax=ax,
)
ax.set_title("Correlation heatmap")
plt.show()

In [None]:
# Payment-history columns to compare against the default flag.
payment_history_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

# One count plot per column, with the bars split by the default flag.
for col in payment_history_columns:
    sns.countplot(x=col, hue="default.payment.next.month", data=df)
    plt.title(f"Default vs {col}")
    # The x-axis carries the values of the payment-history column itself,
    # not a bill amount, so label it accordingly.
    plt.xlabel(f"Payment status ({col})")
    plt.show()

In [None]:
# For each suffix 1..6, draw the BILL_AMT and PAY_AMT histograms side by side.
for period in range(1, 7):
    fig, (bill_ax, pay_ax) = plt.subplots(1, 2, figsize=(20, 10))

    # Left panel: bill amount.
    sns.histplot(df[f"BILL_AMT{period}"], bins=30, kde=True, ax=bill_ax)
    bill_ax.set_title(f"Bill Amount {period} Distributions")
    bill_ax.set_xlabel("Bill Amount")
    bill_ax.set_ylabel("Counts")

    # Right panel: payment amount.
    sns.histplot(df[f"PAY_AMT{period}"], bins=30, kde=True, ax=pay_ax)
    pay_ax.set_title(f"Payment Amount {period} Distributions")
    pay_ax.set_xlabel("Payment Amount")
    pay_ax.set_ylabel("Counts")

    plt.show()

In [None]:
# Box plot of AGE grouped by the default flag.
age_ax = sns.boxplot(data=df, x="default.payment.next.month", y="AGE")
age_ax.set_title("AGE VS DEFAULT")
age_ax.set_xlabel("DEFAULT PAYMENT")
age_ax.set_ylabel("AGE")
plt.show()

In [None]:
# Histogram of AGE (30 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(df["AGE"], bins=30, kde=True, ax=ax)
ax.set_title("AGE DISTRIBUTION")
ax.set_xlabel("AGE")
ax.set_ylabel("COUNT")
plt.show()

In [None]:
# Box plot of every column of df on one shared axis.
plt.figure(figsize=(20, 10))
sns.boxplot(data=df)
# Title the figure and end with plt.show(), like the other plotting cells,
# so the cell renders the plot instead of echoing the Axes object.
plt.title("Box plot of all columns")
plt.show()

In [None]:
# Separate the target column from the predictors.
# NOTE(review): X keeps every other column of df; if the file carries a row
# identifier column it is being used as a feature here. Confirm that is intended.
y = df['default.payment.next.month']
X = df.drop(columns='default.payment.next.month')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
Scaler = StandardScaler().fit(X_train)
X_train_scaled = Scaler.transform(X_train)
X_test_scaled = Scaler.transform(X_test)

In [None]:
# (rows, columns) of the full dataset.
df.shape

In [None]:
# (rows, columns) of the training predictors after the split.
X_train.shape

In [None]:
# Shape of the training target; its length should match X_train's row count.
y_train.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# Candidate classifiers to compare, keyed by the name used when reporting.
# The decision tree and random forest draw random numbers while fitting, so
# they are seeded (with the same value as the train/test split) to make the
# accuracy comparison repeatable from run to run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
}


In [None]:
# Fit every candidate on the scaled training data and record its accuracy on
# the scaled test data, keyed by model name.
results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    # Only the score is kept; the raw prediction arrays are not printed, since
    # unlabelled arrays add output without telling the reader anything.
    results[name] = accuracy_score(y_test, y_pred)


In [None]:
# Report the test accuracy of each candidate.
# The loop variable is deliberately not called `model`, so the estimator bound
# to that name is not replaced by a string for whatever cell runs next.
for model_name, acc in results.items():
    print(f"{model_name} Accuracy: {acc:.4f}")


In [None]:
from sklearn.linear_model import LogisticRegression
# Final model: logistic regression fitted on the scaled training data.
# max_iter matches the "Logistic Regression" entry in `models`, so the model
# evaluated and saved below has the same configuration as the one compared above.
# Warnings are suppressed in this notebook, so hitting the default iteration
# limit would otherwise pass unnoticed.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

# Predicted labels for the held-out test rows.
y_pred = model.predict(X_test_scaled)

In [None]:
# Display the predicted labels for the test set.
y_pred

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the test accuracy computed in the previous cell.
accuracy

In [None]:
# Text summary of the per-class metrics for the test predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# print() is needed here: the report is a multi-line string and a bare
# expression would show it with escaped newlines.
print(report)

In [None]:
# Rank the features by the absolute size of their fitted coefficient,
# largest first. coef_[0] is the first row of the model's coefficient array,
# labelled with the columns of X.
features_importance = (
    pd.Series(model.coef_[0], index=X.columns)
    .abs()
    .sort_values(ascending=False)
)
print(features_importance)

In [None]:
import pickle

In [None]:
# Persist the fitted model. The context manager closes the file handle (and
# flushes the write) even if pickling raises.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# The model was trained on inputs transformed by `Scaler`, so save the fitted
# scaler as well; without it the same transformation cannot be reproduced at
# prediction time.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(Scaler, scaler_file)