In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the CSV export into the working DataFrame used by every later cell.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook cannot be re-run on another machine without editing it.
credit_csv_path = r"C:\Users\chand\Documents\SQL data\Python Data\Default Credit data.csv"
credit = pd.read_csv(credit_csv_path)

In [None]:
# Show the (row count, column count) of the freshly loaded frame as the cell output.
credit.shape

In [None]:
# Lift the column-display cap so wide frames are rendered without elided columns.
pd.options.display.max_columns = None



In [None]:
# List the column dtypes in two passes: text-like (object) columns first,
# then numeric columns, to see which features still need encoding.
for dtype_group in ('object', 'number'):
    print(credit.select_dtypes(include=[dtype_group]).dtypes)


In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the two categorical columns. drop='first' omits the first
# category of each column, removing the redundant (collinear) dummy column.
categorical_cols = ['home_ownership', 'purpose']
encoder = OneHotEncoder(sparse_output=False, drop='first')

encoded_columns = encoder.fit_transform(credit[categorical_cols])

# Carry over credit's index explicitly: a DataFrame built from a bare array
# gets a fresh 0..n-1 index, and pd.concat(axis=1) aligns rows on index labels,
# so any other index on `credit` would produce misaligned, NaN-padded rows.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=credit.index,
)

# Replace the raw categorical columns with their encoded counterparts.
credit = pd.concat([credit.drop(columns=categorical_cols), encoded_df], axis=1)

print(credit)


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Remove the id, zip_code and addr_state columns so they are not part of the
# feature matrix derived from `credit` in the following cells.
# Rebinding the result (rather than inplace=True) keeps the step explicit and
# chainable; the resulting frame is the same.
credit = credit.drop(columns=['id', 'zip_code', 'addr_state'])

In [None]:
# Feature frame: every remaining column except the loan_status target.
pre_scaled_data = credit.drop('loan_status', axis='columns')

In [None]:
# Standardise the feature columns: fit_transform learns the scaling statistics
# from pre_scaled_data and applies them to that same data in one call.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(pre_scaled_data)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import numpy as np

# Replace any missing values in the scaled matrix with that column's mean.
imputer = SimpleImputer(strategy='mean')
scaled_data_cleaned = imputer.fit_transform(scaled_data)

# First pass: PCA with every component kept, to obtain the explained-variance
# ratio of each component.
pca = PCA()
X_pca_pre = pca.fit_transform(scaled_data_cleaned)

# Smallest number of leading components whose cumulative explained variance
# reaches 95%: argmax returns the first True position, +1 converts it to a count.
cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()
reaches_target = cumulative_explained_variance >= 0.95
n_components = reaches_target.argmax() + 1

# Second pass: refit PCA restricted to that many components.
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(scaled_data_cleaned)

print(f"Number of components: {n_components}")
print(f"Shape of PCA-transformed data: {X_pca.shape}")


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Rows with no loan_status have no label to learn from: discard them, then
# take the label column as the prediction target.
target_col = 'loan_status'
credit = credit.dropna(subset=[target_col])

y = credit[target_col]


In [None]:
# Keep only the rows of the imputed feature matrix whose label survived the
# loan_status filter; credit.index is used here as positional row numbers.
# NOTE(review): X_pca was computed before this filter and is not re-derived
# here. Re-running this cell would index the already-filtered array again.
rows_with_label = credit.index
scaled_data_cleaned = scaled_data_cleaned[rows_with_label]

In [None]:
# X_pca holds one row per row of the frame as it was when PCA was fitted, but
# `credit` (and therefore y) has since lost the rows whose loan_status is
# missing. Select the surviving rows so X and y have the same length and stay
# row-aligned; passing the unfiltered X_pca makes train_test_split fail with
# inconsistent sample counts as soon as one row has been dropped.
# NOTE(review): this treats credit.index labels as row positions, which holds
# for the default 0..n-1 index produced by read_csv — confirm nothing
# re-indexes the frame before this point.
X = X_pca[credit.index.to_numpy()]

# 70/30 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Logistic regression on the PCA features (fit returns the fitted estimator).
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)

# Report hold-out accuracy and the per-class precision/recall/F1 table.
print("Logistic Regression Results:")
log_reg_accuracy = accuracy_score(y_test, y_pred_log)
print(f"Accuracy: {log_reg_accuracy:.4f}")
print(classification_report(y_true=y_test, y_pred=y_pred_log))

# Random forest with 100 trees on the same train/test split.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Report hold-out accuracy and the per-class precision/recall/F1 table.
print("\nRandom Forest Results:")
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {rf_accuracy:.4f}")
print(classification_report(y_true=y_test, y_pred=y_pred_rf))


# Gradient-boosted trees (XGBoost) on the same train/test split, evaluated
# internally with log-loss.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42).fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Report hold-out accuracy and the per-class precision/recall/F1 table.
print("\nXGBoost Results:")
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy: {xgb_accuracy:.4f}")
print(classification_report(y_true=y_test, y_pred=y_pred_xgb))


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Fresh, unfitted base learners with the same settings as the individually
# trained models; these rebind log_reg / rf / xgb.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
rf = RandomForestClassifier(random_state=42, n_estimators=100)
xgb = XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)

# Soft voting: the ensemble combines the base learners' predicted class
# probabilities rather than their hard class votes.
base_learners = [
    ('Logistic Regression', log_reg),
    ('Random Forest', rf),
    ('XGBoost', xgb),
]
ensemble = VotingClassifier(estimators=base_learners, voting='soft')

# Train on the training split, then predict the held-out split.
ensemble.fit(X_train, y_train)
y_pred_ensemble = ensemble.predict(X_test)

# Report hold-out accuracy and the per-class precision/recall/F1 table.
print("Ensemble Results:")
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
print(f"Accuracy: {ensemble_accuracy:.4f}")
print(classification_report(y_true=y_test, y_pred=y_pred_ensemble))
