In [None]:
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, confusion_matrix


# ---------------------------------------------------------------------------
# Random forest on the loan data: load, encode, train, evaluate and report.
# ---------------------------------------------------------------------------
file_path = 'cleanLoanData.csv'
raw_data = pd.read_csv(file_path)

# Predictor columns the model is allowed to see, and the label column.
features = [
    'Married.Single', 'House_Ownership', 'Car_Ownership',
    'Profession', 'STATE', 'Income_Level', 'Age_Group'
]
target = 'Risk_Flag'

# Validate the schema before any transformation so a bad file fails with a
# clear message instead of an error further down the pipeline.
if target not in raw_data.columns:
    raise ValueError(f"The target column '{target}' does not exist in the dataset.")
missing_features = [col for col in features if col not in raw_data.columns]
if missing_features:
    raise ValueError(f"The feature columns {missing_features} do not exist in the dataset.")

# Restrict the frame to the declared predictors plus the label. Without this
# step every other column in the CSV would be fed to the model while the
# importance report below only shows the columns listed in `features`.
model_data = raw_data[features + [target]]

# Columns that are always one-hot encoded.
categorical_features = ['Profession', 'STATE', 'Income_Level', 'Age_Group']
# Any remaining predictor that is not numeric is encoded as well; this list is
# empty when those columns are already numeric in the file.
text_features = [
    col for col in features
    if col not in categorical_features
    and not pd.api.types.is_numeric_dtype(model_data[col])
]
data = pd.get_dummies(
    model_data,
    columns=categorical_features + text_features,
    drop_first=True,
)

X = data.drop(columns=[target])
y = data[target]

# Training set 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the second entry of
# rf_model.classes_; it is used as the ROC score below.
y_pred_prob = rf_model.predict_proba(X_test)[:, 1]
y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

# Confusion matrix (rows: actual, columns: predicted). It is reported as text
# in the summary at the end of the cell; a seaborn heatmap is not drawn
# because seaborn is not imported in this cell.
conf_matrix = confusion_matrix(y_test, y_pred)

# Per-column importances as reported by the fitted forest.
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf_model.feature_importances_
})


def _source_feature(column):
    """Return the entry of `features` that an encoded column was derived from.

    One-hot columns are named '<feature>_<level>' by get_dummies; a column that
    matches no declared feature is returned unchanged.
    """
    for name in features:
        if column == name or column.startswith(f'{name}_'):
            return name
    return column


# Combine one-hot encoded features back into their source feature.
feature_importances['Category'] = feature_importances['Feature'].map(_source_feature)

# Filter to include only the specified columns, then sum per source feature.
filtered_features = list(features)
aggregated_importances = (
    feature_importances[feature_importances['Category'].isin(filtered_features)]
    .groupby('Category')['Importance']
    .sum()
    .reset_index()
    .sort_values(by='Importance', ascending=False)
)

# Plot aggregated feature importance
plt.figure(figsize=(10, 6))
plt.barh(aggregated_importances['Category'], aggregated_importances['Importance'], color='skyblue')
plt.xlabel('Feature Importance')
plt.ylabel('Feature Category')
plt.title('Aggregated Feature Importance Plot')
plt.gca().invert_yaxis()  # barh draws the first row at the bottom; flip so the largest is on top
plt.grid(axis='x')
plt.show()

# Plot the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()

# Display outputs
print("Filtered Aggregated Feature Importances:")
print(aggregated_importances)
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nModel Evaluation:")
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report_output)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve
#import seaborn as sns


# ---------------------------------------------------------------------------
# Linear SVM on the loan data: load, encode, scale, train, evaluate and report.
# ---------------------------------------------------------------------------
data_path = 'cleanLoanData.csv'  # Adjust this to the dataset's location

# Filter relevant columns: the predictors plus the Risk_Flag label.
selected_features = [
    'Married.Single', 'House_Ownership',
    'Car_Ownership', 'Profession', 'STATE',
    'Risk_Flag', 'Income_Level', 'Age_Group',
]
# .copy() gives the encoding loop below an independent frame to write into.
data = pd.read_csv(data_path)[selected_features].copy()

# Replace every object-dtype column with integer codes. The fitted encoders
# are kept per column so the codes can be mapped back to the original labels.
categorical_columns = data.select_dtypes(include=['object']).columns

label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

X = data.drop('Risk_Flag', axis=1).values  # Features
y = data['Risk_Flag'].values  # Target

# Perform 70/30 split for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only, then apply it to the test rows,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train SVM
# NOTE(review): probability=True is only needed for predict_proba, which this
# cell never calls; confirm no other cell relies on it before dropping it.
svm_model = SVC(kernel='linear', random_state=42, probability=True)
svm_model.fit(X_train_scaled, y_train)

# Evaluate on the held-out 30%.
y_test_pred = svm_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Testing Data Accuracy:", test_accuracy)
print("\nConfusion Matrix (Testing Data):")
print(confusion_matrix(y_test, y_test_pred))
print("\nClassification Report (Testing Data):")
print(classification_report(y_test, y_test_pred))

# ROC curve. The decision-function scores are computed once and shared by the
# AUC and the curve.
test_scores = svm_model.decision_function(X_test_scaled)
roc_auc = roc_auc_score(y_test, test_scores)
print(f"ROC AUC Score: {roc_auc}")

fpr, tpr, thresholds = roc_curve(y_test, test_scores)
plt.figure()
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
plt.legend()
plt.grid()
plt.show()

# Coefficients of the linear decision function, one per (scaled) input column.
# NOTE(review): for columns that went through LabelEncoder the integer codes
# carry an ordering the original labels may not have, so read those
# coefficients with care.
feature_importance = svm_model.coef_[0]
feature_names = data.drop('Risk_Flag', axis=1).columns

# Feature importance
for name, importance in zip(feature_names, feature_importance):
    print(f"Feature: {name}, Importance: {importance:.4f}")

# Plot feature importance
plt.figure()
plt.barh(feature_names, feature_importance)
plt.xlabel('Coefficient Value')
plt.ylabel('Features')
plt.title('Feature Importance via SVM Coefficients')
plt.show()
