In [None]:
# Import necessary libraries
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.decomposition import PCA

# Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor


In [8]:
# Read the loan dataset used for the classification experiments
loan_data = pd.read_csv('loan_data.csv')

# Quick look at the raw frame: dimensions and a preview of the first rows
print("Classification dataset shape:", loan_data.shape)
print("\nFirst few rows:")
print(loan_data.head())

# Per-column count of missing entries
print("\nMissing values in classification dataset:")
print(loan_data.isna().sum())

# One-hot encode the categorical columns; drop_first removes one level per
# category so the resulting dummy columns are not redundant
loan_data_encoded = pd.get_dummies(loan_data, drop_first=True)

# Separate the target column from the predictors
y = loan_data_encoded['loan_status']
X = loan_data_encoded.drop(columns='loan_status')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"\nTraining set shape: {X_train.shape}, Test set shape: {X_test.shape}")


Classification dataset shape: (45000, 14)

First few rows:
   person_age person_gender person_education  person_income  person_emp_exp  \
0        22.0        female           Master        71948.0               0   
1        21.0        female      High School        12282.0               0   
2        25.0        female      High School        12438.0               3   
3        23.0        female         Bachelor        79753.0               0   
4        24.0          male           Master        66135.0               1   

  person_home_ownership  loan_amnt loan_intent  loan_int_rate  \
0                  RENT    35000.0    PERSONAL          16.02   
1                   OWN     1000.0   EDUCATION          11.14   
2              MORTGAGE     5500.0     MEDICAL          12.87   
3                  RENT    35000.0     MEDICAL          15.23   
4                  RENT    35000.0     MEDICAL          14.27   

   loan_percent_income  cb_person_cred_hist_length  credit_score  \
0      

In [None]:
# Table 1: baseline run — every classifier evaluated on the original
# (unscaled) train/test split
print("\nTable 1: Classification - Original Data Metrics")
classification_table1 = {
    name: evaluate_classifier(clf, X_train, X_test, y_train, y_test)
    for name, clf in classifiers.items()
}

# Transpose so that each model becomes a row of the table
classification_table1_df = pd.DataFrame(classification_table1).transpose()
print(classification_table1_df)



Table 1: Classification - Original Data Metrics


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Preprocessing strategies to compare. Note that the two Normalizer variants
# rescale each sample (row) to unit norm, while MinMaxScaler and
# StandardScaler rescale each feature (column).
scaler_methods = {
    'L1 Normalization': Normalizer(norm='l1'),
    'L2 Normalization': Normalizer(norm='l2'),
    'Min-Max Scaling': MinMaxScaler(),
    'Standard Scaling': StandardScaler()
}

print("\nTable 2: Classification - After Scaling Metrics")
classification_table2 = {}

for scaler_name, scaler in scaler_methods.items():
    # Learn the transform from the training split only, then reuse it on the
    # test split so no test-set statistics leak into preprocessing
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # One table row per (classifier, scaler) combination
    for clf_name, clf in classifiers.items():
        classification_table2[f"{clf_name} with {scaler_name}"] = evaluate_classifier(
            clf, X_train_scaled, X_test_scaled, y_train, y_test
        )

# Transpose so that each model/scaler pair becomes a row of the table
classification_table2_df = pd.DataFrame(classification_table2).transpose()
print(classification_table2_df)


In [None]:
# Hyper-parameter grids for GridSearchCV, keyed by classifier label (the keys
# are looked up with the labels of the `classifiers` dict). The grids are
# deliberately tiny so the 10-fold search finishes quickly: only Logistic
# Regression has more than one candidate, so for every other model the
# "search" simply cross-validates a single fixed configuration.
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1],  # the only parameter with two candidates
        'solver': ['liblinear']  # single solver
    },
    'SVM': {
        'C': [1],  # single value
        'kernel': ['linear']  # single kernel; NOTE(review): chosen for speed — confirm it is actually faster than RBF on this (unscaled) data
    },
    'Random Forest': {
        'n_estimators': [50],  # single value
        'max_depth': [10]  # single value
    },
    'Gradient Boosting': {
        'n_estimators': [50],  # single value (kept small for speed)
        'learning_rate': [0.1]  # single value; NOTE(review): original note called this a "higher" rate — confirm against the library default
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [5],  # single value
        'weights': ['uniform']  # single option
    },
    'Decision Tree': {
        'max_depth': [10],  # single value
        'min_samples_split': [2]  # single option
    }
}

# Function for GridSearchCV with 10-fold CV but with performance optimizations
def perform_grid_search(clf, param_grid, X, y, sample_threshold=10000,
                        sample_frac=0.3, n_splits=10, random_state=42):
    """Tune ``clf`` with GridSearchCV and return the best estimator fitted on ``X, y``.

    When ``X`` has more than ``sample_threshold`` rows the search itself runs
    on a stratified subsample (to keep it fast) and the winning configuration
    is then refitted on the full data; otherwise the search runs on all rows.

    Parameters
    ----------
    clf : estimator
        Classifier to tune; passed to ``GridSearchCV`` as the base estimator.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    X, y : array-like
        Training features and labels.
    sample_threshold : int, default=10000
        Row count above which the search runs on a subsample.
    sample_frac : float, default=0.3
        Fraction of rows kept (``train_size``) when subsampling.
    n_splits : int, default=10
        Number of shuffled K-fold splits used for cross-validation.
    random_state : int, default=42
        Seed shared by the subsampling split and the K-fold shuffle.

    Returns
    -------
    estimator
        The best estimator found by the search, fitted on all of ``X, y``.
    """
    # Decide once whether to subsample; the same flag drives the final refit
    use_sample = len(X) > sample_threshold

    if use_sample:
        # stratify=y keeps the class proportions of the full data in the sample
        X_sample, _, y_sample, _ = train_test_split(X, y,
                                                    train_size=sample_frac,
                                                    random_state=random_state,
                                                    stratify=y)
    else:
        X_sample, y_sample = X, y

    # Shuffled K-fold CV, with candidate fits spread over all CPU cores
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    grid_search = GridSearchCV(clf, param_grid, cv=cv, scoring='accuracy',
                               n_jobs=-1,  # Use all CPU cores
                               verbose=1)  # Show progress

    print(f"Starting GridSearchCV on {len(X_sample)} samples")
    grid_search.fit(X_sample, y_sample)
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")

    best_clf = grid_search.best_estimator_
    if use_sample:
        # The search only saw the subsample, so refit the winner on everything
        best_clf.fit(X, y)
    return best_clf

print("\nTable 3: Classification - After GridSearchCV and 10-fold Cross-Validation")
classification_table3 = {}

# Time the whole sweep and each model. perf_counter is a monotonic,
# high-resolution clock, so the measured durations cannot be distorted by
# system clock adjustments the way time.time() differences can.
# (`time` is imported with the other modules at the top of the notebook.)
start_time = time.perf_counter()

for clf_name, clf in classifiers.items():
    print(f"\nPerforming GridSearchCV for {clf_name}...")
    clf_start = time.perf_counter()
    # Tune on the training split, then score the tuned model on the test split
    best_clf = perform_grid_search(clf, param_grids[clf_name], X_train, y_train)
    metrics = evaluate_classifier(best_clf, X_train, X_test, y_train, y_test)
    classification_table3[clf_name] = metrics
    print(f"Completed {clf_name} in {time.perf_counter() - clf_start:.2f} seconds")

print(f"\nTotal GridSearchCV time: {time.perf_counter() - start_time:.2f} seconds")

# Convert to DataFrame (transposed so each model is a row)
classification_table3_df = pd.DataFrame(classification_table3).T
print("\nTable 3: Classification Results After Optimization")
print(classification_table3_df)


In [None]:
# Apply PCA
# PCA keeps the directions of largest variance, so it is sensitive to feature
# scale: on the raw data, columns measured in large units (e.g. income or
# loan amount) would account for almost all of the variance and swamp the
# small-range and one-hot columns. Standardise first, learning the scaling
# from the training split only so no test-set statistics leak in.
pca_scaler = StandardScaler()
X_train_std = pca_scaler.fit_transform(X_train)
X_test_std = pca_scaler.transform(X_test)

pca = PCA(n_components=0.95)  # Retain 95% of variance
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

print(f"\nNumber of components after PCA: {X_train_pca.shape[1]}")

print("\nTable 4: Classification - After PCA")
classification_table4 = {}

# Evaluate every classifier on the PCA-reduced features
for name, clf in classifiers.items():
    metrics = evaluate_classifier(clf, X_train_pca, X_test_pca, y_train, y_test)
    classification_table4[name] = metrics

# Convert to DataFrame (transposed so each model is a row)
classification_table4_df = pd.DataFrame(classification_table4).T
print(classification_table4_df)


In [None]:
# Read the real-estate dataset used for the regression experiments
real_estate = pd.read_csv('Real estate.csv')

# Quick look at the raw frame: dimensions and a preview of the first rows
print("\nRegression dataset shape:", real_estate.shape)
print("\nFirst few rows:")
print(real_estate.head())

# Per-column count of missing entries
print("\nMissing values in regression dataset:")
print(real_estate.isna().sum())

# Target is the unit-area house price; predictors are every other column
# except the 'No' column
y_reg = real_estate['Y house price of unit area']
X_reg = real_estate.drop(columns=['No', 'Y house price of unit area'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
print(f"\nTraining set shape: {X_reg_train.shape}, Test set shape: {X_reg_test.shape}")


In [None]:
# Candidate regression models, keyed by the label used in the result tables.
# Every estimator that accepts a seed gets random_state=42 for repeatability.
regressors = dict([
    ('Linear Regression', LinearRegression()),
    ('Ridge', Ridge(random_state=42)),
    ('Lasso', Lasso(random_state=42)),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=42)),
    ('SVR', SVR()),
])

# Function to evaluate regressors
def evaluate_regressor(reg, X_train, X_test, y_train, y_test):
    """Fit ``reg`` on the training split and score its test-split predictions.

    Parameters
    ----------
    reg : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so any previous fit of the same object is replaced.
    X_train, y_train : array-like
        Features and target used for fitting.
    X_test, y_test : array-like
        Features and target used for scoring.

    Returns
    -------
    dict
        Metrics keyed by ``'MSE'``, ``'RMSE'``, ``'MAE'`` and ``'R² Score'``.
    """
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # Compute the MSE once and derive the RMSE from it instead of scoring twice
    mse = mean_squared_error(y_test, y_pred)

    return {
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R² Score': r2_score(y_test, y_pred)
    }


In [None]:
# Table 1: baseline run — each of the 6 regressors evaluated on the original
# train/test split
print("\nTable 1: Regression - Original Data Metrics")
regression_table1 = {
    name: evaluate_regressor(reg, X_reg_train, X_reg_test, y_reg_train, y_reg_test)
    for name, reg in regressors.items()
}

# Transpose so that each model becomes a row and each metric a column
regression_table1_df = pd.DataFrame(regression_table1).transpose()
print(regression_table1_df)


In [None]:
# Hyper-parameter grids for GridSearchCV, keyed by the same labels as the
# `regressors` dict (the grid for each model is looked up by that label).
# The grids are deliberately tiny so the 10-fold search finishes quickly:
# only Ridge and Lasso have more than one candidate (two alpha values each);
# every other model is cross-validated with a single fixed configuration.
reg_param_grids = {
    'Linear Regression': {
        'fit_intercept': [True]  # single option
    },
    'Ridge': {
        'alpha': [0.1, 1],  # two candidate regularisation strengths
        'solver': ['auto']  # single solver
    },
    'Lasso': {
        'alpha': [0.1, 1],  # two candidate regularisation strengths
        'selection': ['cyclic']  # single option
    },
    'Random Forest': {
        'n_estimators': [50],  # single value
        'max_depth': [10]  # single value
    },
    'Gradient Boosting': {
        'n_estimators': [50],  # single value (kept small for speed)
        'learning_rate': [0.1]  # single value; NOTE(review): original note called this a "higher" rate — confirm against the library default
    },
    'SVR': {
        'C': [1],  # single value
        'kernel': ['linear']  # single kernel; NOTE(review): chosen for speed — confirm it is actually faster than RBF on this data
    }
}

# Optimized function for GridSearchCV with 10-fold CV for regression
def perform_grid_search_reg(reg, param_grid, X, y, sample_threshold=1000,
                            sample_frac=0.3, n_splits=10, random_state=42):
    """Tune ``reg`` with GridSearchCV and return the best estimator fitted on ``X, y``.

    When ``X`` has more than ``sample_threshold`` rows the search itself runs
    on a random subsample (to keep it fast) and the winning configuration is
    then refitted on the full data; otherwise the search runs on all rows.
    Candidates are ranked by negative mean squared error.

    Parameters
    ----------
    reg : estimator
        Regressor to tune; passed to ``GridSearchCV`` as the base estimator.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    X, y : array-like
        Training features and target.
    sample_threshold : int, default=1000
        Row count above which the search runs on a subsample.
    sample_frac : float, default=0.3
        Fraction of rows kept (``train_size``) when subsampling.
    n_splits : int, default=10
        Number of shuffled K-fold splits used for cross-validation.
    random_state : int, default=42
        Seed shared by the subsampling split and the K-fold shuffle.

    Returns
    -------
    estimator
        The best estimator found by the search, fitted on all of ``X, y``.
    """
    # Decide once whether to subsample; the same flag drives the final refit
    use_sample = len(X) > sample_threshold

    if use_sample:
        # Plain random subsample (no stratification for a continuous target)
        X_sample, _, y_sample, _ = train_test_split(X, y,
                                                    train_size=sample_frac,
                                                    random_state=random_state)
        print(f"Sampled {len(X_sample)} from {len(X)} records for faster processing")
    else:
        X_sample, y_sample = X, y

    # Shuffled K-fold CV, with candidate fits spread over all CPU cores
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    grid_search = GridSearchCV(reg, param_grid, cv=cv,
                               scoring='neg_mean_squared_error',
                               n_jobs=-1,  # Use all CPU cores
                               verbose=1)  # Show progress

    # perf_counter is monotonic, so the elapsed time is immune to system
    # clock adjustments (`time` is imported at the top of the notebook)
    start_time = time.perf_counter()

    grid_search.fit(X_sample, y_sample)

    print(f"Best parameters: {grid_search.best_params_}")
    # The score is a negated MSE (higher is better), so flip the sign to report it
    print(f"Best MSE: {-grid_search.best_score_:.4f}")
    print(f"GridSearch completed in {time.perf_counter() - start_time:.2f} seconds")

    best_reg = grid_search.best_estimator_
    if use_sample:
        # The search only saw the subsample, so refit the winner on everything
        print("Refitting best model on full training data...")
        best_reg.fit(X, y)
    return best_reg

print("\nTable 2: Regression - After GridSearchCV and 10-fold Cross-Validation")
regression_table2 = {}

# Track total execution time. perf_counter is a monotonic, high-resolution
# clock, so the measured duration cannot be distorted by system clock
# adjustments the way time.time() differences can.
# (`time` is imported with the other modules at the top of the notebook.)
total_start_time = time.perf_counter()

for reg_name, reg in regressors.items():
    print(f"\nPerforming GridSearchCV for {reg_name}...")
    # Tune on the training split, then score the tuned model on the test split
    best_reg = perform_grid_search_reg(reg, reg_param_grids[reg_name], X_reg_train, y_reg_train)
    metrics = evaluate_regressor(best_reg, X_reg_train, X_reg_test, y_reg_train, y_reg_test)
    regression_table2[reg_name] = metrics

print(f"\nTotal execution time: {time.perf_counter() - total_start_time:.2f} seconds")

# Convert to DataFrame (transposed so each model is a row)
regression_table2_df = pd.DataFrame(regression_table2).T
print("\nTable 2: Regression Results After Optimization")
print(regression_table2_df)


In [None]:
def _plot_metric_bars(scores, title, ylabel, color):
    """Show a bar chart with one bar per model for a single metric column."""
    fig, ax = plt.subplots(figsize=(12, 6))
    scores.plot(kind='bar', color=color, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Model')
    # Tilt the model names so long labels stay readable
    plt.xticks(rotation=45)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    plt.show()


# Baseline accuracy of each classifier (Table 1, original data)
_plot_metric_bars(
    classification_table1_df['Accuracy'],
    title='Classification Models Accuracy',
    ylabel='Accuracy',
    color='skyblue',
)

# Baseline R² of each regressor (Table 1, original data)
_plot_metric_bars(
    regression_table1_df['R² Score'],
    title='Regression Models R² Score',
    ylabel='R² Score',
    color='lightgreen',
)
