In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load the training and testing datasets
train_data = pd.read_csv('/Users/abdulrehuman/Documents/Data Science/muhsina/train.csv')
test_data = pd.read_csv('/Users/abdulrehuman/Documents/Data Science/muhsina/test.csv')

# Exploratory Data Analysis
print(train_data.info())
print(train_data.describe())
print(train_data.head())

# Pre-processing

# Handle missing values
train_data = train_data.drop('Loan_ID', axis=1)  # Remove Loan_ID as it's not useful for modeling
test_loan_ids = test_data['Loan_ID']  # Save Loan_IDs for the final prediction file
test_data = test_data.drop('Loan_ID', axis=1)  # Remove Loan_ID from test_data

# Handle missing values in categorical columns
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
for col in categorical_cols:
    train_data[col].fillna(train_data[col].mode()[0], inplace=True)
    test_data[col].fillna(test_data[col].mode()[0], inplace=True)

# Handle missing values in numerical columns
numerical_cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
for col in numerical_cols:
    imputer = SimpleImputer(strategy='mean')
    train_data[col] = imputer.fit_transform(train_data[[col]])
    test_data[col] = imputer.transform(test_data[[col]])

# Encode categorical variables
le = LabelEncoder()
for col in categorical_cols:
    train_data[col] = le.fit_transform(train_data[col])
    test_data[col] = le.transform(test_data[col])

# Modelling

# Split the training data into features (X) and target variable (y)
X = train_data.drop('Loan_Status', axis=1)
y = train_data['Loan_Status']

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to try
models = [
    RandomForestClassifier(random_state=42),
    #GradientBoostingClassifier(random_state=42)
]

best_model = None
best_accuracy = 0

# Try different models
for model in models:
    model.fit(X_train, y_train)
    valid_preds = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, valid_preds)

    print(f'Model: {model.__class__.__name__}, Accuracy: {accuracy}')

    # Save the best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Save the best model
joblib.dump(best_model, '/Users/abdulrehuman/Documents/Data Science/muhsina/best_loanmodel.pkl')

# Make predictions on the test set using the best model
test_preds = best_model.predict(test_data)

# Create a new DataFrame with the predictions
output_df = pd.DataFrame({'Loan_ID': test_loan_ids, 'Loan_Status': test_preds})

# Save the DataFrame to a CSV file
output_df.to_csv('/Users/abdulrehuman/Documents/Data Science/muhsina/loan_predictions.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None
       ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
count       614.000000         6

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['N' 'Y']