<a href="https://colab.research.google.com/github/Badal3375/sklearn_learning-/blob/main/train_model_py_Model_Training_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import numpy as np

print("Model training script started.")

# 1. Load the dataset
# The path lives in one variable so the error message below always names the
# file that was actually looked for (the old message referred to "train.csv",
# which is not the file this script reads).
DATA_PATH = '/content/train_u6lujuX_CVtuZ9i.csv'
try:
    # Keep the try body to the single call that can raise FileNotFoundError.
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: {DATA_PATH} not found. Please download the dataset and place it in the correct directory.")
    # Abort with a non-zero status. The bare exit() helper is injected by the
    # `site` module for interactive use and, called without an argument,
    # reports success even though loading failed.
    raise SystemExit(1)
else:
    print("Dataset loaded successfully.")

# 2. Basic Data Preprocessing
print("Starting data preprocessing...")

categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
numeric_cols = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# Fill missing values BEFORE encoding.
# get_dummies (without dummy_na) turns a missing category into an all-zero
# row, which with drop_first=True is indistinguishable from the dropped
# baseline category. Imputing the mode first makes the choice explicit.
# NOTE: the mode/median are computed on the whole frame, i.e. before the
# train/test split further down; a more robust solution would fit the
# imputation on the training portion only.
for col in categorical_cols:
    modes = df[col].mode()
    # mode() is empty when the column holds no non-null values; leave such a
    # column untouched rather than failing on the lookup.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Assign the result back instead of calling fillna(inplace=True) on df[col]:
# the chained in-place form acts on an intermediate object and pandas warns
# that it will stop working in pandas 3.0.
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

# Convert categorical columns to numeric using one-hot encoding
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Drop rows with any remaining nulls (if any) and the Loan_ID column
df.dropna(inplace=True)
df.drop('Loan_ID', axis=1, inplace=True)

# Map target variable 'Loan_Status' to 0 and 1
df['Loan_Status'] = df['Loan_Status'].map({'Y': 1, 'N': 0})

print("Preprocessing complete.")

# 3. Define Features (X) and Target (y)
# The target is the 'Loan_Status' column; every other column is a feature.
y = df['Loan_Status']
X = df.drop(columns=['Loan_Status'])

# Persist the feature column names, in order, next to the model artifact
# (presumably so later inference code can rebuild the same column layout).
model_columns = X.columns.tolist()
joblib.dump(model_columns, 'model_columns.pkl')
print(f"Model columns saved to model_columns.pkl. Columns are: {model_columns}")


# 4. Split the data
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# 5. Train the Random Forest model
print("Training the Random Forest model...")
# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
print("Model training complete.")

# 6. Evaluate the model
# Score the held-out rows: accuracy is the fraction of predictions that
# match the true labels, printed below as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# 7. Save the trained model
# Serialize the fitted estimator to the working directory with joblib.
joblib.dump(value=model, filename='loan_approval_model.pkl')
print("Model saved successfully as loan_approval_model.pkl.")

Model training script started.
Dataset loaded successfully.
Starting data preprocessing...
Preprocessing complete.
Model columns saved to model_columns.pkl. Columns are: ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Gender_Male', 'Married_Yes', 'Dependents_1', 'Dependents_2', 'Dependents_3+', 'Education_Not Graduate', 'Self_Employed_Yes', 'Property_Area_Semiurban', 'Property_Area_Urban']
Training the Random Forest model...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


Model training complete.
Model Accuracy: 78.86%
Model saved successfully as loan_approval_model.pkl.
