In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Load datasets: training data, test data and the sample submission, in that order
train_df, test_df, sample_submission_df = map(
    pd.read_csv,
    (
        "C:\\Users\\KAMAL\\Downloads\\archive\\Training Dataset.csv",
        "C:\\Users\\KAMAL\\Downloads\\archive\\Test Dataset.csv",
        "C:\\Users\\KAMAL\\Downloads\\archive\\Sample_Submission.csv",
    ),
)

# Preprocessing function
def preprocess_data(df):
    """Impute missing values and integer-encode the categorical loan columns.

    The frame is modified in place and the same object is returned, so both
    ``preprocess_data(df)`` and ``df = preprocess_data(df)`` work.

    Steps:
      1. Missing values in Gender, Married, Dependents, Education,
         Self_Employed, Property_Area and Credit_History are replaced by the
         column's most frequent value.
      2. Missing values in LoanAmount, Loan_Amount_Term, ApplicantIncome and
         CoapplicantIncome are replaced by the column mean.
      3. Gender, Married, Dependents, Education, Self_Employed and
         Property_Area are replaced by integer codes. Codes follow the sorted
         order of the distinct values present in *this* frame, so two frames
         only share a coding when they contain the same set of values.

    Args:
        df: pandas DataFrame containing every column named above.

    Returns:
        The same DataFrame object, imputed and encoded.

    Raises:
        KeyError: if one of the required columns is absent.
    """
    encoded_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
    # Credit_History is mode-imputed like a category but is not re-encoded.
    mode_cols = encoded_cols + ['Credit_History']
    mean_cols = ['LoanAmount', 'Loan_Amount_Term', 'ApplicantIncome', 'CoapplicantIncome']

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: the in-place form operates on an intermediate object and does
    # not update the frame when pandas Copy-on-Write is active.
    for col in mode_cols:
        modes = df[col].mode()
        if not modes.empty:  # an all-missing (or empty) column has no mode
            df[col] = df[col].fillna(modes.iloc[0])

    for col in mean_cols:
        df[col] = df[col].fillna(df[col].mean())

    # Sorted factorization yields 0..k-1 codes in sorted value order, i.e. the
    # same codes a freshly fitted label encoder would give for each column.
    for col in encoded_cols:
        codes, _ = pd.factorize(df[col], sort=True)
        df[col] = codes

    return df

# Apply preprocessing (the same imputation/encoding routine for both frames)
train_df, test_df = preprocess_data(train_df), preprocess_data(test_df)

# Prepare training data: features are every column except the ID and the
# target; the target is 1 for 'Y' and 0 for anything else
X = train_df.drop(['Loan_ID', 'Loan_Status'], axis=1)
y = train_df['Loan_Status'].eq('Y').astype('int64')

# Split into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Standardize features: fit on the training split only, then apply to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Train model
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Evaluate on validation set
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")

# Prepare test data for prediction.
# Select the test features by the training feature names (and order) rather
# than only dropping Loan_ID: the scaler and model were fitted on X's column
# layout, so an extra or re-ordered column in the test file would otherwise
# break or misalign the transform. A missing column raises KeyError here.
X_test = test_df[X.columns]
X_test = scaler.transform(X_test)

# Make predictions and map the 1/0 class labels back to 'Y'/'N'
test_predictions = model.predict(X_test)
test_predictions = ['Y' if pred == 1 else 'N' for pred in test_predictions]

# Create submission: one row per test Loan_ID with its predicted status
submission_df = pd.DataFrame({'Loan_ID': test_df['Loan_ID'], 'Loan_Status': test_predictions})
submission_df.to_csv('submission.csv', index=False)
print("Submission file has been created.")

# Load and display submission (the bare expression renders only in a notebook)
submission_data = pd.read_csv('submission.csv')
submission_data


Validation Accuracy: 0.77
Submission file has been created.


Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
...,...,...
362,LP002971,Y
363,LP002975,Y
364,LP002980,Y
365,LP002986,Y
