In [52]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [53]:
# Load the data
train_data = pd.read_csv('Assignment_Train.csv')
test_data = pd.read_csv('Assignment_Test.csv')

In [54]:
# Function to preprocess data
def preprocess_data(data, is_train=True):
    # Convert 'APPLICATION LOGIN DATE' to datetime and extract year and month
    data['APPLICATION LOGIN DATE'] = pd.to_datetime(data['APPLICATION LOGIN DATE'], format='%m/%d/%y', errors='coerce')
    data['LOGIN_YEAR'] = data['APPLICATION LOGIN DATE'].dt.year
    data['LOGIN_MONTH'] = data['APPLICATION LOGIN DATE'].dt.month

    # Drop the 'APPLICATION LOGIN DATE' column as it can't be used directly by the model
    data = data.drop('APPLICATION LOGIN DATE', axis=1)

    # Convert Cibil Score to float, dealing with non-numeric values
    data['Cibil Score'] = pd.to_numeric(data['Cibil Score'], errors='coerce')

    # Drop columns with too many missing values or irrelevant columns
    columns_to_drop = [
        'FIRST NAME', 'MIDDLE NAME', 'LAST NAME', 'mobile', 'Personal Email Address',
        'Pan Name', 'name', 'vpa', 'upi_name',
        'Phone Social Premium.a23games', 'Phone Social Premium.my11',
        'Phone Social Premium.rummycircle', 'Phone Social Premium.yatra'
    ]

    # Drop columns that exist in the DataFrame
    data = data.drop(columns=[col for col in columns_to_drop if col in data.columns])

    return data


In [55]:
# Preprocess train and test data
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data, is_train=False)

In [56]:
# Identify numeric and categorical columns
numeric_columns = train_data.select_dtypes(include=[np.number]).columns
categorical_columns = train_data.select_dtypes(exclude=[np.number]).columns.drop('Application Status')

In [57]:
# Create preprocessing steps
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

In [58]:
# Create a pipeline with preprocessor and random forest
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [59]:
# Prepare the data
X = train_data.drop('Application Status', axis=1)
y = train_data['Application Status']

In [60]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [61]:
# Fit the pipeline
rf_pipeline.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Make predictions on test data
test_predictions = rf_pipeline.predict(test_data)

Accuracy: 0.862
Classification Report:
               precision    recall  f1-score   support

    APPROVED       0.94      0.85      0.89      1995
    DECLINED       0.75      0.89      0.81      1005

    accuracy                           0.86      3000
   macro avg       0.84      0.87      0.85      3000
weighted avg       0.87      0.86      0.86      3000



In [62]:
# Create a DataFrame for predictions
output = pd.DataFrame({
    'UID': test_data['UID'],  # Use the correct column name from your test data
    'Prediction': test_predictions
})
# Save predictions to a CSV file
output.to_csv('predictions.csv', index=False)