In [1]:
import pandas as pd
import joblib
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Deserialize the previously trained estimator once, at import time, so every
# scoring call reuses the same object.
# NOTE(review): joblib files are pickle-based; only load artifacts that come
# from a trusted source.
MODEL_PATH = 'random_forest_model.pkl'
model = joblib.load(MODEL_PATH)

def preprocess_data(new_data, train_data=None, target_col=None):
    """Encode a scoring DataFrame using a preprocessor fitted on the training data.

    The one-hot encoder is fitted on the *training* frame and only applied to
    ``new_data``. Fitting it on the scoring data (as an earlier version did)
    makes the number and order of output columns depend on which categories
    happen to appear in the file being scored.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Rows to encode. The caller's frame is not modified.
    train_data : pandas.DataFrame, optional
        Training frame used to fit the encoder. When omitted, the training
        CSV is downloaded from the project's GitHub URL on every call.
    target_col : str, optional
        Name of the label column. When given, it is removed from both frames
        so it is never treated as a feature.
        NOTE(review): if the training CSV contains the label as an object
        column and ``target_col`` is not supplied, the label is one-hot
        encoded like any other categorical column -- confirm against the
        training pipeline.

    Returns
    -------
    numpy.ndarray or scipy.sparse matrix
        One-hot encoded categorical columns, followed by the numeric columns
        of ``new_data``, followed by any remaining columns passed through.

    Notes
    -----
    Columns of ``new_data`` that do not exist in the training data are
    dropped, because the preprocessor cannot be fitted on them.
    """
    if train_data is None:
        # Load the original training data to get preprocessing details.
        data_url = "https://raw.githubusercontent.com/DwaipayanDutta/Assignment/main/Data%20Files/train.csv"
        train_data = pd.read_csv(data_url)

    if target_col is not None:
        # drop() returns new frames, so neither caller-owned object is touched.
        train_data = train_data.drop(columns=[target_col], errors='ignore')
        new_data = new_data.drop(columns=[target_col], errors='ignore')

    categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

    # Columns that are entirely empty in the scoring file carry no information.
    new_data = new_data.dropna(axis=1, how='all')

    # Keep only columns the training frame also has; copy so the placeholder
    # assignments below never write into a view of the caller's data.
    known_cols = [col for col in new_data.columns if col in train_data.columns]
    new_data = new_data[known_cols].copy()

    # Categorical columns absent from the scoring file are added as all-missing
    # *object* columns. A bare np.nan would create a float64 column that is then
    # also selected as numeric and emitted a second time.
    for col in categorical_cols:
        if col not in new_data.columns:
            new_data[col] = pd.Series(np.nan, index=new_data.index, dtype=object)

    categorical_set = set(categorical_cols)
    numeric_cols = [
        col
        for col in new_data.select_dtypes(include=[np.number]).columns
        if col not in categorical_set
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ('num', 'passthrough', numeric_cols),
        ],
        remainder='passthrough',
    )

    # Fit on the training rows (restricted to the scoring frame's columns so
    # both frames share one layout), then only transform the scoring rows.
    feature_cols = new_data.columns.tolist()
    preprocessor.fit(train_data[feature_cols])
    new_data_transformed = preprocessor.transform(new_data)

    return new_data_transformed

def score_new_data(file_path):
    """Read a CSV file, preprocess it, and return the model's predictions.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to score; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A single-column frame named ``'Predicted Label'`` holding the output
        of ``model.predict`` for the preprocessed rows.
    """
    features = preprocess_data(pd.read_csv(file_path))
    labels = model.predict(features)
    return pd.DataFrame(labels, columns=['Predicted Label'])

In [None]:
if __name__ == "__main__":
    # Score the CSV that sits next to the notebook and show the predictions.
    file_path = 'new_data.csv'
    results = score_new_data(file_path)

    # One call, newline-separated: same text as printing the header and the
    # frame with two separate print() calls.
    print("Predictions on New Data:", results, sep="\n")