In [1]:
import pandas as pd
import joblib
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Location of the serialized estimator, resolved against the working directory.
MODEL_PATH = 'random_forest_model.pkl'

# Deserialize the trained model once at module level; the scoring helpers below
# read this global. joblib artifacts are pickle-based, so only load files from
# a trusted source.
model = joblib.load(MODEL_PATH)

def preprocess_data(new_data, train_data=None):
    """Turn a raw scoring frame into the feature matrix handed to the model.

    The one-hot encoder is fitted on the training data and only *applied* to
    ``new_data``. The encoded columns therefore depend on the categories seen
    in training, not on whichever categories the scoring file happens to
    contain, so the width and order of the output stay stable across files.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Raw rows to score. The caller's frame is not modified.
    train_data : pandas.DataFrame, optional
        Training frame used to discover the categorical columns and to fit the
        encoder. When omitted, the training CSV is downloaded from the
        hard-coded URL on every call. Pass a frame explicitly to avoid the
        download, or to supply a frame from which a string-valued target
        column has already been removed.

    Returns
    -------
    array-like or sparse matrix
        Result of ``ColumnTransformer.transform``: one-hot encoded categorical
        columns first, then the numeric columns, then any remaining columns
        passed through unchanged.
    """
    if train_data is None:
        # Load the original training data to get preprocessing details.
        data_url = "https://raw.githubusercontent.com/DwaipayanDutta/Assignment/main/Data%20Files/train.csv"
        train_data = pd.read_csv(data_url)

    # Categorical columns are defined by the training data, not the new data.
    categorical_cols = train_data.select_dtypes(include=['object']).columns.tolist()

    # Drop columns that are entirely missing in the new data. The explicit
    # copy guarantees the column insertions below never touch the caller's frame.
    new_data = new_data.dropna(axis=1, how='all').copy()

    # Re-create any categorical column the new data lacks. Object dtype keeps
    # the placeholder out of the numeric selection below; a float NaN column
    # would otherwise be routed to both transformers.
    for col in categorical_cols:
        if col not in new_data.columns:
            new_data[col] = np.full(len(new_data), np.nan, dtype=object)

    # Numeric features come from the new data, minus anything already handled
    # as categorical so no column is emitted twice.
    numeric_cols = [
        col
        for col in new_data.select_dtypes(include=[np.number]).columns
        if col not in categorical_cols
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
            ('num', 'passthrough', numeric_cols),
        ],
        remainder='passthrough',
    )

    # Fit on the training rows, restricted and ordered to the columns being
    # scored, so the encoder learns the training categories. Columns absent
    # from the training data become all-NaN here, which is harmless because
    # they are only passed through.
    train_aligned = train_data.reindex(columns=new_data.columns)
    preprocessor.fit(train_aligned)

    # Categories unseen in training encode as all zeros (handle_unknown='ignore').
    return preprocessor.transform(new_data)

def score_new_data(file_path):
    """Score the rows of a CSV file with the module-level ``model``.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing the rows to score.

    Returns
    -------
    pandas.DataFrame
        A frame with a single ``'Predicted Label'`` column holding the values
        returned by ``model.predict``.
    """
    # Read the file and build the feature matrix in one step.
    feature_matrix = preprocess_data(pd.read_csv(file_path))

    # Ask the already-loaded model for its predictions.
    predicted_labels = model.predict(feature_matrix)

    # Wrap the raw predictions in a labelled frame for readability.
    return pd.DataFrame(predicted_labels, columns=['Predicted Label'])

In [None]:
if __name__ == "__main__":
    # CSV holding the rows to score; adjust to point at your own dataset.
    file_path = 'new_data.csv'
    results = score_new_data(file_path)

    # Header line followed by the predictions frame, one per line.
    print("Predictions on New Data:", results, sep="\n")