Section 1: Imports and Initial Functions

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance
import joblib

# Define a function to load and preprocess the data
def load_and_preprocess_data(file_path):
    """
    Load a dataset from ``file_path`` and perform initial preprocessing.

    The reader is chosen from the file extension: a path ending in ``.json``
    (case-insensitive) is read with ``pd.read_json``; anything else is read
    with ``pd.read_csv``.

    Preprocessing steps:
      * missing values: median for int64/float64 columns, mode for
        object/category columns;
      * outliers: int64/float64 columns are capped at the 1.5 * IQR fences.
        Columns whose IQR is zero are left untouched, because capping them
        would collapse every value to a single constant (this is what happens
        to an imbalanced 0/1 indicator or target column).

    Args:
        file_path: Path to the data file (CSV or JSON)

    Returns:
        Preprocessed DataFrame
    """
    # Load data from the path the caller supplied
    print("Loading data...")
    if str(file_path).lower().endswith(".json"):
        df = pd.read_json(file_path)
    else:
        df = pd.read_csv(file_path)

    # Display basic information
    print(f"Data shape: {df.shape}")
    print("\nData types:")
    print(df.dtypes)

    # Check for missing values
    missing_values = df.isnull().sum()
    if missing_values.sum() > 0:
        print("\nMissing values:")
        print(missing_values[missing_values > 0])

        # Numeric columns: fill with median
        numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
        for col in numeric_cols:
            if df[col].isnull().sum() > 0:
                df[col] = df[col].fillna(df[col].median())

        # Categorical columns: fill with mode
        cat_cols = df.select_dtypes(include=['object', 'category']).columns
        for col in cat_cols:
            if df[col].isnull().sum() > 0:
                modes = df[col].mode()
                # An all-missing column has no mode; leave it as it is
                if not modes.empty:
                    df[col] = df[col].fillna(modes[0])

    # Cap outliers in numeric columns instead of removing rows
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1

        # Zero (or undefined) spread: both fences coincide, so clipping would
        # overwrite every value with the same number. Skip such columns.
        if not iqr > 0:
            continue

        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        df[col] = df[col].clip(lower_bound, upper_bound)

    print("\nPreprocessing complete!")
    return df

Section 2: Class Balancing and Feature Engineering

In [2]:
# Define a function to handle class imbalance
def balance_classes(df, target_column='assignment_valid'):
    """
    Handle class imbalance by random upsampling.

    The class sizes are taken from the data, so the function no longer
    assumes that label 0 is the majority and label 1 the minority. Every
    class smaller than the largest one is sampled with replacement (pandas
    ``DataFrame.sample`` with ``random_state=42``) up to the size of the
    largest class. The largest class is kept as is.

    Rows whose target value is missing are not part of any class and are
    therefore absent from the result.

    NOTE: upsampling duplicates rows. If the result is later split into
    train and test sets, copies of the same row can end up on both sides and
    inflate the evaluation metrics.

    Args:
        df: Input DataFrame
        target_column: Name of the target column

    Returns:
        Balanced, shuffled DataFrame with a fresh 0..n-1 index
    """
    print("Balancing classes...")
    # Check class distribution (sorted by count, largest class first)
    class_counts = df[target_column].value_counts()
    print(f"Original class distribution:\n{class_counts}")

    # No labelled rows at all: nothing to balance
    if class_counts.empty:
        return df.copy()

    target_size = int(class_counts.max())

    # Collect one frame per class, upsampling those below the largest size
    parts = []
    for label, count in class_counts.items():
        class_rows = df[df[target_column] == label]
        if count < target_size:
            class_rows = class_rows.sample(
                n=target_size,
                replace=True,          # Allow duplicates
                random_state=42
            )
        parts.append(class_rows)

    # Combine and shuffle so the classes are mixed well
    df_balanced = pd.concat(parts)
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    # Check new class distribution
    balanced_class_counts = df_balanced[target_column].value_counts()
    print(f"Balanced class distribution:\n{balanced_class_counts}")

    return df_balanced

# Feature engineering function
def engineer_features(df):
    """
    Produce the feature-engineered version of ``df``.

    At present no features are derived: the function announces itself and
    hands back an independent copy of the input, so callers can mutate the
    result without touching the original frame. The disabled templates below
    show where derived columns would be added.

    Args:
        df: Input DataFrame

    Returns:
        A copy of the input DataFrame (new features would be added to it)
    """
    print("Engineering features...")
    df_new = df.copy()

    # Template (disabled) - time-based features when a timestamp column exists:
    #
    #     if 'timestamp' in df_new.columns:
    #         df_new['timestamp'] = pd.to_datetime(df_new['timestamp'])
    #         df_new['hour_of_day'] = df_new['timestamp'].dt.hour
    #         df_new['day_of_week'] = df_new['timestamp'].dt.dayofweek
    #         df_new['is_weekend'] = df_new['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

    # Template (disabled) - interaction features:
    #
    #     if 'experience_years' in df_new.columns and 'project_count' in df_new.columns:
    #         df_new['exp_per_project'] = df_new['experience_years'] / (df_new['project_count'] + 1)

    # Dataset-specific feature engineering goes here.

    return df_new

Section 3: Model Building and Training

In [3]:
# Build and train the model
def build_and_train_model(df, target_column='assignment_valid', test_size=0.2, tune_hyperparams=True):
    """
    Build, train and evaluate the random forest model.

    int64/float64 columns are standardised, object/category columns are
    one-hot encoded, and every other column (for example a bool column) is
    passed through unchanged. The returned feature names cover all three
    groups, in the same order as the columns of the transformed matrix, so
    they line up one-to-one with the classifier's ``feature_importances_``.

    NOTE: if ``df`` was upsampled with replacement before this call, copies
    of the same row can fall on both sides of the train/test split made here,
    which makes the reported metrics optimistic.

    Args:
        df: Preprocessed DataFrame
        target_column: Name of the target column
        test_size: Proportion of data to use for testing
        tune_hyperparams: Whether to perform hyperparameter tuning

    Returns:
        Trained model (a fitted Pipeline), feature names, and a dict of
        evaluation metrics that also carries the test split and predictions
    """
    print("Building and training model...")

    # Split features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Identify numeric and categorical columns
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns

    # Columns matched by neither selector are forwarded by
    # remainder='passthrough' and appended to the right of the transformed
    # blocks, in their original column order.
    claimed = set(numeric_features) | set(categorical_features)
    passthrough_features = [col for col in X.columns if col not in claimed]

    print(f"Numeric features: {len(numeric_features)}")
    print(f"Categorical features: {len(categorical_features)}")
    if passthrough_features:
        print(f"Passthrough features: {len(passthrough_features)}")

    # Create preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ],
        remainder='passthrough'
    )

    # Create train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    # Initial Random Forest settings
    base_rf = RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
        bootstrap=True,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )

    # Create pipeline
    rf_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', base_rf)
    ])

    # Train the model - with or without hyperparameter tuning
    if tune_hyperparams:
        print("Performing hyperparameter tuning (this may take a while)...")
        param_grid = {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__max_features': ['sqrt', 'log2']
        }

        grid_search = GridSearchCV(
            rf_pipeline,
            param_grid=param_grid,
            cv=5,
            scoring='f1',
            n_jobs=-1,
            verbose=1
        )

        grid_search.fit(X_train, y_train)
        print(f"Best parameters: {grid_search.best_params_}")

        # Use the best model
        best_model = grid_search.best_estimator_
    else:
        print("Training model without hyperparameter tuning...")
        rf_pipeline.fit(X_train, y_train)
        best_model = rf_pipeline

    # Feature names in transformed-matrix order: numeric, one-hot, passthrough
    feature_names = numeric_features.tolist()

    # Get one-hot encoded feature names if there are categorical features
    if len(categorical_features) > 0:
        ohe = best_model.named_steps['preprocessor'].named_transformers_['cat']
        cat_feature_names = ohe.get_feature_names_out(categorical_features)
        feature_names.extend(cat_feature_names.tolist())

    feature_names.extend(passthrough_features)

    # Evaluate the model
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]

    print("\nModel Evaluation:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Calculate AUC-ROC
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    print(f"\nAUC-ROC Score: {auc_roc:.4f}")

    # Create evaluation dictionary for later reference
    evaluation = {
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred, output_dict=True),
        'auc_roc': auc_roc,
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

    return best_model, feature_names, evaluation

Section 4: Feature Importance Analysis

In [4]:
# Function to analyze feature importance
def analyze_feature_importance(model, feature_names, X_test, y_test, top_n=20):
    """
    Analyze and visualize feature importance.

    Two rankings are produced for the classifier inside the pipeline: the
    forest's built-in ``feature_importances_`` and permutation importance
    computed on the preprocessed test matrix. Both are printed, plotted and
    saved as ``feature_importance.png`` / ``permutation_importance.png`` in
    the current working directory.

    Args:
        model: Trained pipeline with 'preprocessor' and 'classifier' steps
        feature_names: List of feature names, one per column of the
            preprocessed matrix
        X_test: Test features (raw, before preprocessing)
        y_test: Test target
        top_n: Number of top features to display

    Returns:
        Tuple of (feature importance DataFrame, permutation importance DataFrame)

    Raises:
        ValueError: If ``feature_names`` does not have one entry per
            transformed feature.
    """
    print(f"\nAnalyzing top {top_n} feature importances...")

    # Extract the random forest classifier from the pipeline
    rf_classifier = model.named_steps['classifier']

    # Get feature importances from the model
    importances = rf_classifier.feature_importances_

    # Fail early with a readable message instead of a length error from pandas
    if len(feature_names) != len(importances):
        raise ValueError(
            f"feature_names has {len(feature_names)} entries but the model "
            f"was trained on {len(importances)} features"
        )

    # Create a DataFrame for feature importances
    feature_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importances
    })

    # Sort by importance
    feature_importance_df = feature_importance_df.sort_values(
        by='Importance', ascending=False
    ).reset_index(drop=True)

    # Display top features
    print("\nTop features by importance:")
    print(feature_importance_df.head(top_n))

    # Calculate permutation importance (more reliable)
    print("\nCalculating permutation importance (this may take a while)...")
    preprocessed_X_test = model.named_steps['preprocessor'].transform(X_test)

    # The preprocessor can return a scipy sparse matrix (e.g. when one-hot
    # encoding dominates); permutation_importance needs dense input.
    if hasattr(preprocessed_X_test, "toarray"):
        preprocessed_X_test = preprocessed_X_test.toarray()

    perm_importance = permutation_importance(
        rf_classifier, preprocessed_X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
    )

    # Create a DataFrame for permutation importances
    perm_importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Permutation_Importance_Mean': perm_importance.importances_mean,
        'Permutation_Importance_Std': perm_importance.importances_std
    })

    # Sort by importance
    perm_importance_df = perm_importance_df.sort_values(
        by='Permutation_Importance_Mean', ascending=False
    ).reset_index(drop=True)

    # Display top features by permutation importance
    print("\nTop features by permutation importance:")
    print(perm_importance_df.head(top_n))

    # Plot feature importances
    plt.figure(figsize=(12, 8))
    sns.barplot(
        x='Importance', y='Feature',
        data=feature_importance_df.head(top_n),
        palette='viridis'
    )
    plt.title('Top Features by Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.show()

    # Plot permutation importances
    plt.figure(figsize=(12, 8))
    sns.barplot(
        x='Permutation_Importance_Mean', y='Feature',
        data=perm_importance_df.head(top_n),
        palette='viridis'
    )
    plt.title('Top Features by Permutation Importance')
    plt.tight_layout()
    plt.savefig('permutation_importance.png')
    plt.show()

    # Return both importance DataFrames
    return feature_importance_df, perm_importance_df

Section 5: Recommendation and Model Persistence Functions

In [5]:
# Function to recommend people based on availability
def recommend_available_people(new_data, model, person_id_column=None, top_n=5):
    """
    Rank people by the model's predicted probability of the positive class.

    Args:
        new_data: DataFrame with features (same as training data)
        model: Trained pipeline exposing ``predict_proba``
        person_id_column: Column holding person identifiers. When given, it
            is removed from the features before scoring; otherwise the
            DataFrame index is used as the identifier.
        top_n: Number of top recommendations to return

    Returns:
        DataFrame with columns 'person_id' and 'availability_score',
        holding the ``top_n`` highest-scoring rows in descending order
    """
    print(f"Generating top {top_n} recommendations...")

    # Work on a copy so the caller's frame is never modified
    features = new_data.copy()

    if not person_id_column:
        # No identifier column: fall back to the index
        identifiers = features.index.values
    else:
        identifiers = features[person_id_column].values
        features = features.drop(columns=person_id_column)

    # Second probability column is taken as the positive class score
    positive_scores = model.predict_proba(features)[:, 1]

    scored = pd.DataFrame({
        'person_id': identifiers,
        'availability_score': positive_scores
    })
    ranked = scored.sort_values(by='availability_score', ascending=False)
    recommendations = ranked.head(top_n)

    print("Recommendations generated!")
    return recommendations

# Function to save the model and related artifacts
def save_model(model, feature_names, output_path='random_forest_availability_model.joblib'):
    """
    Persist the trained model together with its metadata.

    The file written by ``joblib.dump`` contains a dict with the keys
    'model', 'feature_names', 'creation_date' (timestamp taken at save time)
    and 'version' (fixed to '1.0').

    Args:
        model: Trained model
        feature_names: List of feature names
        output_path: Path to save the model
    """
    print(f"Saving model to {output_path}...")

    # Bundle the estimator with what is needed to interpret it later
    bundle = dict(
        model=model,
        feature_names=feature_names,
        creation_date=pd.Timestamp.now(),
        version='1.0',
    )
    joblib.dump(bundle, output_path)

    print("Model saved successfully!")

# Function to load the saved model
def load_model(model_path='random_forest_availability_model.joblib'):
    """
    Load a saved model.

    Two file layouts are accepted:
      * the dict bundle with a 'model' key (plus optional 'feature_names'
        and 'version'), as written by ``save_model``;
      * a bare estimator/pipeline dumped directly with ``joblib.dump``, in
        which case no feature names are available.

    SECURITY: ``joblib.load`` unpickles the file and can execute arbitrary
    code. Only load model files from a trusted source.

    Args:
        model_path: Path to the saved model

    Returns:
        Loaded model and feature names (``None`` when the file holds no
        feature names)
    """
    print(f"Loading model from {model_path}...")
    model_artifacts = joblib.load(model_path)

    if isinstance(model_artifacts, dict) and 'model' in model_artifacts:
        model = model_artifacts['model']
        feature_names = model_artifacts.get('feature_names')
        version = model_artifacts.get('version', 'unknown')
    else:
        # Bare estimator: the object itself is the model
        model = model_artifacts
        feature_names = None
        version = 'unknown'

    print(f"Model loaded successfully! Version: {version}")
    return model, feature_names

Section 6: Main Function and Example Usage

In [6]:
# Main function to run the entire pipeline
def main(file_path, target_column='assignment_valid', tune_hyperparams=True, save_model_path=None,
         balance=True):
    """
    Run the entire pipeline from data loading to model evaluation.

    Steps: load and preprocess, engineer features, optionally balance the
    classes, train and evaluate, analyse feature importance, optionally save.

    NOTE on ``balance``: balancing happens before the train/test split that
    ``build_and_train_model`` performs. Because upsampling duplicates rows,
    copies of one row can land in both train and test, which makes the
    reported metrics optimistic. Pass ``balance=False`` to evaluate on
    un-duplicated data.

    Args:
        file_path: Path to the data file
        target_column: Name of the target column
        tune_hyperparams: Whether to perform hyperparameter tuning
        save_model_path: Path to save the model (if None, model will not be saved)
        balance: Whether to upsample the classes before training
            (default True)

    Returns:
        Trained model, feature names, and evaluation metrics
    """
    # 1. Load and preprocess data
    df = load_and_preprocess_data(file_path)

    # 2. Engineer features
    df_engineered = engineer_features(df)

    # 3. Balance classes (optional, see the note in the docstring)
    if balance:
        df_training = balance_classes(df_engineered, target_column)
    else:
        df_training = df_engineered

    # 4. Build and train model
    model, feature_names, evaluation = build_and_train_model(
        df_training, target_column, test_size=0.2, tune_hyperparams=tune_hyperparams
    )

    # 5. Analyze feature importance (called for its printed and plotted output)
    analyze_feature_importance(
        model, feature_names, evaluation['X_test'], evaluation['y_test']
    )

    # 6. Save model if specified
    if save_model_path:
        save_model(model, feature_names, save_model_path)

    print("\nPipeline completed successfully!")
    return model, feature_names, evaluation

# Example usage - modify this with your actual file path and options
# NOTE(review): this is an absolute, machine-specific Windows path; the
# notebook will not run elsewhere until it is replaced (ideally with a path
# relative to a configurable data directory).
file_path = r"C:\Users\ashwi\Downloads\sample_test_data.json"  # Replace with your actual data file path

# Run the pipeline (uncomment when ready to run)
# main() returns the fitted pipeline, the transformed feature names and the
# evaluation dict.
# model, feature_names, evaluation = main(
#     file_path=file_path,
#     target_column='assignment_valid',  # Change to your target column name if different
#     tune_hyperparams=True,  # Set to False for faster results but potentially lower performance
#     save_model_path='random_forest_availability_model.joblib'
# )

# Make recommendations (example)
# The new data needs the same feature columns the model was trained on, plus
# the identifier column named in person_id_column.
# new_data = pd.read_csv("new_people_data.csv")  # Replace with your new data file
# recommendations = recommend_available_people(new_data, model, person_id_column='employee_id', top_n=10)
# print("\nTop recommendations:")
# print(recommendations)

In [34]:
import pandas as pd

# Load the scheduling dataset from a JSON file into the notebook-level `df`.
# NOTE(review): absolute, machine-specific path - replace before running on
# another machine.
df = pd.read_json(r"C:\Users\ashwi\OneDrive\Desktop\Sycamore\Hackathons\HackToFuture sjec\ML part\scheduling_dataset.json")

# If the JSON is nested or uses another layout, pass an explicit `orient`:
# df = pd.read_json("your_dataset.json", orient="records")  # or "index", "split", etc. depending on format

print(df.head())  # Quick sanity check that the file loaded as expected


  employee_id          employee_skills  employee_availability task_id  \
0         E51  [db, frontend, backend]     [0, 1, 2, 4, 5, 6]    T159   
1         E30     [frontend, ml, java]  [0, 1, 2, 3, 4, 5, 6]    T132   
2         E16  [java, backend, db, ui]              [1, 4, 6]     T33   
3         E38                 [python]        [0, 3, 4, 5, 6]    T188   
4         E19            [backend, db]           [0, 2, 3, 5]      T9   

  task_required_skills task_priority  task_duration_days  task_start_day  \
0               [java]        medium                   5               2   
1       [java, ui, ml]           low                   5               4   
2             [python]          high                   1               4   
3             [python]        medium                   1               5   
4               [java]           low                   4               3   

   rule_violated  assignment_valid  
0           True                 0  
1           True              

In [35]:
import pandas as pd

# Reload the scheduling dataset into `df`.
# NOTE(review): this repeats the load done in the preceding cell with the
# same hard-coded absolute path; one of the two cells looks redundant.
df = pd.read_json(r"C:\Users\ashwi\OneDrive\Desktop\Sycamore\Hackathons\HackToFuture sjec\ML part\scheduling_dataset.json")  # Replace with your actual path
print(df.head())  # Confirm it's loading properly


  employee_id          employee_skills  employee_availability task_id  \
0         E51  [db, frontend, backend]     [0, 1, 2, 4, 5, 6]    T159   
1         E30     [frontend, ml, java]  [0, 1, 2, 3, 4, 5, 6]    T132   
2         E16  [java, backend, db, ui]              [1, 4, 6]     T33   
3         E38                 [python]        [0, 3, 4, 5, 6]    T188   
4         E19            [backend, db]           [0, 2, 3, 5]      T9   

  task_required_skills task_priority  task_duration_days  task_start_day  \
0               [java]        medium                   5               2   
1       [java, ui, ml]           low                   5               4   
2             [python]          high                   1               4   
3             [python]        medium                   1               5   
4               [java]           low                   4               3   

   rule_violated  assignment_valid  
0           True                 0  
1           True              

In [22]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

def preprocess_dataframe(df, fit_encoders=True, encoders=None):
    """
    Multi-hot encode the three list-valued columns of the scheduling data.

    'employee_skills', 'employee_availability' and 'task_required_skills'
    are each expanded into one indicator column per class, prefixed with
    'skill_', 'avail_day_' and 'task_req_' respectively. The three source
    columns plus 'employee_id' and 'task_id' are dropped (when present), and
    the indicator columns are appended to the remaining ones.

    Args:
        df: Input DataFrame; it is copied, not modified.
        fit_encoders: When True the encoders are fitted on ``df`` before
            transforming; when False the given encoders are only applied.
        encoders: Dict with keys 'skills', 'availability' and 'task_skills'.
            When None, three new MultiLabelBinarizer objects are created.

    Returns:
        Tuple of (encoded DataFrame with a fresh 0..n-1 index, encoders dict)
    """
    frame = df.copy()

    if encoders is None:
        encoders = {
            key: MultiLabelBinarizer()
            for key in ("skills", "availability", "task_skills")
        }

    # (encoder key, source column, output column prefix) in output order
    specs = (
        ("skills", "employee_skills", "skill_"),
        ("availability", "employee_availability", "avail_day_"),
        ("task_skills", "task_required_skills", "task_req_"),
    )

    indicator_frames = []
    for key, source_col, prefix in specs:
        binarizer = encoders[key]
        if fit_encoders:
            matrix = binarizer.fit_transform(frame[source_col])
        else:
            matrix = binarizer.transform(frame[source_col])
        labels = [f"{prefix}{cls}" for cls in binarizer.classes_]
        indicator_frames.append(pd.DataFrame(matrix, columns=labels))

    # Remove the encoded sources and the identifier columns
    unwanted = ('employee_skills', 'employee_availability', 'task_required_skills', 'employee_id', 'task_id')
    frame = frame.drop(columns=[name for name in unwanted if name in frame.columns])

    # The indicator frames use a default index, so align the base frame to it
    df_final = pd.concat([frame.reset_index(drop=True), *indicator_frames], axis=1)

    return df_final, encoders


In [52]:
import pandas as pd
import numpy as np
import json
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin

# 🔧 Name of the label column; used as the default `label_column` below
TARGET_COLUMN = 'assignment_valid'

# 🚀 Load the JSON dataset into the notebook-level `df`
# NOTE(review): absolute, machine-specific path - replace before running on
# another machine.
df = pd.read_json(r"C:\Users\ashwi\OneDrive\Desktop\Sycamore\Hackathons\HackToFuture sjec\ML part\scheduling_dataset.json")

# Show the raw dtypes so list-valued (object) columns can be spotted
print("🔍 Column types BEFORE processing:")
print(df.dtypes)

# 🧼 Custom transformer for multi-hot encoding list fields
class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """
    Multi-hot encode one column of comma-joined strings.

    The transformer expects a single-column DataFrame whose cells look like
    ``"db,frontend,backend"``. Each distinct token seen during ``fit``
    becomes one 0/1 output column named ``<column>__<token>``.

    ``fit`` and ``transform`` share one tokeniser, so both treat missing or
    non-string cells as "no tokens" and both ignore empty tokens. An empty
    string therefore yields an all-zero row instead of a spurious ``''``
    class.
    """

    def __init__(self):
        self.encoder = MultiLabelBinarizer()
        self.column_name = None

    @staticmethod
    def _tokenize(series):
        """Return one token list per cell; missing, non-string or empty cells give []."""
        return [
            [token for token in cell.split(',') if token] if isinstance(cell, str) else []
            for cell in series
        ]

    def fit(self, X, y=None):
        """Learn the token vocabulary from the first column of ``X``."""
        self.column_name = X.columns[0]
        self.encoder.fit(self._tokenize(X[self.column_name]))
        return self

    def transform(self, X):
        """Return a 0/1 DataFrame with one column per learned token, indexed like ``X``."""
        col = self.column_name
        transformed = self.encoder.transform(self._tokenize(X[col]))
        new_cols = [f"{col}__{c}" for c in self.encoder.classes_]
        return pd.DataFrame(transformed, columns=new_cols, index=X.index)

    def get_feature_names_out(self, input_features=None):
        """Return the output column names in the order produced by ``transform``."""
        return np.asarray(
            [f"{self.column_name}__{c}" for c in self.encoder.classes_], dtype=object
        )


# ✅ Robust preprocessing function
def _list_to_csv_string(value):
    """Normalise one list-like cell to a comma-joined string ('' when empty or missing)."""
    import ast  # local import: only needed for the stringified-list fallback

    if isinstance(value, (list, tuple)):
        return ','.join(map(str, value))
    if not isinstance(value, str):
        # None, NaN and other scalars carry no list content
        return ''

    text = value.strip()
    if text.startswith(('[', '(')):
        # Stringified list, e.g. "['db', 'ui']" after a CSV round trip.
        # Try the quote-normalised JSON form first, then a Python literal.
        attempts = (
            (json.loads, text.replace("'", '"')),
            (ast.literal_eval, text),
        )
        for parse, candidate in attempts:
            try:
                parsed = parse(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, (list, tuple)):
                return ','.join(map(str, parsed))

    # Anything else is treated as already comma-joined and kept, so running
    # the preprocessing twice does not erase the data.
    return text


def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Convert the list-valued columns to comma-joined strings.

    For each of 'employee_skills', 'employee_availability' and
    'task_required_skills' that is present, every cell is normalised:
      * a list or tuple becomes ``"a,b,c"``;
      * a stringified list (``"['a', 'b']"`` or ``'["a", "b"]'``) is parsed
        and joined the same way;
      * any other string is kept (stripped), which makes the function safe
        to apply to its own output;
      * everything else (None, NaN, numbers) becomes ``''``.

    Args:
        df: Input DataFrame; it is copied, not modified.

    Returns:
        A new DataFrame with the list columns converted to strings.
    """
    df = df.copy()
    list_columns = ['employee_skills', 'employee_availability', 'task_required_skills']
    for col in list_columns:
        if col in df.columns:
            df[col] = df[col].apply(_list_to_csv_string)
    return df

# 🏗️ Build and train model
def build_and_train_model(df_clean, label_column=TARGET_COLUMN):
    """
    Fit a grid-searched random forest pipeline on the cleaned scheduling data.

    The data is split 80/20 (stratified, random_state=42). Numeric columns
    are standardised, categorical columns one-hot encoded, and the three
    comma-joined list columns multi-hot encoded. A 3-fold grid search picks
    the forest settings, and a classification report for the held-out split
    is printed.

    NOTE(review): 'rule_violated' is used as a feature, and the recorded
    output of this cell shows perfect scores for class 0. This may mean the
    column is a near-copy of the label - confirm before trusting the metrics.
    NOTE(review): 'employee_id' and 'task_id' are one-hot encoded as
    features; check whether identifiers are meant to be predictive.

    Args:
        df_clean: DataFrame whose list columns are already comma-joined strings
        label_column: Name of the target column

    Returns:
        Tuple of (best fitted pipeline, list of raw input column names)

    Raises:
        ValueError: If ``label_column`` is not a column of ``df_clean``.
    """
    print("\n📊 Dataset shape:", df_clean.shape)

    # Guard clause: the label must exist before anything else is attempted
    if label_column not in df_clean.columns:
        raise ValueError(f"❌ Label column '{label_column}' not found. Available: {df_clean.columns.tolist()}")

    y = df_clean[label_column]
    X = df_clean.drop(columns=[label_column])

    # Hold out 20% for the final report, keeping the class ratio
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

    # Column roles (task_priority is categorical, not numeric)
    list_cols = ['employee_skills', 'employee_availability', 'task_required_skills']
    categorical_cols = ['employee_id', 'task_id', 'rule_violated', 'task_priority']
    numeric_cols = ['task_duration_days', 'task_start_day']

    print(f"\n🔍 Categorical Columns: {categorical_cols}")
    print(f"🔢 Numeric Columns: {numeric_cols}")
    print(f"🎯 List Columns (Multi-hot): {list_cols}")

    # One transformer per column group; each list column gets its own encoder
    transformer_steps = [
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('skills', MultiHotEncoder(), ['employee_skills']),
        ('availability', MultiHotEncoder(), ['employee_availability']),
        ('required_skills', MultiHotEncoder(), ['task_required_skills']),
    ]
    preprocessor = ColumnTransformer(transformers=transformer_steps)

    forest = RandomForestClassifier(random_state=42)
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', forest)
    ])

    # Hyperparameter grid: 2 x 2 x 2 x 1 = 8 candidates
    search_space = {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [None, 10],
        'classifier__min_samples_split': [2, 5],
        'classifier__max_features': ['sqrt']
    }

    grid = GridSearchCV(pipe, search_space, cv=3, verbose=1, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("✅ Best parameters:", grid.best_params_)

    # Evaluate the refitted best estimator on the held-out split
    y_pred = grid.predict(X_test)
    report = classification_report(y_test, y_pred)
    print("\n📋 Classification Report:\n", report)

    return grid.best_estimator_, X.columns.tolist()

# 🧹 Clean the data: list-valued columns become comma-joined strings
df_clean = preprocess_dataframe(df)

# 🧠 Train the model; `used_features` holds the raw input column names
model, used_features = build_and_train_model(df_clean, label_column=TARGET_COLUMN)

# 💾 Save the fitted pipeline (the bare estimator, without a metadata dict)
joblib.dump(model, 'smart_scheduler_model.pkl')
print("✅ Model saved as 'smart_scheduler_model.pkl'")

🔍 Column types BEFORE processing:
employee_id              object
employee_skills          object
employee_availability    object
task_id                  object
task_required_skills     object
task_priority            object
task_duration_days        int64
task_start_day            int64
rule_violated              bool
assignment_valid          int64
dtype: object

📊 Dataset shape: (500, 10)

🔍 Categorical Columns: ['employee_id', 'task_id', 'rule_violated', 'task_priority']
🔢 Numeric Columns: ['task_duration_days', 'task_start_day']
🎯 List Columns (Multi-hot): ['employee_skills', 'employee_availability', 'task_required_skills']
Fitting 3 folds for each of 8 candidates, totalling 24 fits
✅ Best parameters: {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}

📋 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        95
          

In [24]:
import joblib
# Persist the fitted pipeline held in `model` to trained_model.pkl.
# NOTE(review): the training cell already writes the same object to
# 'smart_scheduler_model.pkl'; confirm that both files are needed.
joblib.dump(model, "trained_model.pkl")


['trained_model.pkl']

In [26]:
import json

# Save feature names used during training.
# `used_features` is the column list returned by build_and_train_model in the
# training cell. The name `features` used here before is not defined in the
# visible notebook, so a fresh Restart & Run All failed with a NameError.
with open("features.json", "w", encoding="utf-8") as f:
    json.dump(used_features, f)


In [31]:
import pandas as pd
import random

# Reproducibility and size settings for the generated sample
SEED = 42        # fixed seed so every run writes the same sample file
N_SAMPLES = 20   # number of employee/task rows to generate
random.seed(SEED)

# Sample values
skills = ["python", "sql", "java", "ml", "excel", "communication", "db", "frontend", "backend", "ui"]
weekdays = list(range(7))  # 0 = Monday, 6 = Sunday
priorities = ["low", "medium", "high"]

# One entry per column; each value is a list of N_SAMPLES items
data = {
    "employee_id": [f"emp_{i}" for i in range(1, N_SAMPLES + 1)],
    "employee_skills": [random.sample(skills, k=random.randint(1, 4)) for _ in range(N_SAMPLES)],
    "employee_availability": [random.sample(weekdays, k=random.randint(2, 5)) for _ in range(N_SAMPLES)],
    "task_id": [f"task_{i}" for i in range(1, N_SAMPLES + 1)],
    "task_required_skills": [random.sample(skills, k=random.randint(1, 2)) for _ in range(N_SAMPLES)],
    "task_priority": random.choices(priorities, k=N_SAMPLES),
    "task_duration_days": [random.randint(1, 7) for _ in range(N_SAMPLES)],
    "task_start_day": [random.choice(weekdays) for _ in range(N_SAMPLES)],
    "rule_violated": random.choices([True, False], k=N_SAMPLES),
    "assignment_valid": random.choices([0, 1], k=N_SAMPLES),
}

df = pd.DataFrame(data)
# CSV has no list type: the list columns are written as their string form
# (e.g. "['db', 'ui']") and must be parsed back when the file is read.
df.to_csv("sample_scheduling_dataset.csv", index=False)
print("✅ CSV file 'sample_scheduling_dataset.csv' generated successfully.")


✅ CSV file 'sample_scheduling_dataset.csv' generated successfully.
