# AI-Powered Task Management System
## Week 3: Priority Prediction and Workload Balancing

In [None]:
# Import necessary libraries
# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Model selection, estimators, preprocessing, metrics and text vectorization from scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
# joblib is used at the end of the notebook to persist the fitted objects
import joblib
import warnings
# NOTE(review): this silences every warning for the whole session, including
# ones that may point at real problems (e.g. undefined-metric warnings).
warnings.filterwarnings('ignore')

# Set style for plots
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release - confirm against the project's pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

In [None]:
# Read the cleaned task table, report its dimensions and preview the first rows
df = pd.read_csv('../data/cleaned_tasks.csv')
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
df.head(5)

## Feature Engineering for Priority Prediction

In [None]:
# Encode categorical variables
# One LabelEncoder per column is kept so that each fitted mapping can be
# persisted and reused independently later on.
le_category = LabelEncoder()
le_assigned_to = LabelEncoder()
le_status = LabelEncoder()
le_priority = LabelEncoder()

df['category_encoded'] = le_category.fit_transform(df['category'])
df['assigned_to_encoded'] = le_assigned_to.fit_transform(df['assigned_to'])
df['status_encoded'] = le_status.fit_transform(df['status'])
df['priority_encoded'] = le_priority.fit_transform(df['priority'])

# TF-IDF features for descriptions
# Empty descriptions come back from read_csv as NaN, and TfidfVectorizer
# raises on NaN documents, so missing values are replaced with empty strings
# and every entry is coerced to str before vectorizing.
description_text = df['processed_description'].fillna('').astype(str)
tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
tfidf_features = tfidf.fit_transform(description_text)

# NOTE(review): the vectorizer and encoders are fitted on the full dataset
# before the train/test split, so test-set vocabulary statistics leak into
# the features. Consider fitting on the training rows only.

# Combine all features
# The sparse TF-IDF matrix is densified so it can be stacked column-wise with
# the numeric/encoded columns; TF-IDF columns come first.
numerical_features = df[['estimated_hours', 'category_encoded', 'assigned_to_encoded', 'status_encoded']].values
X_combined = np.hstack((tfidf_features.toarray(), numerical_features))
y_priority = df['priority_encoded']

print("Combined feature shape:", X_combined.shape)
print("Priority classes:", le_priority.classes_)

In [None]:
# Hold out 20% of the rows for testing; stratifying on the target keeps the
# priority class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y_priority,
    test_size=0.2,
    random_state=42,
    stratify=y_priority,
)

for split_label, split_matrix in (("Training set shape:", X_train), ("Test set shape:", X_test)):
    print(split_label, split_matrix.shape)

## Random Forest for Priority Prediction

In [None]:
# Exhaustive 5-fold grid search over Random Forest hyper-parameters,
# scored by accuracy and parallelised across all available cores.
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

print("Best Random Forest parameters:", rf_grid.best_params_)
print("Best Random Forest CV score:", rf_grid.best_score_)

In [None]:
# Score the tuned Random Forest on the held-out test set
rf_best = rf_grid.best_estimator_
rf_pred = rf_best.predict(X_test)

print("\nRandom Forest Test Results:")
print("Accuracy:", accuracy_score(y_test, rf_pred))
# Precision, recall and F1 are averaged across classes, weighted by class support
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, rf_pred, average='weighted'))

print("\nClassification Report:")
print(classification_report(y_test, rf_pred, target_names=le_priority.classes_))

# Confusion matrix heatmap, labelled with the original priority names
cm = confusion_matrix(y_test, rf_pred)
class_names = le_priority.classes_
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_title('Confusion Matrix - Random Forest (Priority Prediction)')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()

## XGBoost for Priority Prediction

In [None]:
# Exhaustive 5-fold grid search over XGBoost hyper-parameters,
# scored by accuracy and parallelised across all available cores.
xgb_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
)

xgb_grid = GridSearchCV(
    estimator=xgb.XGBClassifier(random_state=42),
    param_grid=xgb_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
xgb_grid.fit(X_train, y_train)

print("Best XGBoost parameters:", xgb_grid.best_params_)
print("Best XGBoost CV score:", xgb_grid.best_score_)

In [None]:
# Score the tuned XGBoost model on the held-out test set
xgb_best = xgb_grid.best_estimator_
xgb_pred = xgb_best.predict(X_test)

print("\nXGBoost Test Results:")
print("Accuracy:", accuracy_score(y_test, xgb_pred))
# Precision, recall and F1 are averaged across classes, weighted by class support
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, xgb_pred, average='weighted'))

print("\nClassification Report:")
print(classification_report(y_test, xgb_pred, target_names=le_priority.classes_))

# Confusion matrix heatmap, labelled with the original priority names
cm = confusion_matrix(y_test, xgb_pred)
class_names = le_priority.classes_
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_title('Confusion Matrix - XGBoost (Priority Prediction)')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()

## Workload Balancing Logic

In [None]:
# Per-assignee workload: total estimated hours, number of tasks, and how many
# of those tasks carry the 'High' priority label.
workload = df.groupby('assigned_to').agg(
    estimated_hours=('estimated_hours', 'sum'),
    task_count=('task_id', 'count'),
    high_priority_count=('priority', lambda labels: (labels == 'High').sum()),
)

print("Current Workload Summary:")
workload

In [None]:
# Draw one bar chart per workload metric, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (workload column, bar colour, panel title, y-axis label) for each panel
panel_specs = [
    ('estimated_hours', 'skyblue', 'Total Estimated Hours by Assignee', 'Estimated Hours'),
    ('task_count', 'lightgreen', 'Task Count by Assignee', 'Task Count'),
    ('high_priority_count', 'salmon', 'High Priority Tasks by Assignee', 'High Priority Count'),
]

for panel_ax, (column, bar_color, panel_title, y_label) in zip(axes, panel_specs):
    workload[column].plot(kind='bar', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Assignee')
    panel_ax.set_ylabel(y_label)
    # Tilt assignee names so longer labels do not overlap
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Heuristic workload balancing function
def balance_workload(df, max_hours_per_person=40, max_tasks_per_person=10,
                     reassign_fraction=0.2, random_state=None):
    """
    Balance workload using a simple random-reassignment heuristic.

    Assignees whose summed ``estimated_hours`` or task count exceeds the
    given limits are treated as overloaded; all remaining assignees are
    candidate recipients. A random sample of the overloaded assignees' tasks
    is handed to randomly chosen recipients. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Task table with the columns ``assigned_to``, ``estimated_hours`` and
        ``task_id``.
    max_hours_per_person : float, default 40
        Total estimated hours above which an assignee counts as overloaded.
    max_tasks_per_person : int, default 10
        Task count above which an assignee counts as overloaded.
    reassign_fraction : float, default 0.2
        Share (between 0 and 1) of the overloaded assignees' tasks to move;
        the resulting count is rounded down.
    random_state : int or None, default None
        Seed for a reproducible reassignment. ``None`` draws from NumPy's
        global random state, as the function did before this parameter
        existed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``assigned_to`` updated for the moved tasks.

    Raises
    ------
    ValueError
        If ``reassign_fraction`` lies outside ``[0, 1]``.
    """
    if not 0 <= reassign_fraction <= 1:
        raise ValueError("reassign_fraction must be between 0 and 1")

    # The numpy.random module and a RandomState instance both expose
    # ``choice``, so either can serve as the source of randomness.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    balanced_df = df.copy()

    # Calculate current workload
    current_workload = df.groupby('assigned_to').agg({
        'estimated_hours': 'sum',
        'task_id': 'count'
    }).rename(columns={'task_id': 'task_count'})

    # Find overloaded assignees: over either limit
    overloaded = current_workload[
        (current_workload['estimated_hours'] > max_hours_per_person) |
        (current_workload['task_count'] > max_tasks_per_person)
    ]

    print("Overloaded assignees:")
    print(overloaded)

    # Everyone within both limits can receive tasks
    underloaded = current_workload[
        (current_workload['estimated_hours'] <= max_hours_per_person) &
        (current_workload['task_count'] <= max_tasks_per_person)
    ]

    print("\nUnderloaded assignees:")
    print(underloaded)

    # Nothing to move, or nobody to move it to
    if overloaded.empty or underloaded.empty:
        return balanced_df

    # For demonstration, tasks are reassigned at random.
    # NOTE: recipients are not re-checked against the limits after receiving
    # tasks, so a recipient can itself end up overloaded.
    overloaded_tasks = balanced_df[balanced_df['assigned_to'].isin(overloaded.index)]
    underloaded_assignees = underloaded.index.tolist()

    reassign_count = int(len(overloaded_tasks) * reassign_fraction)
    reassign_indices = rng.choice(overloaded_tasks.index, reassign_count, replace=False)

    for idx in reassign_indices:
        new_assignee = rng.choice(underloaded_assignees)
        balanced_df.loc[idx, 'assigned_to'] = new_assignee
        print(f"Reassigned task {balanced_df.loc[idx, 'task_id']} to {new_assignee}")

    return balanced_df

# Run the heuristic on the full task table
balanced_df = balance_workload(df)

# Rebuild the per-assignee summary from the rebalanced assignments so it can
# be compared with the summary computed before balancing.
balanced_workload = balanced_df.groupby('assigned_to').agg(
    estimated_hours=('estimated_hours', 'sum'),
    task_count=('task_id', 'count'),
    high_priority_count=('priority', lambda labels: (labels == 'High').sum()),
)

print("\nBalanced Workload Summary:")
balanced_workload

In [None]:
# Save the best model
import os

# Choose the model by cross-validated accuracy rather than assuming XGBoost
# won. CV scores are compared so the held-out test set is not used for model
# selection. XGBoost is kept on a tie.
if rf_grid.best_score_ > xgb_grid.best_score_:
    best_model_name, best_model = 'Random Forest', rf_grid.best_estimator_
else:
    best_model_name, best_model = 'XGBoost', xgb_grid.best_estimator_
print(f"Selected model: {best_model_name}")

# joblib.dump does not create missing directories
os.makedirs('../models', exist_ok=True)

# Persist the model together with every fitted transformer needed to rebuild
# the same feature matrix at prediction time.
joblib.dump(best_model, '../models/priority_predictor.pkl')
joblib.dump(le_priority, '../models/priority_encoder.pkl')
joblib.dump(tfidf, '../models/tfidf_vectorizer_priority.pkl')
joblib.dump(le_category, '../models/category_encoder.pkl')
joblib.dump(le_assigned_to, '../models/assignee_encoder.pkl')
joblib.dump(le_status, '../models/status_encoder.pkl')

print("Models saved successfully!")

# Save balanced dataset
balanced_df.to_csv('../data/balanced_tasks.csv', index=False)
print("Balanced dataset saved!")