# AI-Powered Task Management System
## Week 2: Feature Extraction and Task Classification

In [None]:
# Import necessary libraries
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
# NOTE(review): this silences every warning for the rest of the notebook,
# including scikit-learn metric and convergence warnings.
warnings.filterwarnings('ignore')

# Set style for plots
# 'seaborn-v0_8' only exists in newer Matplotlib releases; older ones ship the
# same sheet under the name 'seaborn'. Use the first name that is available so
# this cell does not raise on an older install, and fall back to 'default'.
plot_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(plot_style)
sns.set_palette('husl')

In [None]:
# Read the cleaned task dataset from disk and preview it
df = pd.read_csv(filepath_or_buffer='../data/cleaned_tasks.csv')
print("Dataset shape:", df.shape)
df.head(n=5)

## Feature Extraction using TF-IDF

In [None]:
# TF-IDF Vectorization
# pandas reads empty CSV fields back as NaN, and TfidfVectorizer raises on NaN
# documents, so coerce the text column to plain strings before vectorizing.
descriptions = df['processed_description'].fillna('').astype(str)

# NOTE(review): the vectorizer is fit on the full dataset before any
# train/test split, so IDF statistics include rows that are later used for
# evaluation. Consider fitting inside a Pipeline if unbiased scores matter.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(descriptions)

print("TF-IDF feature shape:", X_tfidf.shape)

# get_feature_names_out() is ordered alphabetically, so slicing its head does
# not yield the "top" terms. Rank terms by their total TF-IDF weight instead.
feature_names = tfidf_vectorizer.get_feature_names_out()
total_weights = np.asarray(X_tfidf.sum(axis=0)).ravel()
top_indices = total_weights.argsort()[::-1][:20]
print("Top 20 features:")
print(feature_names[top_indices])

## Task Classification using Naive Bayes and SVM

In [None]:
# Prepare target variables
# One encoder per target column so each keeps its own class <-> integer mapping.
le_priority, le_category, le_status = (LabelEncoder() for _ in range(3))

y_priority = le_priority.fit_transform(df['priority'])
y_category = le_category.fit_transform(df['category'])
y_status = le_status.fit_transform(df['status'])

# Show the classes each encoder learned.
for target_label, encoder in (
    ("Priority", le_priority),
    ("Category", le_category),
    ("Status", le_status),
):
    print(f"{target_label} classes:", encoder.classes_)

In [None]:
# Function to train and evaluate models
def train_evaluate_model(X, y, model, model_name, target_name,
                         class_names=None, test_size=0.2, random_state=42):
    """Fit ``model`` on a stratified train split and report held-out metrics.

    Prints accuracy, weighted precision/recall/F1 and a classification report,
    and shows a confusion-matrix heatmap for the test split.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` and ``model.fit``.
    y : array-like of class labels, one per row of ``X``.
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    model_name : str, used in printed headings and the plot title.
    target_name : str, used in printed headings and the plot title.
    class_names : sequence, optional
        Human-readable names for the classes, indexed by encoded label
        (e.g. ``LabelEncoder.classes_``). Assumes ``y`` holds the integers
        ``0 .. len(class_names) - 1``. When omitted, the report and heatmap
        show the raw label values, as before.
    test_size : float, default 0.2
        Fraction of the data held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    tuple
        ``(model, accuracy, precision, recall, f1)`` where ``model`` is the
        fitted estimator and the metrics are computed on the test split.
    """
    # Stratify so every class keeps the same proportion in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    # zero_division=0 yields the same values as the default for classes that
    # are never predicted, but without depending on warnings being silenced.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"\n{model_name} - {target_name} Classification Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    # With class names, pin the label order so report rows and matrix axes
    # line up with the names; otherwise keep scikit-learn's inferred labels.
    if class_names is not None:
        display_names = [str(name) for name in class_names]
        label_ids = list(range(len(display_names)))
        report = classification_report(
            y_test, y_pred, labels=label_ids, target_names=display_names, zero_division=0
        )
        cm = confusion_matrix(y_test, y_pred, labels=label_ids)
        tick_kwargs = {'xticklabels': display_names, 'yticklabels': display_names}
    else:
        report = classification_report(y_test, y_pred, zero_division=0)
        cm = confusion_matrix(y_test, y_pred)
        tick_kwargs = {}

    # Classification report
    print("\nClassification Report:")
    print(report)

    # Confusion matrix
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax, **tick_kwargs)
    ax.set_title(f'Confusion Matrix - {model_name} ({target_name})')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

    return model, accuracy, precision, recall, f1

In [None]:
# Priority target: fit both candidate classifiers on the same TF-IDF features
# and keep each fitted model together with its test-split metrics.
nb_priority = MultinomialNB()
svm_priority = SVC(kernel='linear', random_state=42)

# Naive Bayes
(nb_priority_model,
 nb_acc, nb_prec, nb_rec, nb_f1) = train_evaluate_model(
    X=X_tfidf, y=y_priority, model=nb_priority,
    model_name="Naive Bayes", target_name="Priority",
)

# Linear SVM
(svm_priority_model,
 svm_acc, svm_prec, svm_rec, svm_f1) = train_evaluate_model(
    X=X_tfidf, y=y_priority, model=svm_priority,
    model_name="SVM", target_name="Priority",
)

In [None]:
# Category target: fit both candidate classifiers on the same TF-IDF features
# and keep each fitted model together with its test-split metrics.
nb_category = MultinomialNB()
svm_category = SVC(kernel='linear', random_state=42)

# Naive Bayes
(nb_category_model,
 nb_cat_acc, nb_cat_prec, nb_cat_rec, nb_cat_f1) = train_evaluate_model(
    X=X_tfidf, y=y_category, model=nb_category,
    model_name="Naive Bayes", target_name="Category",
)

# Linear SVM
(svm_category_model,
 svm_cat_acc, svm_cat_prec, svm_cat_rec, svm_cat_f1) = train_evaluate_model(
    X=X_tfidf, y=y_category, model=svm_category,
    model_name="SVM", target_name="Category",
)

In [None]:
# Cross-validation scores
def cross_validate_model(X, y, model, model_name, target_name, cv=5):
    """Run k-fold cross-validation for ``model`` and print the accuracy scores.

    Parameters
    ----------
    X : feature matrix passed to ``cross_val_score``.
    y : array-like of class labels, one per row of ``X``.
    model : unfitted estimator to evaluate.
    model_name : str, used in the printed heading.
    target_name : str, used in the printed heading.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_val_score``; the default keeps the original
        5-fold behaviour.

    Returns
    -------
    numpy.ndarray
        The per-fold accuracy scores.
    """
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    print(f"\n{model_name} - {target_name} Cross-Validation Results:")
    print(f"CV Accuracy Scores: {cv_scores}")
    # The "+/-" figure is two standard deviations of the fold scores.
    print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    return cv_scores

# Priority target: score fresh, unfitted copies of both classifiers
nb_priority_cv = cross_validate_model(
    X=X_tfidf, y=y_priority, model=MultinomialNB(),
    model_name="Naive Bayes", target_name="Priority",
)
svm_priority_cv = cross_validate_model(
    X=X_tfidf, y=y_priority, model=SVC(kernel='linear', random_state=42),
    model_name="SVM", target_name="Priority",
)

# Category target: same two classifiers
nb_category_cv = cross_validate_model(
    X=X_tfidf, y=y_category, model=MultinomialNB(),
    model_name="Naive Bayes", target_name="Category",
)
svm_category_cv = cross_validate_model(
    X=X_tfidf, y=y_category, model=SVC(kernel='linear', random_state=42),
    model_name="SVM", target_name="Category",
)

In [None]:
# Save the best models
# Choose the winner per target from the mean cross-validation accuracy rather
# than assuming SVM is better. Ties keep SVM, which is what was saved before.
best_priority_model = (
    svm_priority_model
    if svm_priority_cv.mean() >= nb_priority_cv.mean()
    else nb_priority_model
)
best_category_model = (
    svm_category_model
    if svm_category_cv.mean() >= nb_category_cv.mean()
    else nb_category_model
)
print("Priority model selected:", type(best_priority_model).__name__)
print("Category model selected:", type(best_category_model).__name__)

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing any artifact.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# The vectorizer and encoders are saved with the classifiers because they are
# needed to turn new text into features and predicted integers back to labels.
joblib.dump(best_priority_model, models_dir / 'priority_classifier.pkl')
joblib.dump(best_category_model, models_dir / 'category_classifier.pkl')
joblib.dump(tfidf_vectorizer, models_dir / 'tfidf_vectorizer.pkl')
joblib.dump(le_priority, models_dir / 'priority_encoder.pkl')
joblib.dump(le_category, models_dir / 'category_encoder.pkl')

print("Models saved successfully!")

## Model Performance Summary

In [None]:
# Assemble one row per (model, target) pair from the test-split metrics
summary_rows = [
    ('Naive Bayes (Priority)', nb_acc, nb_prec, nb_rec, nb_f1),
    ('SVM (Priority)', svm_acc, svm_prec, svm_rec, svm_f1),
    ('Naive Bayes (Category)', nb_cat_acc, nb_cat_prec, nb_cat_rec, nb_cat_f1),
    ('SVM (Category)', svm_cat_acc, svm_cat_prec, svm_cat_rec, svm_cat_f1),
]
performance_summary = pd.DataFrame(
    summary_rows,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)

print("Model Performance Summary:")
performance_summary.round(4)