# AI-Powered Task Management System
## Week 1: Exploratory Data Analysis (EDA) and Data Cleaning

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
from wordcloud import WordCloud

# Download the NLTK resources used by the NLP preprocessing cell.
# 'punkt' covers older NLTK releases; NLTK >= 3.8.2 loads the tokenizer
# tables from 'punkt_tab' instead, so word_tokenize() raises LookupError
# unless that resource is fetched as well.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Set style for plots
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

In [None]:
# Read the raw task data and take a first look at its size and leading rows
df = pd.read_csv(filepath_or_buffer='../data/tasks.csv')
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
df.head(n=5)

In [None]:
# Basic information about the dataset
# df.info() writes its summary (columns, non-null counts, dtypes) straight to
# stdout and returns None, so it is called as a statement rather than printed.
print("Dataset info:")
df.info()

# include='all' summarizes every column instead of only the numeric ones.
# Kept as the cell's last expression so the notebook renders the table.
print("\nDescriptive statistics:")
df.describe(include='all')

In [None]:
# Count the missing entries in every column
print("Missing values:")
df.isna().sum()

In [None]:
# Data cleaning
# Convert due_date to datetime
# (no errors= argument is passed, so an unparseable value raises here)
df['due_date'] = pd.to_datetime(df['due_date'])

# Check for duplicates
# This only reports how many rows repeat an earlier row; nothing is dropped.
print("Duplicate rows:", df.duplicated().sum())

# Handle any missing values (if any)
# For this dataset, we'll assume no missing values, but in real scenarios
# assign the filled column back (fillna(..., inplace=True) on a selected
# column is chained assignment and is deprecated in pandas 2.x):
# df['estimated_hours'] = df['estimated_hours'].fillna(df['estimated_hours'].median())

# Last expression of the cell: the notebook displays the resulting dtypes.
print("Data types after cleaning:")
df.dtypes

## Exploratory Data Analysis

In [None]:
# Share of tasks at each priority level, drawn as a pie chart
priority_counts = df['priority'].value_counts()
plt.figure(figsize=(10, 6))
plt.pie(
    x=priority_counts.values,
    labels=priority_counts.index,
    autopct='%1.1f%%',
)
plt.title('Distribution of Task Priorities')
plt.show()

In [None]:
# Number of tasks in each status, drawn as a bar chart
status_counts = df['status'].value_counts()
plt.figure(figsize=(12, 6))
sns.barplot(
    x=status_counts.index,
    y=status_counts.values,
)
# Explicit labels replace whatever seaborn derived from the data
plt.xlabel('Status')
plt.ylabel('Count')
plt.title('Distribution of Task Statuses')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of tasks in each category, drawn as a bar chart
category_counts = df['category'].value_counts()
plt.figure(figsize=(12, 6))
sns.barplot(
    x=category_counts.index,
    y=category_counts.values,
)
# Explicit labels replace whatever seaborn derived from the data
plt.xlabel('Category')
plt.ylabel('Count')
plt.title('Distribution of Task Categories')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Histogram (10 bins) of estimated hours with a KDE curve overlaid
plt.figure(figsize=(10, 6))
sns.histplot(data=df['estimated_hours'], kde=True, bins=10)
plt.xlabel('Estimated Hours')
plt.ylabel('Frequency')
plt.title('Distribution of Estimated Hours')
plt.show()

In [None]:
# Tasks by assignee: number of tasks per person, drawn as a bar chart
plt.figure(figsize=(12, 6))
assignee_counts = df['assigned_to'].value_counts()
sns.barplot(x=assignee_counts.index, y=assignee_counts.values)
plt.title('Tasks Assigned to Each Person')
plt.xlabel('Assignee')
plt.ylabel('Number of Tasks')
# Rotate the tick labels as the status and category bar charts do, so that
# assignee names do not overlap along the x axis.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, shown as a heatmap
numerical_cols = df.select_dtypes(include=np.number).columns
correlation_matrix = df[numerical_cols].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
)
plt.title('Correlation Heatmap')
plt.show()

## NLP Preprocessing on Task Descriptions

In [None]:
# Shared NLTK helpers for text preprocessing: the English stopword list as a
# set (for fast membership tests) and a Porter stemmer instance
stop_words = {*stopwords.words('english')}
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize a task description into a string of stemmed tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, the result is tokenized with ``word_tokenize``,
    tokens found in the module-level ``stop_words`` set are dropped and the
    remaining tokens are stemmed with the module-level ``stemmer``.

    Args:
        text: Raw description. Missing values (``None``, ``NaN``) and any
            other non-string input are treated as an empty description.

    Returns:
        The stemmed tokens joined by single spaces, or ``''`` when the input
        is not a string.
    """
    # Missing cells reach Series.apply() as None / float NaN, which have no
    # .lower(); return early instead of raising AttributeError.
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and stem
    filtered_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Apply preprocessing to task descriptions
df['processed_description'] = df['task_description'].apply(preprocess_text)

# Show up to five before/after pairs. head(5) selects by position, so this
# also works when the frame has fewer than five rows or a non-default index,
# where a label lookup such as df[col][i] raises KeyError.
print("Original vs Processed descriptions:")
for original, processed in zip(
    df['task_description'].head(5),
    df['processed_description'].head(5),
):
    print(f"Original: {original}")
    print(f"Processed: {processed}")
    print()

In [None]:
# Build one text blob from all processed descriptions and render it as a word cloud
all_descriptions = ' '.join(df['processed_description'])
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
)
# generate() fills the layout on this instance and returns the same object
wordcloud.generate(all_descriptions)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Task Descriptions')
plt.axis('off')
plt.show()

In [None]:
# Persist the frame, including the processed_description column, without the index
df.to_csv(path_or_buf='../data/cleaned_tasks.csv', index=False)
print("Cleaned dataset saved to '../data/cleaned_tasks.csv'")