In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load dataset
df = pd.read_csv('C:\Office work\Task 2\emails.csv')

# Display the first few rows to understand the data
print("First few rows of the dataset:")
print(df.head())

# Fill missing values in 'Subject' and 'Body' columns
df['Subject'].fillna('', inplace=True)
df['Body'].fillna('', inplace=True)

# Function for text cleaning
def clean_text(text):
    text = text.lower()                          # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)             # Remove extra whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)      # Remove punctuation, numbers, special characters
    tokens = word_tokenize(text)                 # Tokenize
    tokens = [word for word in tokens if word not in stopwords.words('english')] # Remove stopwords
    return ' '.join(tokens)

# Apply the function to Subject and Body
df['cleaned_subject'] = df['Subject'].apply(clean_text)
df['cleaned_body'] = df['Body'].apply(clean_text)

# Combine the cleaned Subject and Body text for vectorization
df['combined_text'] = df['cleaned_subject'] + ' ' + df['cleaned_body']

# Use TF-IDF for feature extraction
vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features
X = vectorizer.fit_transform(df['combined_text'])
y = df['Category']

# Split dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize the model (Naive Bayes example)
model = MultinomialNB()

# Optional: Hyperparameter tuning with Grid Search
params = {'alpha': [0.1, 0.5, 1.0]}
grid_search = GridSearchCV(model, params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best model after tuning
best_model = grid_search.best_estimator_

# Predict on test data
y_pred = best_model.predict(X_test)

# Performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Important')
recall = recall_score(y_test, y_pred, pos_label='Important')
f1 = f1_score(y_test, y_pred, pos_label='Important')
conf_matrix = confusion_matrix(y_test, y_pred)

# Display the performance metrics
print("Model Performance Metrics:")
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

# Find misclassified samples
misclassified = (y_pred != y_test)
misclassified_df = pd.DataFrame({'Actual': y_test[misclassified], 'Predicted': y_pred[misclassified]})

print("Misclassified Samples:")
print(misclassified_df.head())


ParserError: Error tokenizing data. C error: Expected 4 fields in line 55, saw 5
