In [50]:
import numpy as np
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [4]:
df=pd.read_csv('/content/email_data.csv')

In [5]:
df.head()

Unnamed: 0,email_text,category
0,Subject: Network Maintenance on Friday\nDear T...,IT
1,"Subject: New Software Installation\nHi John,\n...",IT
2,Subject: Password Expiration Reminder\nDear Al...,IT
3,"Subject: Help Desk Ticket #1234\nHi Michael,\n...",IT
4,"Subject: IT Budget Meeting\nDear Team,\nOur IT...",IT


In [6]:
df.isna().sum()

Unnamed: 0,0
email_text,0
category,0


In [7]:
df['email_text'] = df['email_text'].str.strip()

In [8]:
df['email_text'] = df['email_text'].apply(lambda x: re.sub(r'^Subject:.*?\n', '', x, flags=re.IGNORECASE))

In [9]:
df['email_text'][0]

'Dear Team,\nThis is a reminder that our network team will be performing maintenance on Friday from 5 PM to 7 PM. Please save your work and log off the network during this time.\nBest, IT Department'

In [10]:
df['email_text'] = df['email_text'].str.replace(r'\n', ' ', regex=True).str.strip()

In [12]:
df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
HR,604
IT,601
Other,600


In [14]:
X = df['email_text']
y = df['category']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [16]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

Logistic Regression

In [17]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

In [41]:
y_pred = model.predict(X_test_vec)

In [42]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))

Accuracy: 0.9667590027700831
Classification Report:               precision    recall  f1-score   support

          HR       0.98      0.97      0.97       121
          IT       0.95      0.99      0.97       120
       Other       0.97      0.94      0.95       120

    accuracy                           0.97       361
   macro avg       0.97      0.97      0.97       361
weighted avg       0.97      0.97      0.97       361



Random Forest Classifier

In [22]:
model1 = RandomForestClassifier()
model1.fit(X_train_vec, y_train)

In [43]:
y_pred = model1.predict(X_test_vec)

In [44]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))

Accuracy: 0.9307479224376731
Classification Report:               precision    recall  f1-score   support

          HR       0.97      0.97      0.97       121
          IT       0.91      0.93      0.92       120
       Other       0.91      0.89      0.90       120

    accuracy                           0.93       361
   macro avg       0.93      0.93      0.93       361
weighted avg       0.93      0.93      0.93       361



XGBoost Classifier

In [45]:
model2 = XGBClassifier(use_label_encoder=False,enable_categorical=True,eval_metric='mlogloss')

In [49]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

In [51]:
model2.fit(X_train_vec, y_train_enc)

Parameters: { "use_label_encoder" } are not used.



In [52]:
y_pred = model2.predict(X_test_vec)

In [53]:
print("Accuracy:", accuracy_score(y_test_enc, y_pred))
print("\nClassification Report:\n", classification_report(y_test_enc, y_pred))

Accuracy: 0.9279778393351801

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96       121
           1       0.92      0.91      0.92       120
           2       0.91      0.90      0.90       120

    accuracy                           0.93       361
   macro avg       0.93      0.93      0.93       361
weighted avg       0.93      0.93      0.93       361



In [54]:
sample_email = ["To: All Staff Please be reminded to recycle paper, plastic, and glass in the designated recycling bins. Thanks, [Your Name]"]
sample_vec = vectorizer.transform(sample_email)
prediction = model.predict(sample_vec)
probs = model.predict_proba(sample_vec)
confidences = probs.max(axis=1)
print(probs)
print('Confidence:',confidences)
print("Predicted category:", prediction[0])

[[0.09340061 0.1712344  0.73536499]]
Confidence: [0.73536499]
Predicted category: Other


In [55]:
y_pred = model.predict(X_test_vec)
y_proba = model.predict_proba(X_test_vec)
confidence_scores = np.max(y_proba, axis=1)

output_list = []

for i in range(len(X_test)):
    output_list.append({
        "email_text": X_test.iloc[i],
        "predicted_category": y_pred[i],
        "confidence": round(confidence_scores[i], 2)
    })

In [60]:
len(output_list)

361

In [61]:
import json
for item in output_list[:5]:
    print(json.dumps(item, indent=2))

{
  "email_text": "Dear Employees, Please review the updated retirement plan documents and make any necessary changes to your contributions. Best, [HR Representative]",
  "predicted_category": "HR",
  "confidence": 0.91
}
{
  "email_text": "Hi Team, We will be reviewing the IT budget on Friday at 10 AM. Please come prepared with any questions or concerns. Best, IT Manager",
  "predicted_category": "IT",
  "confidence": 0.78
}
{
  "email_text": "Dear Team, We will be conducting a data backup on Friday at 5 PM. Please save your work and log off before then. Thanks, IT Department",
  "predicted_category": "IT",
  "confidence": 0.94
}
{
  "email_text": "Hello, A new printer has been installed on the 2nd floor. The printer's IP address is 192.168.1.100. Please update your printer settings accordingly. Best, IT",
  "predicted_category": "IT",
  "confidence": 0.62
}
{
  "email_text": "Dear Hiring Managers, Please ensure all new hires complete the mandatory onboarding program within their firs

In [64]:
import json
with open("email_predictions.json", "w", encoding="utf-8") as f:
    json.dump(output_list, f, indent=2, ensure_ascii=False)

print("Predictions saved to 'email_predictions.json'")


Predictions saved to 'email_predictions.json'


In [65]:
import pickle
with open("email_classifier_model.pkl", "wb") as f:
    pickle.dump(model, f)


with open("email_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)