In [5]:
# Use the %pip magic rather than a shell escape so the packages are installed
# into the environment of the running kernel, not whichever pip is first on PATH.
%pip install pandas
%pip install numpy



In [6]:
# Explicit magic: a bare `pip ...` only works while IPython automagic is enabled
# and no variable named `pip` shadows it.
%pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl.metadata (11 kB)
Collecting joblib>=1.1.1 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.3.2-cp38-cp38-win_amd64.whl (9.3 MB)
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   ---------------------------------------- 0.1/9.3 MB 1.1 MB/s eta 0:00:09
   - -------------------------------------- 0.4/9.3 MB 3.2 MB/s eta 0:00:03
   --- ------------------------------------ 0.7/9.3 MB 4.5 MB/s eta 0:00:02
   ---- ----------------------------------- 1.1/9.3 MB 5.5 MB/s eta 0:00:02
   ------ --------------------------------- 1.5/9.3 MB 5.9 MB/s eta 0:00:02
   -------- ------------------------------- 2.0/9.3 MB 6.2 MB/s eta 0:00:02
   ---------- -

In [7]:
import os  # Importing os to handle file paths
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Step 1: Read the e-mail CSV, located in the 'data' folder next to the
# current working directory's parent.
data_path = os.path.join(os.getcwd(), os.pardir, 'data', 'emails_dataset.csv')
data = pd.read_csv(filepath_or_buffer=data_path)

In [11]:
# Step 2: Define categories based on 'from' column
def categorize_email(email_sender):
    """Classify a sender address as 'student', 'researcher' or 'corporate'.

    The decision is a plain, case-sensitive substring test on the address:
    'student' is checked first, then 'researcher'. Anything else, including
    addresses containing 'corporate', falls into the 'corporate' default.

    Args:
        email_sender: The sender address. Non-string values (for example the
            NaN that pandas produces for an empty CSV cell, or None) are
            accepted and mapped to the default category instead of raising
            ``TypeError``.

    Returns:
        One of the strings 'student', 'researcher' or 'corporate'.
    """
    # `'x' in value` raises TypeError for floats/None, so guard first.
    if not isinstance(email_sender, str):
        return 'corporate'

    # Order matters: an address containing both keywords counts as 'student'.
    for keyword in ('student', 'researcher'):
        if keyword in email_sender:
            return keyword

    # Default category; explicit 'corporate' senders end up here as well.
    return 'corporate'


In [12]:
# Derive the 'category' column by running every sender address in the
# 'from' column through categorize_email.
data['category'] = data.loc[:, 'from'].apply(categorize_email)


In [13]:
# Step 3: Give each category name the integer class id used as training label
category_mapping = dict(student=0, researcher=1, corporate=2)

# Translate the textual 'category' column into the numeric 'label' column
category_column = data['category']
data['label'] = category_column.map(category_mapping)



In [15]:
# Print category names next to their numeric labels to eyeball the mapping
print(data.loc[:, ['category', 'label']])

       category  label
0       student      0
1       student      0
2       student      0
3       student      0
4       student      0
..          ...    ...
145  researcher      1
146  researcher      1
147  researcher      1
148  researcher      1
149  researcher      1

[150 rows x 2 columns]


In [17]:
# Step 4: Prepare data for training

# Concatenate subject and body into one text feature. Missing values are
# replaced by '' first: string + NaN would otherwise yield NaN, and that
# float would later be handed to the tokenizer instead of text.
data['combined_text'] = data['subject'].fillna('') + ' ' + data['body'].fillna('')

# Use the combined feature as input (X) and the numeric class id as target (y)
X = data['combined_text']
y = data['label']

# Hold out 20% of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
# Step 5: Tokenization
# Load the tokenizer belonging to the 'distilbert-base-uncased' checkpoint.
# It is kept as a module-level name because EmailDataset.__getitem__ below
# refers to `tokenizer` directly.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class EmailDataset(Dataset):
    """Torch dataset that tokenizes e-mail texts on access.

    Args:
        texts: Iterable of e-mail texts. It is copied into a list so that
            indexing is always positional, even when a pandas Series with a
            non-contiguous index (e.g. straight out of ``train_test_split``)
            is passed.
        labels: Iterable of integer class ids, one per text.
        tokenizer: Optional callable tokenizer. When omitted, the module-level
            ``tokenizer`` is looked up each time an item is fetched.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=512):
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        # None means "use the global tokenizer"; resolved lazily in __getitem__.
        self._tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it as a dict of tensors.

        Returns:
            dict with 'input_ids' and 'attention_mask' (both flattened from
            the tokenizer's batched output) and 'labels' (a ``torch.long``
            scalar tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Inside this method `tokenizer` is the module-level name.
        active_tokenizer = self._tokenizer if self._tokenizer is not None else tokenizer

        # Calling the tokenizer directly is the documented entry point and
        # takes the same arguments as encode_plus.
        encoding = active_tokenizer(
            text,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            max_length=self.max_length,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            # return_tensors='pt' adds a leading batch dimension; drop it so
            # the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }



Downloading vocab.txt: 100%|██████████| 226k/226k [00:00<00:00, 604kB/s] 
Downloading tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 47.7kB/s]
Downloading config.json: 100%|██████████| 483/483 [00:00<?, ?B/s] 


In [19]:
# Step 6: Wrap both splits in EmailDataset objects and batch them
train_dataset = EmailDataset(texts=X_train.tolist(), labels=y_train.tolist())
test_dataset = EmailDataset(texts=X_test.tolist(), labels=y_test.tolist())

# Training batches are shuffled; the test loader is built without a shuffle argument.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

In [20]:
# Step 7: Model training
NUM_EPOCHS = 10
LEARNING_RATE = 2e-5

# Seed PyTorch's global RNG before the model is created so that the randomly
# initialised classification head (and any shuffling that draws from the
# global generator) is repeatable between runs.
torch.manual_seed(42)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Prefer the GPU when one is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

# Build the optimizer only after the model sits on its final device, as the
# torch.optim documentation recommends.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Training Loop
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss as its first output
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Report the mean batch loss for the epoch
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')



Downloading pytorch_model.bin: 100%|██████████| 256M/256M [00:35<00:00, 7.55MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased 

Epoch 1, Loss: 1.0653200894594193
Epoch 2, Loss: 0.852732740342617
Epoch 3, Loss: 0.549541536718607
Epoch 4, Loss: 0.2967634052038193
Epoch 5, Loss: 0.16304591950029135
Epoch 6, Loss: 0.09636252466589212
Epoch 7, Loss: 0.06823247531428933
Epoch 8, Loss: 0.046611853409558535
Epoch 9, Loss: 0.037074119085446
Epoch 10, Loss: 0.02926712087355554


In [21]:
# Step 8: Evaluate the model
# Switch to evaluation mode and collect predicted / true class ids per sample.
model.eval()
predictions = []
true_labels = []

# No gradients are needed while only running forward passes
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs[0]

        # The predicted class is the index of the largest logit in each row
        batch_predictions = logits.argmax(dim=1).cpu().numpy()
        predictions.extend(batch_predictions)
        true_labels.extend(batch['labels'].numpy())


In [26]:
# Step 10: Save the model and tokenizer
# Target folder: <current working directory>/data/distilbert_email_model
data_root = os.path.join(os.getcwd(), 'data')
model_dir = os.path.join(data_root, 'distilbert_email_model')

# Create the folder (and parents) if needed; an existing folder is fine
os.makedirs(name=model_dir, exist_ok=True)


In [27]:
# Write the model first, then the tokenizer, into the same directory
for artefact in (model, tokenizer):
    artefact.save_pretrained(model_dir)
print(f'Model and tokenizer saved to {model_dir}')

Model and tokenizer saved to c:\Users\ishwa\Python_code\email-management-system\src\data\distilbert_email_model
