# Document Classification
## Table of Contents
1- Import Libraries

2- Utility Function

3- Helper Function

4- FCNN Model

5- DistilBERT Model

6- RoBERTa Model

7- Extracting Features

8- FCNN Model Training

9- DistilBERT Model Training

10- RoBERTa Model Training

11- Display the graph

12- Save Dataset in MongoDB

13- Save Models in MongoDB

## Import Libraries

In [19]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l1_l2

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from tqdm import tqdm

import pickle
import gridfs
from pymongo import MongoClient

# Download required NLTK data
# punkt     -> tokenizer models used by word_tokenize
# wordnet   -> lexicon used by WordNetLemmatizer
# stopwords -> word lists read by stopwords.words("english")
# NOTE(review): recent NLTK releases may also require 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Virus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Virus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Virus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Utility Function

In [21]:
# Utility Function
def setup_gpu():
    """Enable TensorFlow GPU memory growth, falling back to CPU on failure.

    Memory growth is switched on for every GPU TensorFlow reports, so
    TensorFlow does not reserve all GPU memory up front. If that
    configuration fails, the function tries to hide all GPUs from TensorFlow.
    It is best-effort: every outcome is reported with ``print`` and no
    exception is propagated to the caller.

    Note: the import cell of this notebook sets ``CUDA_VISIBLE_DEVICES`` to
    ``'-1'``, so in this notebook the "No GPU devices found" branch is the
    one that is expected to run.

    Returns:
        None
    """
    try:
        # Prevent TensorFlow from taking all GPU memory
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            print("GPU setup completed")
        else:
            print("No GPU devices found")
    except Exception as e:
        print(f"GPU setup failed: {e}")
        print("Falling back to CPU")
        try:
            # Disable GPU. This call can itself fail (for example once the
            # TensorFlow runtime has already initialised its devices), so it
            # is guarded to keep the fallback from crashing the notebook.
            tf.config.set_visible_devices([], 'GPU')
        except (RuntimeError, ValueError) as fallback_error:
            print(f"Could not hide GPU devices: {fallback_error}")


# Text preprocessing functions
def setup_nltk():
    """Build the NLTK resources used for text cleaning.

    Returns:
        tuple: ``(stop_words, lemmatizer)`` where ``stop_words`` is a ``set``
        built from ``stopwords.words("english")`` and ``lemmatizer`` is a new
        ``WordNetLemmatizer`` instance.
    """
    return set(stopwords.words("english")), WordNetLemmatizer()

# Data preparation
def prepare_data():
    """Download the BBC dataset and build the feature table used for training.

    Steps:
        1. Download the Kaggle dataset and read ``bbc_data.csv``.
        2. Add ``processed_data`` (output of ``preprocess_text`` on ``data``).
        3. Add one column per key returned by ``text_analysis_helper``.
        4. Add the two ratio features ``unique_word_ratio`` and
           ``preprocessing_reduction_ratio``.
        5. Add ``labels_encoded`` (integer-encoded ``labels``).

    Returns:
        tuple: ``(df_cleaned, label_encoder)`` — the enriched DataFrame and
        the fitted ``LabelEncoder``.
    """
    # Load dataset
    path = kagglehub.dataset_download("alfathterry/bbc-full-text-document-classification")
    df = pd.read_csv(os.path.join(path, "bbc_data.csv"))

    # Clean data
    df_cleaned = df.copy()
    df_cleaned['processed_data'] = df_cleaned['data'].apply(preprocess_text)

    # Extract features
    # NOTE: text_analysis_helper calls preprocess_text itself, so every
    # document is preprocessed twice (once above, once here).
    text_features = [text_analysis_helper(text) for text in tqdm(df_cleaned['data'], desc="Extracting features")]
    for key in text_features[0].keys():
        df_cleaned[key] = [f[key] for f in text_features]

    def safe_ratio(numerator, denominator):
        # A zero denominator would give NaN/inf; report 0 instead, matching
        # the "else 0" fallback text_analysis_helper uses for lexical_density.
        return (numerator / denominator.where(denominator != 0)).fillna(0.0)

    # Add ratio features
    # unique_word_ratio uses the same quantities as 'lexical_density'
    # (unique processed words / processed word count).
    df_cleaned['unique_word_ratio'] = safe_ratio(
        df_cleaned['unique_words'], df_cleaned['processed_word_count']
    )
    df_cleaned['preprocessing_reduction_ratio'] = safe_ratio(
        df_cleaned['processed_word_count'], df_cleaned['word_count']
    )

    # Encode labels
    label_encoder = LabelEncoder()
    df_cleaned['labels_encoded'] = label_encoder.fit_transform(df_cleaned['labels'])

    return df_cleaned, label_encoder


## Helper Function

In [55]:
# Helper Function
_NLTK_RESOURCES = None  # lazily built (stop_words, lemmatizer) pair, shared by all calls


def preprocess_text(text):
    """Lower-case, tokenize, filter and lemmatize a piece of text.

    Tokens are kept only if they are purely alphabetic and are not English
    stop words; each kept token is lemmatized. The stop-word set and the
    lemmatizer are created once (on first use) and reused, instead of being
    rebuilt for every document.

    Args:
        text: Input text; it is converted with ``str()`` before processing.

    Returns:
        str: The kept, lemmatized tokens joined by single spaces.
    """
    global _NLTK_RESOURCES
    if _NLTK_RESOURCES is None:
        _NLTK_RESOURCES = setup_nltk()
    stop_words, lemmatizer = _NLTK_RESOURCES

    tokens = word_tokenize(str(text).lower())
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

def text_analysis_helper(text):
    """Compute simple length and vocabulary statistics for one document.

    The input is converted with ``str()`` once, so every statistic is
    computed on the same string and non-string inputs (e.g. missing values)
    do not raise.

    Args:
        text: The raw document.

    Returns:
        dict: with keys
            - ``length``: characters in the raw text
            - ``processed_length``: characters in the preprocessed text
            - ``word_count``: whitespace-separated tokens in the raw text
            - ``processed_word_count``: tokens left after preprocessing
            - ``avg_word_length``: mean length of processed tokens (0 if none)
            - ``unique_words``: distinct processed tokens
            - ``capital_letters``: upper-case characters in the raw text
            - ``punctuation_count``: occurrences of ``. , ! ?`` in the raw text
            - ``stopwords_removed``: ``word_count - processed_word_count``
              (counts every dropped token, not only stop words)
            - ``lexical_density``: ``unique_words / processed_word_count``
              (0 if there are no processed tokens)
    """
    text = str(text)
    processed_text = preprocess_text(text)
    words = processed_text.split()
    raw_words = text.split()
    unique_words = set(words)

    return {
        'length': len(text),
        'processed_length': len(processed_text),
        'word_count': len(raw_words),
        'processed_word_count': len(words),
        'avg_word_length': np.mean([len(word) for word in words]) if words else 0,
        'unique_words': len(unique_words),
        'capital_letters': sum(1 for c in text if c.isupper()),
        'punctuation_count': sum(1 for c in text if c in '.,!?'),
        'stopwords_removed': len(raw_words) - len(words),
        'lexical_density': len(unique_words) / len(words) if words else 0
    }

## FCNN Model

In [43]:
# FCNN
class NeuralNetworkClassifier:
    """Fully connected Keras classifier over numeric feature vectors.

    Architecture (in order):
        Input(input_dim) -> Dense(input_dim, linear) -> BatchNorm
        -> [Dense(512, relu, L1/L2) -> Dropout(0.4) -> BatchNorm]
        -> [Dense(256, relu, L1/L2) -> Dropout(0.3) -> BatchNorm]
        -> [Dense(128, relu, L1/L2) -> Dropout(0.2) -> BatchNorm]
        -> Dense(num_classes, softmax)

    The model is compiled with Adam (learning rate 0.0005) and
    ``sparse_categorical_crossentropy``, so labels are expected as integer
    class indices.

    Attributes:
        model: The compiled ``Sequential`` model.
    """

    def __init__(self, input_dim, num_classes):
        """Build, summarise and compile the network.

        Args:
            input_dim: Number of input features per sample.
            num_classes: Number of output classes (size of the softmax layer).
        """
        self.model = Sequential()

        # Input layer: declare the input shape with an explicit Input object
        # rather than the `input_shape=` layer argument, which Keras 3
        # deprecates for Sequential models. The Dense(input_dim) projection
        # itself is unchanged.
        self.model.add(tf.keras.Input(shape=(input_dim,)))
        self.model.add(Dense(input_dim))
        self.model.add(BatchNormalization())

        # Hidden layers: (units, dropout rate) per block, widest first.
        for units, dropout_rate in ((512, 0.4), (256, 0.3), (128, 0.2)):
            self.model.add(Dense(units, activation='relu',
                                 kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)))
            self.model.add(Dropout(dropout_rate))
            self.model.add(BatchNormalization())

        # Output layer
        self.model.add(Dense(num_classes, activation='softmax'))

        # Print the layer/parameter overview
        self.model.summary()

        # Compile model
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

    def train(self, X_train, y_train, epochs=100, batch_size=32):
        """Fit the model with early stopping and learning-rate reduction.

        20% of the supplied training data is held out by Keras as validation
        data (``validation_split=0.2``). Training stops after 5 epochs without
        ``val_loss`` improvement (best weights are restored), and the learning
        rate is multiplied by 0.2 after 3 such epochs.

        Args:
            X_train: Training features.
            y_train: Integer class labels for ``X_train``.
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size.

        Returns:
            The history object returned by ``model.fit``.
        """
        callbacks = [
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=5,
                restore_best_weights=True
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.2,
                patience=3
            )
        ]

        return self.model.fit(
            X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=1
        )

## DistilBERT Model

In [5]:
# Distilbert Model
class TextClassificationDataset(Dataset):
    """Torch dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of documents (each is converted with ``str``).
        labels: Indexable collection of class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer invoked once per item.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` (flattened tokenizer
            outputs) plus ``labels`` as a ``torch.long`` tensor.
        """
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        sample_text = str(self.texts[idx])
        sample_label = self.labels[idx]

        encoded = self.tokenizer(sample_text, **tokenizer_options)

        # The tokenizer output carries a leading batch axis; flatten() drops it.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

class TransformerClassifier:
    """Fine-tuning wrapper around a Hugging Face sequence-classification model.

    Loads the tokenizer and model for ``model_name``, moves the model to CUDA
    when available (CPU otherwise) and provides helpers to build data loaders,
    run one training epoch and evaluate.

    Attributes:
        model_name: Name of the pretrained checkpoint.
        tokenizer: Tokenizer loaded for ``model_name``.
        model: Classification model with ``num_labels`` outputs.
        device: ``torch.device`` the model lives on.
    """

    def __init__(self, num_labels, model_name="distilbert-base-uncased"):
        """Load tokenizer and model.

        Args:
            num_labels: Number of target classes.
            model_name: Pretrained checkpoint to load. Defaults to DistilBERT,
                the checkpoint this class originally hard-coded.
        """
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=num_labels
        )
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def create_data_loaders(self, X_train, y_train, X_test, y_test, batch_size=16):
        """Wrap the two splits in ``TextClassificationDataset`` loaders.

        Returns:
            tuple: ``(train_loader, valid_loader)``; only the training loader
            is shuffled.
        """
        train_dataset = TextClassificationDataset(X_train, y_train, self.tokenizer)
        valid_dataset = TextClassificationDataset(X_test, y_test, self.tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

        return train_loader, valid_loader

    def _forward_batch(self, batch):
        """Move one batch to the model's device and run the forward pass.

        Returns:
            tuple: ``(outputs, labels)`` where ``labels`` is on ``self.device``.
        """
        labels = batch['labels'].to(self.device)
        outputs = self.model(
            input_ids=batch['input_ids'].to(self.device),
            attention_mask=batch['attention_mask'].to(self.device),
            labels=labels
        )
        return outputs, labels

    def train_epoch(self, data_loader, optimizer):
        """Run one optimisation pass over ``data_loader``.

        Returns:
            float: Mean training loss per batch.
        """
        self.model.train()
        total_loss = 0

        for batch in tqdm(data_loader, desc='Training'):
            optimizer.zero_grad()

            outputs, _ = self._forward_batch(batch)

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        return total_loss / len(data_loader)

    def evaluate(self, data_loader):
        """Evaluate the model without gradient tracking.

        Returns:
            tuple: ``(mean_loss, accuracy)`` — mean loss per batch and the
            fraction of samples whose arg-max prediction equals the label.
        """
        self.model.eval()
        total_loss = 0
        predictions = []
        actual_labels = []

        with torch.no_grad():
            for batch in tqdm(data_loader, desc='Evaluating'):
                outputs, labels = self._forward_batch(batch)

                total_loss += outputs.loss.item()
                predictions.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
                actual_labels.extend(labels.cpu().numpy())

        accuracy = np.mean(np.array(predictions) == np.array(actual_labels))
        return total_loss / len(data_loader), accuracy

## RoBERTa Model

In [6]:
# RoBERTa Model
class RoBertaClassifier:
    """Fine-tuning wrapper for the ``roberta-base`` sequence classifier.

    Mirrors the interface of ``TransformerClassifier`` in this notebook:
    loader construction, one-epoch training, and evaluation.

    Attributes:
        model_name: Always ``"roberta-base"``.
        tokenizer: Tokenizer loaded for ``model_name``.
        model: Classification model with ``num_labels`` outputs.
        device: CUDA device when available, otherwise CPU.
    """

    def __init__(self, num_labels):
        """Load the RoBERTa tokenizer/model and place the model on ``device``."""
        self.model_name = "roberta-base"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, num_labels=num_labels
        )
        target = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(target)
        self.model.to(self.device)

    def create_data_loaders(self, X_train, y_train, X_test, y_test, batch_size=16):
        """Return ``(train_loader, valid_loader)``; only the former is shuffled."""
        train_loader = DataLoader(
            TextClassificationDataset(X_train, y_train, self.tokenizer),
            batch_size=batch_size,
            shuffle=True,
        )
        valid_loader = DataLoader(
            TextClassificationDataset(X_test, y_test, self.tokenizer),
            batch_size=batch_size,
            shuffle=False,
        )
        return train_loader, valid_loader

    def _run_model(self, batch):
        """Send one batch to ``self.device`` and return ``(outputs, labels)``."""
        on_device = {
            field: batch[field].to(self.device)
            for field in ('input_ids', 'attention_mask', 'labels')
        }
        return self.model(**on_device), on_device['labels']

    def train_epoch(self, data_loader, optimizer):
        """Train for one pass over ``data_loader``; return the mean batch loss."""
        self.model.train()
        running_loss = 0

        for batch in tqdm(data_loader, desc='Training RoBERTa'):
            optimizer.zero_grad()
            outputs, _ = self._run_model(batch)
            batch_loss = outputs.loss
            running_loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

        return running_loss / len(data_loader)

    def evaluate(self, data_loader):
        """Return ``(mean_loss, accuracy)`` computed without gradient tracking."""
        self.model.eval()
        running_loss = 0
        predicted, expected = [], []

        with torch.no_grad():
            for batch in tqdm(data_loader, desc='Evaluating RoBERTa'):
                outputs, labels = self._run_model(batch)
                running_loss += outputs.loss.item()
                predicted.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
                expected.extend(labels.cpu().numpy())

        # Fraction of samples whose arg-max class equals the label.
        matches = np.array(predicted) == np.array(expected)
        accuracy = np.mean(matches)
        return running_loss / len(data_loader), accuracy

## Extracting Features

In [65]:
# Configure TensorFlow's GPU usage (prints the outcome), then download the
# dataset and build the feature table plus the fitted label encoder.
setup_gpu()
print("Preparing data...")
df_cleaned, label_encoder = prepare_data()

No GPU devices found
Preparing data...


Extracting features: 100%|██████████| 2225/2225 [00:10<00:00, 205.33it/s]


## Data Preprocessing

In [67]:
# Preview the first rows, including the engineered feature columns.
df_cleaned.head()

Unnamed: 0,data,labels,processed_data,length,processed_length,word_count,processed_word_count,avg_word_length,unique_words,capital_letters,punctuation_count,stopwords_removed,lexical_density,unique_word_ratio,preprocessing_reduction_ratio,labels_encoded
0,Musicians to tackle US red tape Musicians gro...,entertainment,musician tackle u red tape musician group tack...,2254,1439,378,205,6.02439,148,87,35,173,0.721951,0.721951,0.542328,1
1,"U2s desire to be number one U2, who have won ...",entertainment,desire number one three prestigious grammy awa...,4799,2902,838,427,5.798595,317,157,93,411,0.742389,0.742389,0.509547,1
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment,rocker doherty fight rock singer pete doherty ...,2125,1383,358,195,6.097436,151,61,39,163,0.774359,0.774359,0.544693,1
3,Snicket tops US box office chart The film ada...,entertainment,snicket top u box office chart film adaptation...,1052,724,177,108,5.712963,84,49,23,69,0.777778,0.777778,0.610169,1
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment,ocean twelve raid box office ocean twelve crim...,1598,1082,251,160,5.76875,128,78,41,91,0.8,0.8,0.63745,1


In [71]:
# (rows, columns) of the enriched table.
df_cleaned.shape

(2225, 16)

In [73]:
# Summary statistics of the numeric columns.
df_cleaned.describe()

Unnamed: 0,length,processed_length,word_count,processed_word_count,avg_word_length,unique_words,capital_letters,punctuation_count,stopwords_removed,lexical_density,unique_word_ratio,preprocessing_reduction_ratio,labels_encoded
count,2225.0,2225.0,2225.0,2225.0,2225.0,2225.0,2225.0,2225.0,2225.0,2225.0,2225.0,2225.0,2225.0
mean,2262.262472,1480.731685,384.166292,211.814831,5.991931,145.226517,74.214382,36.395955,172.351461,0.706947,0.706947,0.556379,1.958202
std,1361.230919,858.380987,238.14189,123.26026,0.321922,69.031291,44.908434,24.27866,116.962699,0.069232,0.069232,0.040482,1.42831
min,502.0,333.0,89.0,47.0,4.776978,40.0,14.0,5.0,27.0,0.323671,0.323671,0.418803,0.0
25%,1448.0,950.0,246.0,137.0,5.766423,100.0,47.0,23.0,107.0,0.658892,0.658892,0.530815,1.0
50%,1965.0,1298.0,332.0,185.0,5.98913,131.0,65.0,32.0,148.0,0.704348,0.704348,0.556391,2.0
75%,2801.0,1841.0,472.0,262.0,6.216102,175.0,90.0,43.0,210.0,0.754545,0.754545,0.581015,3.0
max,25416.0,15743.0,4432.0,2184.0,7.00625,1076.0,639.0,428.0,2248.0,0.947368,0.947368,0.804444,4.0


In [87]:
# Column dtypes and non-null counts.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   data                           2225 non-null   object 
 1   labels                         2225 non-null   object 
 2   processed_data                 2225 non-null   object 
 3   length                         2225 non-null   int64  
 4   processed_length               2225 non-null   int64  
 5   word_count                     2225 non-null   int64  
 6   processed_word_count           2225 non-null   int64  
 7   avg_word_length                2225 non-null   float64
 8   unique_words                   2225 non-null   int64  
 9   capital_letters                2225 non-null   int64  
 10  punctuation_count              2225 non-null   int64  
 11  stopwords_removed              2225 non-null   int64  
 12  lexical_density                2225 non-null   f

In [91]:
# Pairwise correlation of the numeric columns only (text columns are excluded).
numeric_df = df_cleaned.select_dtypes(include=['number'])

numeric_df.corr()

Unnamed: 0,length,processed_length,word_count,processed_word_count,avg_word_length,unique_words,capital_letters,punctuation_count,stopwords_removed,lexical_density,unique_word_ratio,preprocessing_reduction_ratio,labels_encoded
length,1.0,0.99516,0.997,0.994629,0.058389,0.970447,0.793505,0.937801,0.98176,-0.507161,-0.507161,-0.163908,0.183174
processed_length,0.99516,1.0,0.987366,0.996387,0.09293,0.973657,0.805294,0.930353,0.960292,-0.524924,-0.524924,-0.089538,0.167985
word_count,0.997,0.987366,1.0,0.991781,0.006494,0.964368,0.79183,0.936808,0.990868,-0.49648,-0.49648,-0.200015,0.190388
processed_word_count,0.994629,0.996387,0.991781,1.0,0.018583,0.972889,0.821662,0.937707,0.965474,-0.529362,-0.529362,-0.095179,0.174149
avg_word_length,0.058389,0.09293,0.006494,0.018583,1.0,0.04101,-0.117675,-0.014635,-0.006361,0.016759,0.016759,0.088533,-0.09816
unique_words,0.970447,0.973657,0.964368,0.972889,0.04101,1.0,0.796916,0.912652,0.938229,-0.414305,-0.414305,-0.117179,0.18162
capital_letters,0.793505,0.805294,0.79183,0.821662,-0.117675,0.796916,1.0,0.811251,0.746304,-0.435587,-0.435587,0.086093,0.114178
punctuation_count,0.937801,0.930353,0.936808,0.937707,-0.014635,0.912652,0.811251,1.0,0.919192,-0.441101,-0.441101,-0.109504,0.125616
stopwords_removed,0.98176,0.960292,0.990868,0.965474,-0.006361,0.938229,0.746304,0.919192,1.0,-0.452994,-0.452994,-0.306937,0.204115
lexical_density,-0.507161,-0.524924,-0.49648,-0.529362,0.016759,-0.414305,-0.435587,-0.441101,-0.452994,1.0,1.0,-0.077857,-0.100969


## FCNN Model Training

In [45]:
# Feature columns for fully connected network
# These are the per-document statistics added by prepare_data().
# NOTE: 'lexical_density' and 'unique_word_ratio' are both computed as
# unique processed words / processed word count, so they carry the same values.
feature_columns = [
    'length', 'processed_length', 'word_count', 'processed_word_count',
    'avg_word_length', 'unique_words', 'capital_letters', 'punctuation_count',
    'stopwords_removed', 'lexical_density', 'unique_word_ratio',
    'preprocessing_reduction_ratio'
]
    
# Prepare features and labels for FCNN
X = df_cleaned[feature_columns].values
y = df_cleaned['labels_encoded'].values
    
# Split data for FCNN
# 80/20 split, stratified on the label, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
    
# Scale features for FCNN
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
    
print("\nTraining Fully Connected Neural Network...")
fcnn_classifier = NeuralNetworkClassifier(
    input_dim=len(feature_columns),
    num_classes=len(label_encoder.classes_)
)
# train() itself holds out validation_split=0.2 of the training rows for early stopping.
fcnn_history = fcnn_classifier.train(X_train_scaled, y_train)
    
# Evaluate FCNN
# fcnn_test_accuracy is reused by the summary and plotting cells further down.
fcnn_test_loss, fcnn_test_accuracy = fcnn_classifier.model.evaluate(X_test_scaled, y_test)
print(f"\nFCNN Test Accuracy: {fcnn_test_accuracy:.4f}")

No GPU devices found
Preparing data...


Extracting features: 100%|██████████| 2225/2225 [00:10<00:00, 208.67it/s]
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training Fully Connected Neural Network...


Epoch 1/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.2915 - loss: 2.8454 - val_accuracy: 0.3933 - val_loss: 2.3763 - learning_rate: 5.0000e-04
Epoch 2/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3950 - loss: 2.4373 - val_accuracy: 0.4045 - val_loss: 2.3310 - learning_rate: 5.0000e-04
Epoch 3/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.3822 - loss: 2.4724 - val_accuracy: 0.4157 - val_loss: 2.2812 - learning_rate: 5.0000e-04
Epoch 4/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4423 - loss: 2.3161 - val_accuracy: 0.4551 - val_loss: 2.2328 - learning_rate: 5.0000e-04
Epoch 5/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4309 - loss: 2.3051 - val_accuracy: 0.4691 - val_loss: 2.1849 - learning_rate: 5.0000e-04
Epoch 6/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0

## DistilBERT Model Training

In [8]:
# Transformer setup
# The transformer models are trained on the raw 'data' column, not on the
# engineered features. The split uses the same test_size, random_state and
# stratification as the FCNN cell; y_train / y_test are reassigned here.
X_text = df_cleaned['data'].values
y = df_cleaned['labels_encoded'].values
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)
    
print("\nTraining DistilBERT...")
distilbert_classifier = TransformerClassifier(num_labels=len(label_encoder.classes_))
# NOTE(review): the test split is passed as the validation loader, and the
# checkpoint below is selected on it, so there is no separate untouched
# test set for the transformer models — confirm this is intended.
distil_train_loader, distil_valid_loader = distilbert_classifier.create_data_loaders(
    X_train_text, y_train, X_test_text, y_test
)
    
distil_optimizer = torch.optim.AdamW(distilbert_classifier.model.parameters(), lr=2e-5)
best_accuracy_distil = 0
    
# Fine-tune for 3 epochs, evaluating after each one.
for epoch in range(3):
    print(f"\nDistilBERT Epoch {epoch + 1}/3")
    train_loss = distilbert_classifier.train_epoch(distil_train_loader, distil_optimizer)
    val_loss, val_accuracy = distilbert_classifier.evaluate(distil_valid_loader)
    
    print(f"DistilBERT Train Loss: {train_loss:.4f}")
    print(f"DistilBERT Val Loss: {val_loss:.4f}")
    print(f"DistilBERT Val Accuracy: {val_accuracy:.4f}")
        
    # Keep only the weights of the epoch with the highest validation accuracy.
    if val_accuracy > best_accuracy_distil:
        best_accuracy_distil = val_accuracy
        torch.save(distilbert_classifier.model.state_dict(), 'best_distilbert_model.pt')


Training DistilBERT...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



DistilBERT Epoch 1/3


Training: 100%|██████████| 112/112 [32:01<00:00, 17.16s/it]
Evaluating: 100%|██████████| 28/28 [02:10<00:00,  4.66s/it]


DistilBERT Train Loss: 0.6269
DistilBERT Val Loss: 0.1393
DistilBERT Val Accuracy: 0.9753

DistilBERT Epoch 2/3


Training: 100%|██████████| 112/112 [34:54<00:00, 18.70s/it]
Evaluating: 100%|██████████| 28/28 [02:38<00:00,  5.64s/it]


DistilBERT Train Loss: 0.0965
DistilBERT Val Loss: 0.0769
DistilBERT Val Accuracy: 0.9843

DistilBERT Epoch 3/3


Training: 100%|██████████| 112/112 [34:17<00:00, 18.37s/it]
Evaluating: 100%|██████████| 28/28 [02:21<00:00,  5.06s/it]

DistilBERT Train Loss: 0.0434
DistilBERT Val Loss: 0.0842
DistilBERT Val Accuracy: 0.9775





## RoBERTa Model Training

In [9]:
# Relies on X_train_text, X_test_text, y_train and y_test created in the
# DistilBERT training cell, so that cell must have been run first.
print("\nTraining RoBERTa...")
roberta_classifier = RoBertaClassifier(num_labels=len(label_encoder.classes_))
roberta_train_loader, roberta_valid_loader = roberta_classifier.create_data_loaders(
    X_train_text, y_train, X_test_text, y_test
)
    
roberta_optimizer = torch.optim.AdamW(roberta_classifier.model.parameters(), lr=2e-5)
best_accuracy_roberta = 0
    
# Fine-tune for 3 epochs, evaluating after each one.
for epoch in range(3):
    print(f"\nRoBERTa Epoch {epoch + 1}/3")
    train_loss = roberta_classifier.train_epoch(roberta_train_loader, roberta_optimizer)
    val_loss, val_accuracy = roberta_classifier.evaluate(roberta_valid_loader)
        
    print(f"RoBERTa Train Loss: {train_loss:.4f}")
    print(f"RoBERTa Val Loss: {val_loss:.4f}")
    print(f"RoBERTa Val Accuracy: {val_accuracy:.4f}")
        
    # Keep only the weights of the epoch with the highest validation accuracy.
    if val_accuracy > best_accuracy_roberta:
        best_accuracy_roberta = val_accuracy
        torch.save(roberta_classifier.model.state_dict(), 'best_roberta_model.pt')


Training RoBERTa...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



RoBERTa Epoch 1/3


Training RoBERTa: 100%|██████████| 112/112 [1:25:53<00:00, 46.01s/it]
Evaluating RoBERTa: 100%|██████████| 28/28 [05:32<00:00, 11.89s/it]


RoBERTa Train Loss: 0.4739
RoBERTa Val Loss: 0.1070
RoBERTa Val Accuracy: 0.9708

RoBERTa Epoch 2/3


Training RoBERTa: 100%|██████████| 112/112 [1:25:06<00:00, 45.60s/it]
Evaluating RoBERTa: 100%|██████████| 28/28 [04:35<00:00,  9.85s/it]


RoBERTa Train Loss: 0.0618
RoBERTa Val Loss: 0.1858
RoBERTa Val Accuracy: 0.9371

RoBERTa Epoch 3/3


Training RoBERTa: 100%|██████████| 112/112 [1:23:23<00:00, 44.67s/it]
Evaluating RoBERTa: 100%|██████████| 28/28 [04:15<00:00,  9.13s/it]


RoBERTa Train Loss: 0.0435
RoBERTa Val Loss: 0.0804
RoBERTa Val Accuracy: 0.9820


In [103]:
# Summary of the three models. The values come from the three training cells
# above; if those cells have not been run in the current kernel session this
# cell raises NameError.
print(f"\nBest DistilBERT Accuracy: {best_accuracy_distil:.4f}")
print(f"Best RoBERTa Accuracy: {best_accuracy_roberta:.4f}")
print(f"Best FCNN Accuracy: {fcnn_test_accuracy:.4f}")
print("\nTraining completed!")

NameError: name 'best_accuracy_distil' is not defined

## Display the graph

In [101]:
# Requires best_accuracy_distil, best_accuracy_roberta and fcnn_test_accuracy
# from the training cells above.
models = ['DistilBERT', 'RoBERTa', 'FCNN']
accuracies = [best_accuracy_distil, best_accuracy_roberta, fcnn_test_accuracy]

# Plotting the graph
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(models, accuracies, color=['skyblue', 'orange', 'green'], edgecolor='black')

# Adding details to the graph
ax.set_title('Model Accuracy Comparison', fontsize=16)
ax.set_ylabel('Accuracy', fontsize=14)
# Accuracy lies in [0, 1]; the extra headroom keeps the value labels, drawn
# 0.02 above each bar, inside the axes even for accuracies close to 1.
ax.set_ylim(0, 1.1)
for i, acc in enumerate(accuracies):
    ax.text(i, acc + 0.02, f"{acc:.2f}", ha='center', fontsize=12, color='black')

# Display the graph
fig.tight_layout()
plt.show()

NameError: name 'best_accuracy_distil' is not defined

## Save Dataset in MongoDB

In [None]:
# Access the database (create it if it doesn't exist)
db = client["DLDB"]

# Access the collection (create it if it doesn't exist)
collection = db["Dataset"]

# Convert DataFrame to a list of dictionaries
data = df.to_dict(orient="records")

# Insert the data into the collection
collection.insert_many(data)

print("Dataset successfully saved to MongoDB!")

## Save Models in MongoDB

In [105]:
# Persist all three trained models.
#
# Previously `model_document` was reassigned three times and only the last
# dictionary was inserted, so MongoDB received a single document labelled
# "RoBERTa Model" / "scikit-learn" that actually contained the pickled FCNN.
client = MongoClient("mongodb://localhost:27017/")  # Connect to local MongoDB
db = client['ml_models']  # Database
collection = db['models']  # Collection
fs = gridfs.GridFS(db)  # Chunked file storage for the large transformer checkpoints

# FCNN: pickle the wrapper object and store it inline in the document.
# Unpickling later requires the NeuralNetworkClassifier class to be defined.
model_binary = pickle.dumps(fcnn_classifier)
model_documents = [{
    "model_name": "FCNN Model",
    "framework": "tensorflow-keras",
    "model_binary": model_binary
}]

# Transformers: upload the best checkpoints written by the training cells.
# They are stored through GridFS because a single MongoDB document has a
# size limit that transformer weights are expected to exceed.
transformer_checkpoints = {
    "DistilBERT Model": "best_distilbert_model.pt",
    "RoBERTa Model": "best_roberta_model.pt",
}
for model_name, checkpoint_path in transformer_checkpoints.items():
    with open(checkpoint_path, "rb") as checkpoint_file:
        file_id = fs.put(checkpoint_file, filename=checkpoint_path, model_name=model_name)
    model_documents.append({
        "model_name": model_name,
        "framework": "pytorch-transformers",
        "gridfs_file_id": file_id
    })

collection.insert_many(model_documents)

print("Model saved successfully in MongoDB!")

Model saved successfully in MongoDB!
