<a href="https://colab.research.google.com/github/Fatima8024/NLP/blob/main/text2vec_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-learn
!pip install pandas
import pandas as pd





In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [None]:
from sklearn.model_selection import train_test_split

# Read the headline/label table from disk
df = pd.read_csv('COVID Fake News Data.csv')  # Replace with your file path

# Preview the leading rows
print(df.head())

# Relative frequency of each label, to expose any class imbalance
print(df['outcome'].value_counts(normalize=True))

headlines = df['headlines']
outcomes = df['outcome']

# Stage 1: keep 70% for training and hold out 30%. Stratifying on the label
# keeps the class proportions the same on both sides of the split.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    headlines,
    outcomes,
    test_size=0.3,
    stratify=outcomes,
    random_state=42,  # fixed seed so the split is reproducible
)

# Stage 2: halve the held-out 30% into validation (15%) and test (15%)
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42,
)

# Report how many rows landed in each split
for split_name, split_text in (
    ("Training", train_text),
    ("Validation", val_text),
    ("Testing", test_text),
):
    print(f"{split_name} size: {len(split_text)}")


                                           headlines  outcome
0  A post claims compulsory vacination violates t...        0
1  A photo claims that this person is a doctor wh...        0
2  Post about a video claims that it is a protest...        0
3  All deaths by respiratory failure and pneumoni...        0
4  The dean of the College of Biologists of Euska...        0
outcome
0    0.953534
1    0.046466
Name: proportion, dtype: float64
Training size: 7140
Validation size: 1530
Testing size: 1531


In [None]:
# TF-IDF featuriser: English stop words removed, at most 5000 features kept.
# NOTE: the models defined later in this notebook are built with
# input_size=5000, so the matrices produced here must have 5000 columns.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit on the training headlines only, then convert the sparse result to a
# dense array
train_tfidf = tfidf_vectorizer.fit_transform(train_text).toarray()

# Held-out splits reuse the already-fitted vectorizer (transform, not fit)
val_tfidf, test_tfidf = (
    tfidf_vectorizer.transform(held_out).toarray()
    for held_out in (val_text, test_text)
)

for split_name, matrix in (
    ("Training", train_tfidf),
    ("Validation", val_tfidf),
    ("Testing", test_tfidf),
):
    print(f"{split_name} TF-IDF shape: {matrix.shape}")


Training TF-IDF shape: (7140, 5000)
Validation TF-IDF shape: (1530, 5000)
Testing TF-IDF shape: (1531, 5000)


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Dense TF-IDF matrices -> float32 feature tensors, one per split
train_seq, val_seq, test_seq = (
    torch.tensor(matrix, dtype=torch.float32)
    for matrix in (train_tfidf, val_tfidf, test_tfidf)
)

# Label columns -> int64 class-index tensors, one per split
train_y, val_y, test_y = (
    torch.tensor(label_series.values, dtype=torch.long)
    for label_series in (train_labels, val_labels, test_labels)
)

# Pair each feature tensor with its labels
train_data = TensorDataset(train_seq, train_y)
val_data = TensorDataset(val_seq, val_y)
test_data = TensorDataset(test_seq, test_y)

# Mini-batch size shared by all three loaders
batch_size = 32

# Training batches are drawn in random order; validation and test batches
# are served in dataset order
train_dataloader = DataLoader(
    train_data, batch_size=batch_size, sampler=RandomSampler(train_data)
)
val_dataloader = DataLoader(
    val_data, batch_size=batch_size, sampler=SequentialSampler(val_data)
)
test_dataloader = DataLoader(
    test_data, batch_size=batch_size, sampler=SequentialSampler(test_data)
)


In [None]:
import torch
import torch.nn as nn # Import the torch.nn module
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.optim as optim # Import the optim module for optimizers
import torch.nn.functional as F # Import torch.nn.functional



class RNN_Text2Vec(nn.Module):
    """RNN classifier over one fixed-size feature vector per sample.

    Each input row is fed to the RNN as a sequence of length 1, the RNN output
    at that single step goes through a linear layer, and the result is
    normalised with log-softmax.

    Args:
        input_size: Number of features per sample. Defaults to 5000.
        hidden_size: Width of the RNN hidden state. Defaults to 128.
        output_size: Number of classes. Defaults to 2.

    The defaults reproduce the previously hard-coded architecture, so
    ``RNN_Text2Vec()`` builds the same layers as before and saved state dicts
    remain loadable.
    """

    def __init__(self, input_size=5000, hidden_size=128, output_size=2):
        super().__init__()
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)  # Output layer
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of feature vectors.

        Args:
            x: Float tensor, expected to be of shape ``(batch, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)`` holding log-probabilities
            (each row of ``exp(output)`` sums to 1).
        """
        # Insert a length-1 sequence axis (NOT a batch axis): with
        # batch_first=True the RNN expects (batch, seq_len, input_size).
        x = x.unsqueeze(1)
        rnn_out, _ = self.rnn(x)
        rnn_out = rnn_out[:, -1, :]  # Output at the last (here: only) time step
        logits = self.fc(rnn_out)
        return self.softmax(logits)

# Run on the GPU when CUDA is available, otherwise on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Build the RNN classifier and move its parameters to the chosen device
text2vec_rnn_model = RNN_Text2Vec()
text2vec_rnn_model = text2vec_rnn_model.to(device)

# Adam over the RNN model's parameters with learning rate 1e-3
optimizer = optim.Adam(params=text2vec_rnn_model.parameters(), lr=1e-3)


In [None]:
def evaluate_text2vec():
    """Score the RNN classifier on the validation set.

    Uses the module-level ``text2vec_rnn_model``, ``val_dataloader``,
    ``val_data`` and ``device``. The model is switched to eval mode and no
    gradients are tracked.

    Returns:
        A ``(avg_loss, accuracy)`` pair: the sum of per-batch
        ``F.cross_entropy`` values divided by the number of batches, and the
        number of correct predictions divided by ``len(val_data)``.
    """
    text2vec_rnn_model.eval()
    running_loss = 0
    num_correct = 0

    with torch.no_grad():
        for features, targets in val_dataloader:
            features, targets = features.to(device), targets.to(device)

            outputs = text2vec_rnn_model(features)
            running_loss += F.cross_entropy(outputs, targets).item()

            # Predicted class = column holding the largest score in each row
            predicted = outputs.argmax(dim=1)
            num_correct += (predicted == targets).sum().item()

    return running_loss / len(val_dataloader), num_correct / len(val_data)



In [None]:
def train_text2vec():
    """Run one training epoch of the RNN classifier.

    Puts the module-level ``text2vec_rnn_model`` in training mode and performs
    one step of the module-level ``optimizer`` (whatever that name is bound to
    when this is called) per batch of ``train_dataloader``. The gradient norm
    is clipped to 1.0 before every step, and a progress line is printed every
    50 batches.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    text2vec_rnn_model.train()
    num_batches = len(train_dataloader)
    running_loss = 0

    for step, (features, targets) in enumerate(train_dataloader):
        # Progress line every 50 batches, skipping batch 0
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()  # drop gradients left over from the previous step
        outputs = text2vec_rnn_model(features)
        batch_loss = F.cross_entropy(outputs, targets)
        batch_loss.backward()

        # Cap the total gradient norm at 1.0 before applying the update
        torch.nn.utils.clip_grad_norm_(text2vec_rnn_model.parameters(), 1.0)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / num_batches


In [None]:



# Lowest validation loss seen so far; starting at +inf means the first epoch
# always produces a checkpoint
best_valid_loss = float('inf')

for epoch in range(10):  # Adjust epochs as needed
    print(f'\n Epoch {epoch + 1} / 10')

    # One pass over the training set, then score the validation set
    train_loss = train_text2vec()
    valid_loss, _ = evaluate_text2vec()

    # Keep only the weights from the epoch with the lowest validation loss
    is_new_best = valid_loss < best_valid_loss
    if is_new_best:
        best_valid_loss = valid_loss
        torch.save(text2vec_rnn_model.state_dict(), 'best_text2vec_rnn_model.pt')

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')



 Epoch 1 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.090
Validation Loss: 0.109

 Epoch 2 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.042
Validation Loss: 0.113

 Epoch 3 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.022
Validation Loss: 0.122

 Epoch 4 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.013
Validation Loss: 0.134

 Epoch 5 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.008
Validation Loss: 0.146

 Epoch 6 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.005
Validation Loss: 0.158

 Epoch 7 / 10
 

In [None]:
import numpy as np # Import numpy

# Restore the checkpoint written by the training loop and switch to eval mode
best_rnn_state = torch.load('best_text2vec_rnn_model.pt', weights_only=True)
text2vec_rnn_model.load_state_dict(best_rnn_state)
text2vec_rnn_model.eval()

# Collect the model output row of every test sample, in loader order
all_preds = []
with torch.no_grad():
    for batch in test_dataloader:
        seq, labels = (tensor.to(device) for tensor in batch)
        preds = text2vec_rnn_model(seq).detach().cpu().numpy()
        all_preds.extend(preds)

# Predicted class = index of the largest score in each row
final_preds = np.asarray(all_preds).argmax(axis=1)

# Evaluate model performance
print(classification_report(test_y, final_preds))


              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1460
           1       0.79      0.42      0.55        71

    accuracy                           0.97      1531
   macro avg       0.88      0.71      0.77      1531
weighted avg       0.96      0.97      0.96      1531



In [None]:
class Text2Vec_LSTM(nn.Module):
    """LSTM classifier over one fixed-size feature vector per sample.

    Each input row is fed to the LSTM as a sequence of length 1; the LSTM
    output at that step goes through a linear layer and a log-softmax.

    Args:
        input_size: Number of features per sample (default 5000).
        hidden_size: Width of the LSTM hidden state (default 128).
        output_size: Number of classes (default 2).
    """

    def __init__(self, input_size=5000, hidden_size=128, output_size=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)
        self.softmax = nn.LogSoftmax(dim=1)  # log-probabilities over classes

    def forward(self, x):
        """Return per-class log-probabilities for a batch of feature vectors.

        Args:
            x: Float tensor, expected to be of shape ``(batch, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)`` of log-probabilities.
        """
        # Insert a length-1 sequence axis: (batch, input_size) -> (batch, 1, input_size)
        sequence = x.unsqueeze(1)
        step_outputs, _state = self.lstm(sequence)
        final_step = step_outputs[:, -1, :]  # output at the last (only) time step
        return self.softmax(self.fc(final_step))

# Build the LSTM classifier and place it on the selected device (GPU if available)
text2vec_lstm_model = Text2Vec_LSTM(
    input_size=5000,
    hidden_size=128,
    output_size=2,
).to(device)


In [None]:
# Inverse-frequency class weights computed from the training labels:
# weight_c = N / (2 * n_c), where n_c is the number of training samples of
# class c and N the total (the 2 presumably stands for the number of classes).
class_counts = torch.bincount(train_y)  # occurrences of each class index
total_samples = len(train_y)
weights = (total_samples / (2 * class_counts)).to(dtype=torch.float32, device=device)

# NOTE(review): this rebinds the module-level name `optimizer`, which
# train_text2vec() also reads; re-running the RNN training after this cell
# would step the LSTM's optimizer instead of the RNN's.
# NOTE(review): lr=1e-5 is 100x smaller than the 1e-3 used for the RNN — confirm
# this is intended.
optimizer = torch.optim.AdamW(params=text2vec_lstm_model.parameters(), lr=1e-5)

# Class-weighted negative log-likelihood on the model's log-softmax outputs
cross_entropy = nn.NLLLoss(weight=weights)


In [None]:
def train_text2vec_lstm():
    """Run one training epoch of the LSTM classifier.

    Puts the module-level ``text2vec_lstm_model`` in training mode and performs
    one step of the module-level ``optimizer`` per batch of
    ``train_dataloader``, using the module-level ``cross_entropy`` criterion.
    The gradient norm is clipped to 1.0 before every step, and a progress line
    is printed every 50 batches.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    text2vec_lstm_model.train()
    num_batches = len(train_dataloader)
    running_loss = 0

    for step, (features, targets) in enumerate(train_dataloader):
        # Progress line every 50 batches, skipping batch 0
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))

        features, targets = features.to(device), targets.to(device)

        optimizer.zero_grad()  # drop gradients left over from the previous step
        outputs = text2vec_lstm_model(features)
        batch_loss = cross_entropy(outputs, targets)
        batch_loss.backward()

        # Cap the total gradient norm at 1.0 before applying the update
        torch.nn.utils.clip_grad_norm_(text2vec_lstm_model.parameters(), 1.0)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / num_batches


In [None]:
def evaluate_text2vec_lstm():
    """Score the LSTM classifier on the validation set.

    Uses the module-level ``text2vec_lstm_model``, ``val_dataloader``,
    ``cross_entropy`` and ``device``. The model is switched to eval mode and
    no gradients are tracked.

    Returns:
        A ``(avg_loss, outputs)`` pair: the sum of per-batch loss values
        divided by the number of batches, and a NumPy array built from the
        model's output rows for every validation sample, in loader order.
    """
    text2vec_lstm_model.eval()
    running_loss = 0
    collected_rows = []

    with torch.no_grad():
        for features, targets in val_dataloader:
            features, targets = features.to(device), targets.to(device)

            outputs = text2vec_lstm_model(features)
            running_loss += cross_entropy(outputs, targets).item()

            # Move to host memory and append one row per sample
            collected_rows.extend(outputs.detach().cpu().numpy())

    mean_loss = running_loss / len(val_dataloader)
    return mean_loss, np.array(collected_rows)


In [None]:
# Lowest validation loss seen so far; starting at +inf means the first epoch
# always produces a checkpoint
best_valid_loss = float('inf')

for epoch in range(10):  # Adjust epochs as needed
    print(f'\n Epoch {epoch + 1} / 10')

    # One pass over the training set, then score the validation set
    train_loss = train_text2vec_lstm()
    valid_loss, _ = evaluate_text2vec_lstm()

    # Keep only the weights from the epoch with the lowest validation loss
    is_new_best = valid_loss < best_valid_loss
    if is_new_best:
        best_valid_loss = valid_loss
        torch.save(text2vec_lstm_model.state_dict(), 'best_text2vec_lstm_model.pt')

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')



 Epoch 1 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.690
Validation Loss: 0.689

 Epoch 2 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.688
Validation Loss: 0.687

 Epoch 3 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.687
Validation Loss: 0.686

 Epoch 4 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.686
Validation Loss: 0.685

 Epoch 5 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.684
Validation Loss: 0.684

 Epoch 6 / 10
  Batch    50  of    224.
  Batch   100  of    224.
  Batch   150  of    224.
  Batch   200  of    224.

Training Loss: 0.683
Validation Loss: 0.683

 Epoch 7 / 10
 

In [None]:
# Restore the checkpoint written by the training loop and switch to eval mode
best_lstm_state = torch.load('best_text2vec_lstm_model.pt', weights_only=True)
text2vec_lstm_model.load_state_dict(best_lstm_state)
text2vec_lstm_model.eval()

# Collect the model output row of every test sample, in loader order
all_preds = []
with torch.no_grad():
    for batch in test_dataloader:
        seq, labels = (tensor.to(device) for tensor in batch)
        preds = text2vec_lstm_model(seq).detach().cpu().numpy()
        all_preds.extend(preds)

# Predicted class = index of the largest score in each row
final_preds = np.asarray(all_preds).argmax(axis=1)

# Evaluate model performance; zero_division=0 reports 0 (without a warning)
# for any class that is never predicted
print(classification_report(test_y, final_preds, zero_division=0))


              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1460
           1       0.00      0.00      0.00        71

    accuracy                           0.95      1531
   macro avg       0.48      0.50      0.49      1531
weighted avg       0.91      0.95      0.93      1531



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def compute_test_metrics(model, test_dataloader, loss_fn=None):
    """Evaluate ``model`` on every batch of ``test_dataloader``.

    Args:
        model: Module whose forward pass returns one row of per-class scores
            per sample; the predicted class is the argmax over axis 1.
        test_dataloader: Iterable yielding ``(features, labels)`` batches.
        loss_fn: Optional callable ``loss_fn(outputs, labels)`` returning a
            scalar tensor. When omitted, the module-level ``cross_entropy``
            criterion is used, which is the previous behaviour.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, avg_loss)``. Precision,
        recall and F1 are support-weighted averages over the classes;
        ``avg_loss`` is the sum of per-batch losses divided by the number of
        batches.

    Note:
        ``zero_division=0`` is passed to the precision/recall/F1 scorers so a
        class that is never predicted contributes 0 without emitting an
        undefined-metric warning. The returned numbers are the same as with
        the default setting, which also substitutes 0 but warns.
    """
    if loss_fn is None:
        # Fall back to the notebook-level criterion
        loss_fn = cross_entropy

    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0

    with torch.no_grad():
        for batch in test_dataloader:
            seq, labels = (tensor.to(device) for tensor in batch)
            outputs = model(seq)

            # Accumulate the per-batch loss
            total_loss += loss_fn(outputs, labels).item()

            # Collect predicted class indices and true labels on the host
            all_preds.extend(np.argmax(outputs.detach().cpu().numpy(), axis=1))
            all_labels.extend(labels.detach().cpu().numpy())

    # Calculate metrics
    test_accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    avg_loss = total_loss / len(test_dataloader)

    return test_accuracy, precision, recall, f1, avg_loss


In [None]:
# Display labels, in the order compute_test_metrics returns its values
metric_labels = ("Accuracy", "Precision", "Recall", "F1-Score", "Loss Value")

# NOTE(review): compute_test_metrics scores both models with the module-level
# `cross_entropy` criterion (the class-weighted NLLLoss), whereas the RNN was
# trained with unweighted F.cross_entropy, so the RNN "Loss Value" is not on
# the same scale as its training/validation losses.

# Collect metrics for Text2Vec + RNN
text2vec_rnn_metrics = compute_test_metrics(text2vec_rnn_model, test_dataloader)
print("Text2Vec + RNN Metrics:")
for label, value in zip(metric_labels, text2vec_rnn_metrics):
    print(f"{label}: {value:.4f}")

# Collect metrics for Text2Vec + LSTM
text2vec_lstm_metrics = compute_test_metrics(text2vec_lstm_model, test_dataloader)
print("\nText2Vec + LSTM Metrics:")
for label, value in zip(metric_labels, text2vec_lstm_metrics):
    print(f"{label}: {value:.4f}")


Text2Vec + RNN Metrics:
Accuracy: 0.9680
Precision: 0.9640
Recall: 0.9680
F1-Score: 0.9633
Loss Value: 0.7081

Text2Vec + LSTM Metrics:
Accuracy: 0.9536
Precision: 0.9094
Recall: 0.9536
F1-Score: 0.9310
Loss Value: 0.6801


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:


# Side-by-side summary: one row per metric, one column per model
metric_names = ["Test Accuracy", "Precision", "Recall", "F1-Score", "Loss Value"]
comparison_table = pd.DataFrame(
    {
        "Metric": metric_names,
        "Text2Vec + RNN": text2vec_rnn_metrics,
        "Text2Vec + LSTM": text2vec_lstm_metrics,
    }
)

# Show the table with pandas' default text layout
print(comparison_table)


          Metric  Text2Vec + RNN  Text2Vec + LSTM
0  Test Accuracy        0.967995         0.953625
1      Precision        0.964049         0.909401
2         Recall        0.967995         0.953625
3       F1-Score        0.963329         0.930988
4     Loss Value        0.708111         0.680105


In [None]:
# Same comparison table as above, printed without the index column

# Rebuild the table: one row per metric, one column per model
metric_names = ["Test Accuracy", "Precision", "Recall", "F1-Score", "Loss Value"]
comparison_table = pd.DataFrame(
    {
        "Metric": metric_names,
        "Text2Vec + RNN": text2vec_rnn_metrics,
        "Text2Vec + LSTM": text2vec_lstm_metrics,
    }
)

# to_string(index=False) omits the 0..4 row labels from the printout
table_text = comparison_table.to_string(index=False)
print(table_text)

       Metric  Text2Vec + RNN  Text2Vec + LSTM
Test Accuracy        0.967995         0.953625
    Precision        0.964049         0.909401
       Recall        0.967995         0.953625
     F1-Score        0.963329         0.930988
   Loss Value        0.708111         0.680105


In [None]:
# Export the comparison table to a CSV file

# Rebuild the table: one row per metric, one column per model
metric_names = ["Test Accuracy", "Precision", "Recall", "F1-Score", "Loss Value"]
comparison_table = pd.DataFrame(
    {
        "Metric": metric_names,
        "Text2Vec + RNN": text2vec_rnn_metrics,
        "Text2Vec + LSTM": text2vec_lstm_metrics,
    }
)

# Write to the working directory, leaving out the index column
comparison_table.to_csv('model_comparison.csv', index=False)