# Assignment 0

    Author: Group F - Gaurav, Xiaowen Sun, Jheel Harnish Kamdar, Ruijia Xiong
    Created at: 02/15/2023

In [20]:
# Data processing and visualization
import pandas as pd
# Natural language processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

# Machine learning and deep learning libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


In [3]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [55]:
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/News_Category_Dataset_v3.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
# Assuming the dataset is in CSV format
df = pd.read_csv(file_path, index_col='Unnamed: 0')
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 209527 entries, 0 to 209526
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   headline                  209521 non-null  object
 1   category                  209527 non-null  object
 2   short_description         189815 non-null  object
 3   authors                   172109 non-null  object
 4   date                      209527 non-null  object
 5   headline_length           209527 non-null  int64 
 6   short_description_length  209527 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 12.8+ MB


In [57]:
df['category'] = df['category'].replace('PARENTS', 'PARENTING')
df['category'] = df['category'].replace('STYLE', 'STYLE & BEAUTY')
df['category'] = df['category'].replace('THE WORLDPOST', 'WORLDPOST')

df['category'] = df['category'].replace('ARTS', 'ARTS & CULTURE')
df['category'] = df['category'].replace('CULTURE & ARTS', 'ARTS & CULTURE')

In [58]:
df['category'].nunique()

37

 1m run for preprocess_text

In [59]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Check if the text is a string
    if not isinstance(text, str):
        return text

    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove single characters from the start
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    # Substitute multiple spaces with single space
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    # remove text in square brackets
    text = re.sub('\[.*?\]', '', text)
    # remove links
    text = re.sub('https?://\S+|www\.\S+', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # remove words containing numbers
    text = re.sub('\w*\d\w*', '', text)
    # Remove stopwords
    tokens = word_tokenize(text)
    filtered_text = ' '.join([word for word in tokens if word not in stop_words])
    return filtered_text


# Applying the preprocess function to each row of the headline and short description
df['cleaned_headline'] = df['headline'].apply(preprocess_text)
df['cleaned_short_description'] = df['short_description'].apply(preprocess_text)
df.head(5)

Unnamed: 0,headline,category,short_description,authors,date,headline_length,short_description_length,cleaned_headline,cleaned_short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,76,154,million americans roll sleeves omicron targete...,health experts said early predict whether dema...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,89,159,american airlines flyer charged banned life pu...,subdued passengers crew fled back aircraft con...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,69,64,funniest tweets cats dogs week sept,dog understand could eaten
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,56,159,funniest tweets parents week sept,accidentally put grown toothpaste toddler toot...
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,77,156,woman called cops black bird watcher loses law...,amy cooper accused investment firm franklin te...


In [60]:
# Combine cleaned headline and short description for vectorization
df['combined_text'] = df['cleaned_headline'] + ' ' + df['cleaned_short_description']
df['combined_text'] = df['combined_text'].fillna('')

#### transformer experiment

In [61]:
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(df['category'])

In [62]:
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

In [63]:
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [64]:
# Define the class for the BERT-based text classifier
class BertTextClassifier(nn.Module):
    def __init__(self, num_classes):
        super(BertTextClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(pooled_output)
        return self.out(output)


# Split the dataset into training, validation, and testing sets
# train_texts, test_texts, train_labels, test_labels = train_test_split(df['combined_text'], df['category'], test_size=0.2, random_state=42)
train_texts, test_texts, train_labels, test_labels = train_test_split(df['combined_text'], encoded_labels, test_size=0.2, random_state=42)

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1, random_state=42)

# Tokenize the text data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)

In [65]:
# Convert the tokenized data into PyTorch tensors
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)
test_dataset = NewsDataset(test_encodings, test_labels)

In [66]:
# Define the training parameters
BATCH_SIZE = 16
EPOCHS = 3

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE)

val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Initialize the BERT-based text classifier model
model = BertTextClassifier(num_classes=len(df['category'].unique())).to(device)

# Define the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Define the loss function
criterion = nn.CrossEntropyLoss()



In [67]:
# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = torch.tensor(batch['input_ids']).to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        print('input_ids dtype:', type(input_ids))
        print('attention_mask dtype:', type(attention_mask))
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)

        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_loader)

    # Validation loop
    model.eval()
    val_preds = []
    val_targets = []
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            val_preds.extend(predicted.cpu().numpy())
            val_targets.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(val_targets, val_preds)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Training Loss: {avg_train_loss}, Validation Accuracy: {val_accuracy}')




  input_ids = torch.tensor(batch['input_ids']).to(device)


input_ids dtype: <class 'torch.Tensor'>
attention_mask dtype: <class 'torch.Tensor'>


TypeError: dropout(): argument 'input' (position 1) must be Tensor, not str

In [None]:
# Testing loop
model.eval()
test_preds = []
test_targets = []
for batch in test_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        test_preds.extend(predicted.cpu().numpy())
        test_targets.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(test_targets, test_preds)
print(f'Test Accuracy: {test_accuracy}')