In [1]:
#Task 1 Implementing a Transformer Encoder
import numpy as np
import os
import pandas as pd
import glob
import torch
import torch.nn as nn

In [2]:
#Reading in the AG News Dataset
#Sort the matches: glob returns paths in arbitrary order, and the row order of
#the combined frame decides which rows a seeded train/test split selects
ag_news = sorted(glob.glob('./archive-2/*.csv'))
#Fail early with a clear message rather than pandas' "No objects to concatenate"
if not ag_news:
    raise FileNotFoundError("No CSV files found matching './archive-2/*.csv'")
#Make an empty list
df_list = []
#Convert the files into pandas dataframes
for file in ag_news:
    df = pd.read_csv(file)
    #Append the dataframe to the list
    df_list.append(df)

#Use pandas concat function to make all the dataframes into one
#(ignore_index=True gives the combined frame a fresh 0..n-1 index)
ag_news_pd = pd.concat(df_list,ignore_index=True)

#Sanity Check
print(ag_news_pd.head())
print(ag_news_pd.columns)
    


   Class Index                                              Title  \
0            3                  Fears for T N pension after talks   
1            4  The Race is On: Second Private Team Sets Launc...   
2            4      Ky. Company Wins Grant to Study Peptides (AP)   
3            4      Prediction Unit Helps Forecast Wildfires (AP)   
4            4        Calif. Aims to Limit Farm-Related Smog (AP)   

                                         Description  
0  Unions representing workers at Turner   Newall...  
1  SPACE.com - TORONTO, Canada -- A second\team o...  
2  AP - A company founded by a chemistry research...  
3  AP - It's barely dawn when Mike Fitzpatrick st...  
4  AP - Southern California's smog-fighting agenc...  
Index(['Class Index', 'Title', 'Description'], dtype='object')


In [None]:
#Join the Title and Description columns into a single 'Text' column,
#separated by one space
ag_news_pd['Text'] = ag_news_pd['Title'].astype(str).str.cat(
    ag_news_pd['Description'].astype(str), sep=" "
)

In [None]:
import nltk
from nltk.tokenize import word_tokenize

#Tokenizer applied to each row of the dataset
def word_tokenizer(text):
    """Return the tokens produced by nltk's ``word_tokenize`` for ``text``."""
    return word_tokenize(text)


def pad_truncate(text,max_sentence_length=128, pad_token="<pad>"):
    """Force a token list to exactly ``max_sentence_length`` entries.

    Args:
        text (list): Tokens of one sentence.
        max_sentence_length (int): Target length of the returned list.
        pad_token (str): Filler appended when the sentence is too short.

    Returns:
        list: A new list; longer inputs are cut at ``max_sentence_length``,
        shorter ones are right-padded with ``pad_token``.
    """
    #Cut only when the sentence is longer than the limit
    clipped = text[:max_sentence_length] if len(text) > max_sentence_length else text
    #Number of pad tokens still needed to reach the target length
    missing = max_sentence_length - len(clipped)
    return clipped + [pad_token] * missing

#Builds the padding mask for one padded sentence
def create_attention_mask(sentence, pad_token="<pad>"):
    """Mark which positions of ``sentence`` hold real tokens.

    Args:
        sentence (list): Tokens of one (padded) sentence.
        pad_token (str): The token that marks padding.

    Returns:
        list: One int per token: 0 where the token equals ``pad_token``,
        1 everywhere else.
    """
    #This is a padding mask: it depends only on whether a token is padding
    return [0 if token == pad_token else 1 for token in sentence]

#Tokenize every row, then pad/truncate each token list to a fixed length
ag_news_pd['Text'] = ag_news_pd['Text'].apply(word_tokenizer).apply(pad_truncate)

#With the token lists padded, derive the attention mask for each row
ag_news_pd['Attention Mask'] = ag_news_pd['Text'].apply(create_attention_mask)


<class 'pandas.core.frame.DataFrame'>


In [None]:
#Build vocabulary from padded texts (remove <pad> tokens)
from collections import Counter
all_tokens = []
#Go through every token list in the Text column
for token_list in ag_news_pd['Text']:
    # Filter out <pad> tokens for vocabulary building
    all_tokens.extend(t for t in token_list if t != '<pad>')

#Counter keeps its keys in first-seen order, so the IDs assigned below
#follow the order in which tokens first appear in the corpus
counter = Counter(all_tokens)

# Create vocabulary (Necessary for Transformer) with special tokens first
# 0 for the pad token, 1 for unknown tokens, and 2 for Classification token
vocab = {'<pad>': 0, '<unk>': 1, '[CLS]': 2}
#Regular tokens start right after the reserved special-token IDs
current_id = len(vocab)
#Add tokens to the vocab
for token in counter:
    #Never let a corpus token that happens to equal a special token
    #overwrite that special token's reserved ID
    if token in vocab:
        continue
    vocab[token] = current_id
    current_id += 1

#Convert the string tokens to IDs for the Transformer Encoder
def convert_to_ids(token_list):
    """Translate a list of string tokens into integer IDs.

    Looks tokens up in the module-level ``vocab`` dict.

    Args:
        token_list (list): String tokens of one sentence.

    Returns:
        list: One ID per token: 2 for ``'[CLS]'``, the ``vocab`` entry for
        known tokens, and 1 (``<unk>``) for anything not in ``vocab``.
    """
    return [
        2 if token == '[CLS]' else vocab.get(token, 1)  # 1 is <unk>
        for token in token_list
    ]

#Add an 'Input IDs' column holding the ID list built from each row of the Text column
ag_news_pd['Input IDs'] = ag_news_pd['Text'].map(convert_to_ids)

In [None]:
#Preview the first three rows to check what the dataframe looks like after preprocessing
ag_news_pd.head(3)

In [None]:
#Implementing an encoder
from torch.optim import Adam #importing the adam optimizer

class TransformerEncoder(nn.Module):
    """Transformer encoder classifier over padded token-ID sequences.

    Pipeline: token embedding (scaled by sqrt(d_model)) + fixed sinusoidal
    positional encoding -> a stack of ``nn.TransformerEncoderLayer`` ->
    mean over the non-padding positions -> linear classification head.

    Args:
        vocab_size (int): Number of rows in the token embedding table.
        num_classes (int): Size of the output logit vector.
        d_model (int): Embedding / hidden width; must be divisible by ``num_heads``.
        num_heads (int): Attention heads per encoder layer.
        num_layers (int): Number of stacked encoder layers.
        max_len (int): Longest sequence length the positional table supports.
        dropout (float): Dropout rate for the embeddings and encoder layers.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, vocab_size, num_classes = 4, d_model = 256, num_heads = 8, num_layers = 4, max_len = 128, dropout = 0.1):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        self.d_model = d_model
        self.max_len = max_len

        #Token Embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model)

        #Positional Encoding: fixed sin/cos table, stored as a buffer so it
        #follows the module across devices but is not a trainable parameter
        positions = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, d_model, 2, dtype=torch.float32) / d_model))
        angles = positions * inv_freq
        positional_encoding = torch.zeros(max_len, d_model)
        positional_encoding[:, 0::2] = torch.sin(angles)
        #For an odd d_model there is one fewer cosine column than sine column
        positional_encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("positional_encoding", positional_encoding)

        self.input_dropout = nn.Dropout(dropout)

        #Encoder stack (batch_first so tensors are (batch, seq, d_model))
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=num_heads,
            dim_feedforward=4 * d_model,
            dropout=dropout,
            batch_first=True,
        )
        #Nested-tensor conversion is disabled so the output keeps the full
        #padded sequence length, which the masked pooling in forward() relies on
        self.encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers, enable_nested_tensor=False
        )

        #Classification head
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Compute class logits for a batch of token-ID sequences.

        Args:
            input_ids (torch.Tensor): Integer tensor of shape (batch, seq_len).
            attention_mask (torch.Tensor, optional): Same shape as ``input_ids``;
                nonzero marks a real token, 0 marks padding. When omitted,
                every position is treated as a real token.

        Returns:
            torch.Tensor: Logits of shape (batch, num_classes).

        Raises:
            ValueError: If ``input_ids`` is not 2-D, is longer than ``max_len``,
                or ``attention_mask`` has a different shape.
        """
        if input_ids.dim() != 2:
            raise ValueError(
                f"input_ids must be 2-D (batch, seq_len), got shape {tuple(input_ids.shape)}"
            )
        seq_len = input_ids.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if attention_mask.shape != input_ids.shape:
            raise ValueError("attention_mask must have the same shape as input_ids")

        keep = attention_mask.to(torch.bool)

        hidden = self.embedding(input_ids) * (self.d_model ** 0.5)
        hidden = hidden + self.positional_encoding[:seq_len]
        hidden = self.input_dropout(hidden)

        #src_key_padding_mask uses the opposite convention: True = ignore
        hidden = self.encoder(hidden, src_key_padding_mask=~keep)

        #Masked mean pooling: zero the padded positions (masked_fill also
        #clears any NaN they may hold), then average over the real tokens only
        hidden = hidden.masked_fill(~keep.unsqueeze(-1), 0.0)
        token_counts = keep.sum(dim=1, keepdim=True).clamp(min=1).to(hidden.dtype)
        pooled = hidden.sum(dim=1) / token_counts

        logits = self.classifier(pooled)
        return logits
    
    
    

        

In [None]:
#Split the dataset and convert them into tensors for PyTorch
import sklearn
from sklearn.model_selection import train_test_split

def dataframe_to_tensors(X_df, y_series, label_offset=0):
    """
    This converts a pandas dataframe/series (of lists) to pytorch tensors

    Args:
        X_df (Pandas dataframe): Must contain the columns 'Input IDs' and
            'Attention Mask', each cell holding a list of ints. All lists
            must have the same length so they stack into a 2-D tensor.
        y_series (pandas Series): The category of each news item.
        label_offset (int): Value added to every label. The default of 0
            leaves labels unchanged; pass -1 to turn 1-based class indices
            into the 0-based indices that class-index losses such as
            nn.CrossEntropyLoss expect.

    Returns:
        tuple: (input_ids, attention_mask, labels), all torch.long tensors.

    Raises:
        ValueError: If the two feature tensors differ in shape or the number
            of labels differs from the number of rows.
    """
    input_ids = torch.tensor(X_df['Input IDs'].tolist(), dtype=torch.long)
    attention_mask = torch.tensor(X_df['Attention Mask'].tolist(), dtype=torch.long)
    labels = torch.tensor(y_series.tolist(), dtype=torch.long) + label_offset

    #Catch misaligned inputs here instead of deep inside the model
    if input_ids.shape != attention_mask.shape:
        raise ValueError(
            f"'Input IDs' shape {tuple(input_ids.shape)} does not match "
            f"'Attention Mask' shape {tuple(attention_mask.shape)}"
        )
    if labels.shape[0] != input_ids.shape[0]:
        raise ValueError(
            f"got {labels.shape[0]} labels for {input_ids.shape[0]} rows"
        )
    return input_ids, attention_mask, labels

#PyTorch needs tensors, so split the dataframe first and convert each part afterwards
#Features and target for a stratified 80-20 train/test split
#NOTE(review): 'Class Index' looks 1-based (values 3 and 4 show up in the head()
#output) - confirm, and shift to 0-based before using a class-index loss
X = ag_news_pd[["Attention Mask", "Input IDs"]]
y = ag_news_pd["Class Index"]
X_train_df, X_test_df, y_train_series, y_test_series = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

#Convert both splits to Torch tensors
train_input_ids, train_attention_mask, train_labels = dataframe_to_tensors(
    X_train_df, y_train_series
)
test_input_ids, test_attention_mask, test_labels = dataframe_to_tensors(
    X_test_df, y_test_series
)

In [None]:
#Train the encoder on the training tensors

In [None]:
#Model's Evaluation

from sklearn import metrics

#Print out the model's accuracy, precision, recall, and f1-score

#Create confusion matrix from sklearn metrics

In [None]:
#Task 2 Setting Up a Transformer Decoder