### Phase 1: Library import

In [1]:
# import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

# import dataset
from datasets import load_dataset

# import libraries to clean text
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


# import sklearn and transformers
# Traditional ML
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


# BERT-untrained
from transformers import DistilBertConfig
from transformers import DistilBertForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DistilBertTokenizerFast

# Bert-trained
from transformers import DistilBertForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DistilBertTokenizerFast

# Evaluation
from sklearn.metrics import classification_report


### Phase 2: Data Upload

In [3]:
dataset =load_dataset("imdb")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# take small subset to test models
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = dataset["test"].shuffle(seed=42).select(range(100))

# convert into dataframes for manipulation
small_train_df = pd.DataFrame(small_train_dataset)
small_eval_df = pd.DataFrame(small_eval_dataset)

small_train_df.rename(columns= {"text": "review", "label":"labels"})
small_eval_df.rename(columns= {"text": "review", "label":"labels"})

Unnamed: 0,review,labels
0,<br /><br />When I unsuspectedly rented A Thou...,1
1,This is the latest entry in the long series of...,1
2,This movie was so frustrating. Everything seem...,0
3,"I was truly and wonderfully surprised at ""O' B...",1
4,This movie spends most of its time preaching t...,0
...,...,...
95,I've seen some terrible book-to-film adaptatio...,0
96,I see that C. Thomas Howell has appeared in ma...,0
97,Wow! What a movie if you want to blow your bud...,0
98,There are few films that deal with things that...,1


### Phase 3: Preprocessing

In [None]:
# Traditional ML

# function to remove html tags
def remove_html_tags(review):
    clean_text = re.sub('<.*?>', '', review)
    return clean_text

# function to remove punctuation
def remove_punctuation(review):
    no_punc = review.translate(str.maketrans("","", string.punctuation))
    return no_punc


# function for TF_IDF
def TF_IDF(review):
    tokens = review.lower()
    vectorizer = TfidfVectorizer(max_features = 10000)
    return vectorizer.fit_transform(review)


# make a preprocessing function that cleans reviews and performs TF-IDF
def preprocessing(review):
    preprocess = review['review'].apply(lambda x: remove_html_tags(x))
    preprocess = preprocess.apply(lambda x: remove_punctuation(x))
    preprocess= preprocess.apply(lambda x: TF_IDF(x))
    return preprocess

In [16]:
result = preprocessing(small_train_df)
small_train_df['processed_review'] = result
small_train_df.head()

ValueError: Iterable over raw text documents expected, string object received.

### Phase4: Representation

In [None]:
# distilbert represenation via tokenization
def tokenization(data):
    return tokenizer(data["text"], padding = "max_length", truncation=True, max_length= 512)

### Phase 5: Trainer Setup

In [15]:
# untrained

training_arguments = TrainingArguments(
    output_dir = "../results_scratch",
    evaluation_strategy = "epoch",
    num_train_epochs = 3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    logging_dir = "./logs",
    logging_steps = 10,
)

trainer = Trainer(
    model = model, # model to be trained
    args = training_arguments, # trainer arguments
    train_dataset = small_train_dataset, # training set
    eval_dataset = small_eval_dataset # evaluation set
)

# Trained

training_arguments = TrainingArguments(
    output_dir = "../results_finetuned",
    evaluation_strategy = "epoch",
    num_train_epochs = 3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    logging_dir = "./logs",
    logging_steps = 10,
)

trainer = Trainer(
    model = model, # model to be trained
    args = training_arguments, # trainer arguments
    train_dataset = small_train_dataset, # training set
    eval_dataset = small_eval_dataset # evaluation set
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

### Phase 5: Model Construction

In [None]:
# Traditional ML
def trad_ml(X_train, y_train):
    model = LogisticRegression()
    model.fit(X_train,y_train)
    return model

# untrained model
def untrained_bert(df_train, df_test):
    config = DistilBertConfig(num_labels=2) # confiugre untrained model 2 labels
    model = DistilBertForSequenceClassification(config) # add configuration to untrained model
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") # Tokenization method

    def tokenization(dataset):
        return tokenizer(dataset["text"], padding = "max_length", truncation=True, max_length= 512)
    
    # map the tokenized datasets
    dataset_train = dataset_train.map(tokenization, batched=True)
    dataset_test = dataset_test.map(tokenization, batched=True)


    # setup the training Parameters
    training_arguments = TrainingArguments(
    output_dir = "./results_scratch",
    evaluation_strategy = "epoch",
    num_train_epochs = 3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    logging_dir = "./logs",
    logging_steps = 10,
    )

    # Create trainer object
    trainer = Trainer(
        model = model, # model to be trained
        args = training_arguments, # trainer arguments
        train_dataset = small_train_dataset, # training set
        eval_dataset = small_eval_dataset # evaluation set
    )

    trainer.train() # Execute training session(s)

    return model, tokenizer # return model and tokenized data for evaluation


    
# pretrained model
def pretrained_bert(dataset_train, dataset_test):
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

    def tokenization(dataset):
        return tokenizer(dataset["text"], padding = "max_length", truncation=True)
    
    # map the tokenized datasets
    dataset_train = dataset_train.map(tokenization, batched=True)
    dataset_test = dataset_test.map(tokenization, batched=True)
    
    # setup training Parameters
    training_arguments = TrainingArguments(
    output_dir = "./results_finetuned",
    evaluation_strategy = "epoch",
    num_train_epochs = 3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    logging_dir = "./logs",
    logging_steps = 10
    )

    trainer = Trainer(
        model = model, # model to be trained
        args = training_arguments, # trainer arguments
        train_dataset = small_train_dataset, # training set
        eval_dataset = small_eval_dataset # evaluation set
    )
    trainer.train() # Execute training session(s)
    return model, tokenizer # return model and tokenizer for evaluation

### Phase 6: Evaluation

In [None]:
# setup evaluation fucntion
def evaluate_model(model, X_test, y_test, model_type, tokenizer = none):
    if model_type == "tradditioanl":
        X = [preprocessing(x) for x in X_test]
        y_pred = model.preprocessing(X)
    else:
        inputs = tokenizer(list(X_test), padding = True, truncation=True, return_tensors="pt")
        outputs = model(**inputs)
        y_pred = np.argmax(outputs.logits.detach().numpy(), axis=1)
    
    print(classification_report(y_test,y_pred))


### Phase7: Execution

In [None]:
 # Train traditional model
model_trad = trad_ml(X_train, y_train)
evaluate_model(model_trad, X_test, y_test, model_type="traditional")

# Train DNN from scratch
model_scratch, tokenizer_scratch = untrained_bert(small_train_df, small_test_df)
evaluate_model(model_scratch, X_test, y_test, model_type="transformer", tokenizer=tokenizer_scratch)

# Train fine-tuned pretrained model
model_finetuned, tokenizer_finetuned = pretrained_bert(small_train_df, small_test_df)
evaluate_model(model_finetuned, X_test, y_test, model_type="transformer", tokenizer=tokenizer_finetuned)