### Phase 1: Library import

In [None]:
# import basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random

# import dataset
from datasets import load_dataset

# import libraries to clean text
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


# import sklearn and transformers
# Traditional ML
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


# BERT-untrained
from transformers import DistilBertConfig
from transformers import DistilBertForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DistilBertTokenizerFast

# Bert-trained
from transformers import DistilBertForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DistilBertTokenizerFast

### Phase 2: Data Upload

In [2]:
# Load the IMDB review dataset; the result is keyed by split name.
dataset = load_dataset("imdb")
# Bare expression so the notebook echoes the dataset summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [3]:
# Work on a small, reproducible sample (seeded shuffle, first 100 rows of
# each split) so the models can be smoke-tested quickly.
small_train_dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(100))
)
small_eval_dataset = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(100))
)

In [None]:
# visualization of the class balance and the review length distribution
def plot_eda(review):
    """Plot the class balance and the review-length distribution of *review*.

    Two figures are shown one after the other with ``plt.show()``:
    a count plot of the ``label`` column and a 50-bin histogram of the
    number of whitespace-separated words in each ``text`` entry.

    Args:
        review: Tabular data with ``'label'`` and ``'text'`` columns.
            NOTE(review): ``sns.countplot(data=...)`` presumably needs a
            pandas DataFrame here (e.g. ``small_train_dataset.to_pandas()``)
            -- confirm against the caller.

    Returns:
        None. The input is not modified.
    """
    # Class distribution
    sns.countplot(x='label', data=review)
    plt.title('Class Distribution')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.show()

    # Review length distribution.
    # Lengths are computed from the argument (not from a global dataset) and
    # kept in a local list so the caller's data is left untouched. Words are
    # counted, not characters, to match the 'Number of Words' axis label.
    text_lengths = [len(text.split()) for text in review['text']]
    sns.histplot(text_lengths, bins=50)
    plt.title('Review Length Distribution')
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.show()

### Phase 3: Preprocessing

In [None]:
# Traditional ML

# function to remove html tags
def remove_html_tags(review):
    """Return *review* with every ``<...>`` span deleted.

    The match is non-greedy and does not cross line breaks; removed tags are
    replaced by nothing, so the text on either side is joined directly.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', review)

# function to remove punctuation
def remove_punctuation(review):
    """Return *review* with every character of ``string.punctuation`` removed."""
    # Translation table that maps each ASCII punctuation character to "delete".
    strip_table = str.maketrans("", "", string.punctuation)
    stripped = review.translate(strip_table)
    return stripped


# function for TF_IDF
def TF_IDF(review):
    """Fit a TF-IDF vectorizer on a corpus and return its document-term matrix.

    Args:
        review: An iterable of document strings (list, pandas Series, ...).
            A single string is accepted and treated as a one-document corpus.

    Returns:
        The result of ``TfidfVectorizer.fit_transform``: one row per
        document, at most 10000 feature columns.

    NOTE: the fitted vectorizer is local to this call and is not returned,
    so a second call builds an unrelated vocabulary.
    """
    # TfidfVectorizer expects an iterable of documents and rejects a bare
    # string, so wrap a lone string into a one-element corpus.
    documents = [review] if isinstance(review, str) else review
    # No manual lower-casing: TfidfVectorizer lower-cases by default.
    vectorizer = TfidfVectorizer(max_features=10000)
    return vectorizer.fit_transform(documents)


# make a preprocessing function that cleans reviews and performs TF-IDF
def preprocessing(review):
    """Clean every review text and vectorize the whole corpus with TF-IDF.

    Args:
        review: Object whose ``review['text']`` is an iterable of raw review
            strings (e.g. a pandas DataFrame column).

    Returns:
        The TF-IDF matrix produced by ``TF_IDF`` for the cleaned texts, one
        row per review, in the same order as ``review['text']``.
    """
    # Strip HTML first, then punctuation: tag removal relies on '<' and '>'.
    cleaned = [
        remove_punctuation(remove_html_tags(text))
        for text in review['text']
    ]
    # Vectorize the corpus as a whole; TF-IDF weights are only meaningful
    # when computed across documents, not one review at a time.
    return TF_IDF(cleaned)

In [None]:

# BERT-untrained preprocessing
def tokenize_untrained(examples=None):
    """Tokenize review text for the from-scratch DistilBERT model.

    Args:
        examples: Optional object with a ``'text'`` entry holding the strings
            to tokenize (for instance a batch passed in by
            ``Dataset.map(..., batched=True)``). When omitted, the
            module-level ``small_train_dataset`` is tokenized, which is the
            original behaviour.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the texts, padded
        and truncated to 512 tokens.

    NOTE(review): relies on a module-level ``tokenizer`` that is not defined
    in the visible part of the notebook -- confirm it exists before calling.
    """
    source = small_train_dataset if examples is None else examples
    return tokenizer(
        source['text'],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# DistilBERT-trained preprocessing
def tokenize_trained(examples=None):
    """Tokenize review text for the pretrained (fine-tuned) DistilBERT model.

    Args:
        examples: Optional object with a ``'text'`` entry holding the strings
            to tokenize (for instance a batch passed in by
            ``Dataset.map(..., batched=True)``). When omitted, the
            module-level ``small_train_dataset`` is tokenized, which is the
            original behaviour.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the texts, padded
        and truncated to 512 tokens.

    NOTE(review): relies on a module-level ``tokenizer`` that is not defined
    in the visible part of the notebook -- confirm it exists before calling.
    """
    source = small_train_dataset if examples is None else examples
    return tokenizer(
        source['text'],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

### Phase 4: Trainer Setup

In [None]:
# untrained

# Training configuration for the model trained from scratch: evaluate once
# per epoch, 3 epochs, batches of 16, log every 10 steps.
training_arguments = TrainingArguments(
    output_dir="./results_scratch",  # comma was missing here (SyntaxError)
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10,
)

# NOTE(review): `model` is not assigned anywhere in the visible part of the
# notebook before this cell, and the datasets passed here are the raw text
# subsets (no tokenization applied) -- confirm both before training.
trainer = Trainer(
    model=model,  # model to be trained
    args=training_arguments,  # trainer arguments
    train_dataset=small_train_dataset,  # training set
    eval_dataset=small_eval_dataset,  # evaluation set
)

# Trained

# Training configuration for the fine-tuned model: evaluate once per epoch,
# 3 epochs, batches of 16, log every 10 steps.
# NOTE: this rebinds `training_arguments` and `trainer`; objects previously
# bound to these names are no longer reachable afterwards.
training_arguments = TrainingArguments(
    output_dir="./results_finetuned",  # comma was missing here (SyntaxError)
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=10,
)

# NOTE(review): `model` is not assigned anywhere in the visible part of the
# notebook before this cell, and the datasets passed here are the raw text
# subsets (no tokenization applied) -- confirm both before training.
trainer = Trainer(
    model=model,  # model to be trained
    args=training_arguments,  # trainer arguments
    train_dataset=small_train_dataset,  # training set
    eval_dataset=small_eval_dataset,  # evaluation set
)

### Phase 5: Model Construction

In [None]:
# Traditional ML
def trad_ML(X_train, y_train):
    """Fit a default-configured LogisticRegression and return the estimator.

    Args:
        X_train: Training features passed straight to ``fit``.
        y_train: Training targets passed straight to ``fit``.
    """
    # `fit` returns the estimator itself, so the fitted model can be
    # returned in a single expression.
    return LogisticRegression().fit(X_train, y_train)

# untrained model
def untrained_bert(dataset_train, dataset_test):
    """Train a randomly initialised DistilBERT classifier on the given data.

    The model is built from a bare ``DistilBertConfig`` (two labels), so its
    weights are not pretrained; only the tokenizer vocabulary comes from the
    ``distilbert-base-uncased`` checkpoint.

    Args:
        dataset_train: Training data with a ``"text"`` column. It must
            support ``.map(fn, batched=True)`` -- presumably a Hugging Face
            ``Dataset`` that also carries the labels; confirm with caller.
        dataset_test: Evaluation data with the same structure.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been trained in place
        by the ``Trainer``.
    """
    config = DistilBertConfig(num_labels=2)  # configure untrained model with 2 labels
    model = DistilBertForSequenceClassification(config)  # randomly initialised weights
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")  # tokenization method

    def tokenization(batch):
        return tokenizer(batch["text"], padding="max_length", truncation=True)

    # Tokenize the datasets that were passed in (the helper above was
    # previously defined but never applied).
    tokenized_train = dataset_train.map(tokenization, batched=True)
    tokenized_test = dataset_test.map(tokenization, batched=True)

    # setup the training parameters
    training_arguments = TrainingArguments(
        output_dir="./results_scratch",  # comma was missing here (SyntaxError)
        evaluation_strategy="epoch",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_dir="./logs",
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,  # model to be trained
        args=training_arguments,  # trainer arguments
        train_dataset=tokenized_train,  # training set
        eval_dataset=tokenized_test,  # evaluation set
    )
    # Run the training loop; without it the Trainer above had no effect and
    # the returned model was still at its random initialisation.
    trainer.train()

    return model, tokenizer


    
# pretrained model
def trained_bert(dataset_train, dataset_test):
    """Fine-tune the pretrained ``distilbert-base-uncased`` classifier.

    Args:
        dataset_train: Training data with a ``"text"`` column. It must
            support ``.map(fn, batched=True)`` -- presumably a Hugging Face
            ``Dataset`` that also carries the labels; confirm with caller.
        dataset_test: Evaluation data with the same structure.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been fine-tuned in
        place by the ``Trainer``.
    """
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=2
    )

    def tokenization(batch):
        return tokenizer(batch["text"], padding="max_length", truncation=True)

    # The Trainer needs token ids, not raw text, so tokenize the datasets
    # that were passed in instead of using the global raw subsets.
    tokenized_train = dataset_train.map(tokenization, batched=True)
    tokenized_test = dataset_test.map(tokenization, batched=True)

    # setup training parameters
    training_arguments = TrainingArguments(
        output_dir="./results_finetuned",  # comma was missing here (SyntaxError)
        evaluation_strategy="epoch",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_dir="./logs",
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,  # model to be trained
        args=training_arguments,  # trainer arguments
        train_dataset=tokenized_train,  # training set
        eval_dataset=tokenized_test,  # evaluation set
    )
    # Run the fine-tuning loop; without it the Trainer above had no effect
    # and the returned model was the unmodified checkpoint.
    trainer.train()

    return model, tokenizer



### Phase 6: Evaluation