In [None]:
# pip installations
#!pip install transformers torch
#!pip install --upgrade datasets

In [None]:
# import libraries

# import dataset
from datasets import load_dataset
from datasets import Dataset

# import the essentials
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# import libraries to clean text
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


# import sklearn and transformers
# Traditional ML
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


# BERT-untrained
from transformers import DistilBertConfig
from transformers import DistilBertForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DistilBertTokenizerFast

# Bert-trained
from transformers import DistilBertForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from transformers import DistilBertTokenizerFast

# Evaluation
from sklearn.metrics import classification_report
#from evaluation import evaluate_model

# Authenticate with the Hugging Face Hub.
# The Colab secret 'LHL_NLP' (presumably a Hugging Face access token -- confirm)
# is bound to a variable instead of being left as a bare expression, so its
# value can never be echoed into the saved cell output.
from google.colab import userdata
from huggingface_hub import login, notebook_login

hf_token = userdata.get('LHL_NLP')
if hf_token:
    # Non-interactive login keeps "Restart Kernel -> Run All" working.
    login(token=hf_token)
else:
    # Fall back to the interactive widget when the secret is empty.
    notebook_login()

'hf_<REDACTED>'

Load dataset

In [None]:
# Load the IMDB review dataset; the returned DatasetDict holds the
# 'train', 'test' and 'unsupervised' splits shown in the cell output.
dataset =load_dataset("imdb")
# Bare expression so the split summary is rendered as the cell output.
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
# Convert the Hugging Face splits into pandas DataFrames.
df_train = pd.DataFrame(dataset["train"])
df_test = pd.DataFrame(dataset["test"])

# Stack train on top of test. ignore_index=True gives the combined frame a
# unique 0..N-1 index; without it both halves keep their own row labels, so a
# label-based lookup (df_total.loc[i]) would silently match two rows.
df_total = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_total.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


Classic Machine Learning

In [None]:
# Preprocessing for machine learning

# function to remove html tags
def remove_html_tags(df):
    """Strip HTML tags from a single review string.

    Each tag is replaced by a space rather than deleted outright. Deleting it
    would glue the words on either side together whenever the tag was the only
    separator (e.g. "good.<br />Bad" -> "good.Bad"), which corrupts the tokens
    later fed to TF-IDF.

    Args:
        df: The raw review text. Despite the name this is a ``str`` (it is
            passed to ``re.sub``), not a DataFrame.

    Returns:
        The text with every ``<...>`` span replaced by one space.
    """
    clean_text = re.sub('<.*?>', ' ', df)
    return clean_text

# function to remove punctuation
def remove_punctuation(df):
    """Return the given string with every ASCII punctuation character removed.

    Args:
        df: The text to clean. Despite the name this is a ``str``.

    Returns:
        A copy of the text without any character from ``string.punctuation``.
    """
    # Mapping a code point to None tells str.translate to delete that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return df.translate(deletion_table)


# function for TF_IDF
def TF_IDF(df, vectorizer=None):
    """Turn a Series of cleaned review strings into a TF-IDF matrix.

    Args:
        df: pandas Series of strings (the name is historical). The text is
            lower-cased before it is vectorised.
        vectorizer: Optional vectorizer to reuse. Pass the same instance for the
            training data and for the evaluation data so both are encoded with
            one vocabulary: an instance that already has a ``vocabulary_``
            attribute is only applied (``transform``), an unfitted one is fitted
            first (``fit_transform``). When omitted, a new
            ``TfidfVectorizer(max_features=10000)`` is fitted on ``df``, which
            is the original behaviour.

    Returns:
        Whatever the vectorizer's ``fit_transform`` / ``transform`` returns.
    """
    tokens = df.str.lower()
    if vectorizer is None:
        vectorizer = TfidfVectorizer(max_features = 10000)
    # `vocabulary_` only exists after fitting, so it marks a reusable vectorizer.
    if hasattr(vectorizer, "vocabulary_"):
        return vectorizer.transform(tokens)
    return vectorizer.fit_transform(tokens)


# make a preprocessing function that cleans reviews and performs TF-IDF
def preprocessing(df, vectorizer=None):
    """Clean the 'text' column of ``df`` and encode it with TF-IDF.

    Args:
        df: DataFrame with a 'text' column of raw review strings.
        vectorizer: Optional vectorizer forwarded to ``TF_IDF``; reuse the one
            that encoded the training data when encoding evaluation data.

    Returns:
        The TF-IDF matrix produced by ``TF_IDF`` for the cleaned reviews.
    """
    cleaned = df['text'].apply(remove_html_tags).apply(remove_punctuation)
    return TF_IDF(cleaned, vectorizer)

In [None]:
# Encode every review (train rows first, then test rows) as TF-IDF features.
# NOTE(review): df_total stacks train and test, and TF_IDF fits its vectorizer
# on whatever it receives, so the vocabulary and IDF weights here are learned
# from the test reviews as well as the training reviews.
dataset_TF_IDF  = preprocessing(df_total)

In [None]:
# Sanity check: (number of reviews, number of TF-IDF features).
dataset_TF_IDF.shape

(50000, 10000)

In [None]:
# Split the TF-IDF matrix back into its train and test rows. df_total was built
# as [train, test], so the first len(df_train) rows are the training reviews;
# deriving the boundary from the data avoids a hard-coded row count.
n_train_rows = len(df_train)
dataset_train = dataset_TF_IDF[:n_train_rows]
dataset_test = dataset_TF_IDF[n_train_rows:]
assert dataset_test.shape[0] == len(df_test), "train/test split is misaligned"

In [None]:
# Traditional ML
def trad_ml(X_train, y_train):
    """Fit a logistic-regression classifier with default settings.

    Args:
        X_train: Feature matrix, one row per training example.
        y_train: Target labels aligned with the rows of ``X_train``.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    classifier = LogisticRegression()
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return classifier.fit(X_train, y_train)


In [None]:
# traditional machine learning via logistic regression
# Keep the TF-IDF features sparse: LogisticRegression accepts sparse input,
# whereas densifying the 25000 x 10000 training matrix allocates roughly 2 GB
# of float64 for data that is almost entirely zeros.
X_train = dataset_train
# Labels for exactly the rows held in dataset_train (the leading rows of df_total).
y_train = df_total['label'].iloc[:dataset_train.shape[0]]
model_trad = trad_ml(X_train,y_train)

In [None]:
# setup evaluation function
def evaluate_model(model, df, model_type, tokenizer = None, features = None, batch_size = 16):
    """Print a classification report for ``model`` on the labelled frame ``df``.

    Args:
        model: Fitted estimator exposing ``predict`` when ``model_type`` is
            "traditional"; otherwise a model that is called with the tokenizer
            output and returns an object with ``.logits``.
        df: DataFrame with a 'text' column and a 'label' column.
        model_type: "traditional" selects the TF-IDF path; any other value
            selects the transformer path.
        tokenizer: Tokenizer for the transformer path (required there).
        features: Optional precomputed feature matrix for the traditional path,
            one row per row of ``df``, encoded with the SAME fitted vectorizer
            as the training data. When omitted the function falls back to
            ``preprocessing(df)``.
        batch_size: Number of texts per forward pass on the transformer path.

    Raises:
        ValueError: If the transformer path is selected without a tokenizer.
    """
    X = df['text']
    y = df['label']
    if model_type == "traditional":
        if features is None:
            # Legacy fallback. NOTE(review): preprocessing(df) with no shared
            # vectorizer fits a fresh TF-IDF vocabulary on `df`, so its columns
            # need not line up with the features the model was trained on.
            # Prefer passing `features`.
            features = preprocessing(df)
        y_pred = model.predict(features)
    else:
        import torch  # local import: only the transformer path needs it

        if tokenizer is None:
            raise ValueError("a tokenizer is required when model_type is not 'traditional'")

        texts = list(X)
        model.eval()  # disable dropout for inference
        device = next(model.parameters()).device  # model may sit on GPU after training
        batch_predictions = []
        # Batching plus no_grad keeps memory bounded; a single forward pass over
        # the whole frame with autograd enabled does not scale past tiny samples.
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                inputs = tokenizer(texts[start:start + batch_size], padding = True, truncation=True, return_tensors="pt")
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
                outputs = model(**inputs)
                batch_predictions.append(outputs.logits.argmax(dim=-1).cpu().numpy())
        y_pred = np.concatenate(batch_predictions) if batch_predictions else np.array([], dtype=int)

    print(classification_report(y, y_pred))

In [None]:
# evaluate model
# Score the test rows of the TF-IDF matrix that was built together with the
# training rows (dataset_test), so the feature columns line up with what
# model_trad was trained on. Calling evaluate_model(..., "traditional") on the
# raw test frame runs preprocessing() again, which fits a new TF-IDF vocabulary
# on the test reviews alone; its columns do not correspond to the training
# features and the report degrades to roughly chance level.
y_test = df_total['label'].iloc[dataset_train.shape[0]:]
y_pred_trad = model_trad.predict(dataset_test)
print(classification_report(y_test, y_pred_trad))

              precision    recall  f1-score   support

           0       0.52      0.66      0.58     12500
           1       0.54      0.39      0.46     12500

    accuracy                           0.53     25000
   macro avg       0.53      0.53      0.52     25000
weighted avg       0.53      0.53      0.52     25000



In [None]:
# Draw a 100-example sample from the train and test splits (shuffle seed 42).
# NOTE(review): this cell is repeated verbatim further down, just before the
# transformer training cells.
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(100))
small_eval_dataset = dataset["test"].shuffle(seed=42).select(range(100))

# convert into dataframes for manipulation
small_train_df = pd.DataFrame(small_train_dataset)
small_eval_df = pd.DataFrame(small_eval_dataset)

LLM Models

In [None]:
# shared training routine for both DistilBERT variants
def _train_distilbert(model, tokenizer, df_train, df_test, repo_name, push_to_hub):
    """Tokenize both frames, train ``model`` with the HF Trainer, optionally push it.

    Args:
        model: Sequence-classification model to train (modified in place).
        tokenizer: Tokenizer applied to the 'text' column of both frames.
        df_train: DataFrame with 'text' and 'label' columns used for training.
        df_test: DataFrame with 'text' and 'label' columns used for evaluation.
        repo_name: Trainer output directory; also used by the Trainer to name
            the Hub repository when pushing.
        push_to_hub: When True, upload the result to the Hugging Face Hub.

    Returns:
        ``(model, tokenizer)`` for later evaluation.
    """

    def tokenization(batch):
        # Pad/truncate every review to 512 tokens.
        return tokenizer(batch['text'], padding = "max_length", truncation=True, max_length= 512)

    # convert dataframes back into Hugging Face datasets
    dataset_train = Dataset.from_pandas(df_train)
    dataset_test = Dataset.from_pandas(df_test)

    # map the tokenizer over both datasets
    dataset_train_tokenized = dataset_train.map(tokenization, batched = True)
    dataset_test_tokenized = dataset_test.map(tokenization, batched = True)

    # setup the training parameters
    training_arguments = TrainingArguments(
        output_dir = repo_name,
        eval_strategy = "epoch",
        num_train_epochs = 3,
        per_device_train_batch_size = 16,
        per_device_eval_batch_size = 16,
        logging_dir = "./logs",
        logging_steps = 10,
    )

    trainer = Trainer(
        model = model, # model to be trained
        args = training_arguments, # trainer arguments
        train_dataset = dataset_train_tokenized, # training set
        eval_dataset = dataset_test_tokenized, # evaluation set
    )

    trainer.train() # execute the training session(s)
    trainer.evaluate() # evaluate on the evaluation set

    if push_to_hub:
        # Write the tokenizer files into the output directory so they are
        # uploaded together with the model; a repo without them cannot be
        # loaded by `pipeline(model=...)`.
        tokenizer.save_pretrained(training_arguments.output_dir)
        trainer.push_to_hub() # push to the Hugging Face Hub

    return model, tokenizer


# untrained model
def untrained_bert(df_train, df_test, repo_name = "IMDB_Sentiment_Analysis", push_to_hub = True, seed = 42):
    """Train a randomly initialised DistilBERT classifier (2 labels) from scratch.

    Args:
        df_train: DataFrame with 'text' and 'label' columns used for training.
        df_test: DataFrame with 'text' and 'label' columns used for evaluation.
        repo_name: Output directory / Hub repository name. ``pretrained_bert``
            has the same default, so with the defaults whichever function runs
            last overwrites the other's upload; pass distinct names to keep both.
        push_to_hub: Set to False to train without uploading anything.
        seed: Random seed applied before the weights are initialised.

    Returns:
        ``(model, tokenizer)`` for evaluation.
    """
    from transformers import set_seed  # local import: only needed here

    set_seed(seed)  # make the random weight initialisation reproducible
    config = DistilBertConfig(num_labels=2) # configure untrained model with 2 labels
    model = DistilBertForSequenceClassification(config) # random weights, no checkpoint
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased") # tokenization method
    return _train_distilbert(model, tokenizer, df_train, df_test, repo_name, push_to_hub)



# pretrained model
def pretrained_bert(df_train, df_test, repo_name = "IMDB_Sentiment_Analysis", push_to_hub = True, seed = 42):
    """Fine-tune the pretrained ``distilbert-base-uncased`` checkpoint (2 labels).

    Args:
        df_train: DataFrame with 'text' and 'label' columns used for training.
        df_test: DataFrame with 'text' and 'label' columns used for evaluation.
        repo_name: Output directory / Hub repository name. ``untrained_bert``
            has the same default, so with the defaults whichever function runs
            last overwrites the other's upload; pass distinct names to keep both.
        push_to_hub: Set to False to train without uploading anything.
        seed: Random seed applied before the model is instantiated.

    Returns:
        ``(model, tokenizer)`` for evaluation.
    """
    from transformers import set_seed  # local import: only needed here

    set_seed(seed)  # seed before the model (and its new classifier head) is created
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    return _train_distilbert(model, tokenizer, df_train, df_test, repo_name, push_to_hub)

In [None]:
# take small subset to test models
SUBSET_SIZE = 100  # examples drawn from each split; raise for a fuller run
SUBSET_SEED = 42   # fixed shuffle seed so the same examples are drawn every run

small_train_dataset = dataset["train"].shuffle(seed=SUBSET_SEED).select(range(SUBSET_SIZE))
small_eval_dataset = dataset["test"].shuffle(seed=SUBSET_SEED).select(range(SUBSET_SIZE))

# convert into dataframes for manipulation
small_train_df = pd.DataFrame(small_train_dataset)
small_eval_df = pd.DataFrame(small_eval_dataset)


In [None]:
# Train DNN from scratch
# Trains the randomly initialised DistilBERT on the small subset and keeps the
# returned model and tokenizer for the evaluation cell.
# NOTE(review): untrained_bert also calls trainer.push_to_hub(), so running
# this cell uploads to the Hugging Face Hub.
model_scratch, tokenizer_scratch = untrained_bert(small_train_df, small_eval_df)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfmanuana[0m ([33mfmanuana-lighthouse-labs[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.767317
2,0.887900,0.707218
3,0.723800,0.691291


In [None]:
# evaluate DNN from scratch
# NOTE(review): scored on small_eval_df, the same frame that was passed to
# untrained_bert as its evaluation set during training.
evaluate_model(model_scratch, small_eval_df, model_type="transformer", tokenizer= tokenizer_scratch)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.53      1.00      0.69        53
           1       0.00      0.00      0.00        47

    accuracy                           0.53       100
   macro avg       0.27      0.50      0.35       100
weighted avg       0.28      0.53      0.37       100



In [None]:
# Train fine-tuned pretrained model
# Fine-tunes the distilbert-base-uncased checkpoint on the small subset and
# keeps the returned model and tokenizer for the evaluation cell.
# NOTE(review): pretrained_bert pushes to the same "IMDB_Sentiment_Analysis"
# repository name that untrained_bert uses.
model_finetuned, tokenizer_finetuned = pretrained_bert(small_train_df, small_eval_df)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,0.683446
2,0.688700,0.663391
3,0.630500,0.642359


In [None]:
# evaluate the fine-tuned pretrained model on the small evaluation subset
evaluate_model(model_finetuned, small_eval_df, model_type="transformer", tokenizer=tokenizer_finetuned)

              precision    recall  f1-score   support

           0       0.72      0.92      0.81        53
           1       0.88      0.60      0.71        47

    accuracy                           0.77       100
   macro avg       0.80      0.76      0.76       100
weighted avg       0.79      0.77      0.76       100



In [None]:
# allow anyone to input data into the pipeline
from transformers import pipeline

# Replace these example reviews with the texts you want to classify.
data = [
    "This movie was an absolute delight from start to finish.",
    "A dull, overlong film with wooden acting.",
]

# Load the published model from the Hugging Face Hub as a text classifier.
my_model = pipeline("text-classification", model="FranklinManuana/IMDB_Sentiment_Analysis")
# truncation=True keeps reviews longer than the model's input limit from failing.
my_model(data, truncation=True)