In [None]:
# !pip install transformers datasets pytorch-lightning

In [None]:
import torch

# Run on the first CUDA GPU when one is visible to torch; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
import os
import tarfile

from datasets import load_dataset

import torch
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [None]:
# Gzipped IMDB archive, expected to already sit in the working directory.
target = "aclImdb_v1.tar.gz"

with tarfile.open(target, "r:gz") as tar:
    # A bare extractall() trusts every member path in the archive, so a crafted
    # tarball could write outside the working directory. The "data" filter
    # rejects absolute paths, parent-directory escapes and special files.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(filter="data")
    else:
        # Interpreter predates extraction filters; fall back to the old behaviour.
        tar.extractall()

In [None]:
# Downloading is left disabled; the previous cell unpacks a local copy of the archive instead.
# download_dataset()

# NOTE(review): both helpers live in local_dataset_utilities and are not visible here.
# Presumably the first builds one DataFrame of all reviews and the second writes the
# train.csv / val.csv / test.csv files that later cells read -- confirm against that module.
df = load_dataset_into_to_dataframe()
partition_dataset(df)

100%|██████████| 50000/50000 [01:02<00:00, 806.32it/s]


Class distribution:


In [None]:
# Read the three split files (paths are relative to the working directory) into DataFrames.
df_train, df_val, df_test = map(pd.read_csv, ("train.csv", "val.csv", "test.csv"))

In [None]:
# Load the same three CSV files as a DatasetDict keyed by split name.
imdb_dataset = load_dataset(
    "csv",
    data_files=dict(train="train.csv", validation="val.csv", test="test.csv"),
)

print(imdb_dataset)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-bb0f0672e6e8abb7/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-bb0f0672e6e8abb7/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


In [None]:
imdb_dataset["train"]

Dataset({
    features: ['index', 'text', 'label'],
    num_rows: 35000
})

In [None]:
from transformers import AutoTokenizer

# Tokenizer for the "distilbert-base-uncased" checkpoint; the download progress shown
# below appears when the files are not cached yet.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
print("Tokenizer max length : ", tokenizer.model_max_length)
print("Tokenizer max vocab : ", tokenizer.vocab_size)

Tokenizer max length :  512
Tokenizer max vocab :  30522


In [None]:
tokenizer(["Hi how are you", "Bye guys"], truncation=True)

{'input_ids': [[101, 7632, 2129, 2024, 2017, 102], [101, 9061, 4364, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1]]}

In [None]:
tokenizer.decode([101])

'[CLS]'

In [None]:
def tokenize_text(batch):
    """Tokenize the "text" entries of ``batch`` with the notebook-level ``tokenizer``.

    Sequences are truncated to the tokenizer's maximum length and padded to the
    longest sequence in the batch. Returns the tokenizer's output unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# batch_size=None hands each split to tokenize_text as a single batch, so padding=True
# pads every example of a split to that split's longest (truncated) sequence.
imdb_tokenized = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
del imdb_dataset

In [None]:
imdb_tokenized

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [None]:
from transformers import AutoModel

# Load the pretrained encoder (same checkpoint name as the tokenizer) and move it to `device`.
# model.to(device) is the cell's last expression, so the notebook also displays the module.
model = AutoModel.from_pretrained('distilbert-base-uncased')
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [None]:
# From here on, indexing the tokenized splits yields torch tensors for these three columns only.
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Smoke test: run the first training example through the encoder.
first_example = imdb_tokenized["train"][:1]
test_batch = {
    key: first_example[key].to(device)
    for key in ("attention_mask", "input_ids")
}

# No gradients are needed for feature extraction.
with torch.inference_mode():
    test_output = model(**test_batch)

# Shape of the final-layer hidden states for this one-example batch.
test_output.last_hidden_state.shape

torch.Size([1, 512, 768])

In [None]:
test_output.last_hidden_state

tensor([[[ 0.1227, -0.2231, -0.0664,  ..., -0.0066,  0.5479,  0.3654],
         [-0.3743, -0.1010,  0.2460,  ..., -0.2892,  0.1966,  0.3806],
         [ 0.2883, -0.2648, -0.0105,  ..., -0.2062,  0.8071, -0.4093],
         ...,
         [ 0.3408, -0.2641,  0.3494,  ...,  0.0966,  0.1805, -0.4711],
         [ 0.2999, -0.4438,  0.3791,  ...,  0.1629,  0.1077, -0.5215],
         [ 0.1003, -0.2132,  0.2327,  ...,  0.3833,  0.0850, -0.3468]]])

In [None]:
test_output.last_hidden_state[:, 0].shape

torch.Size([1, 768])

In [None]:
@torch.inference_mode()
def get_output_embeddings(batch):
    """Embed a tokenized batch with the notebook-level ``model``.

    Moves ``batch["input_ids"]`` and ``batch["attention_mask"]`` to ``device``,
    runs the encoder, and keeps the hidden state at sequence position 0 of each
    example (the position of the leading ``[CLS]`` token).

    Returns a dict with one key, ``"features"``, holding those vectors as a
    NumPy array on the CPU.
    """
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    hidden_states = model(input_ids, attention_mask=attention_mask).last_hidden_state
    first_token_states = hidden_states[:, 0]

    return {"features": first_token_states.cpu().numpy()}

In [None]:
import time

# Embed every split in mini-batches of 10 examples and report the wall-clock cost.
start = time.time()
imdb_features = imdb_tokenized.map(
    get_output_embeddings,
    batched=True,
    batch_size=10,
)
elapsed = time.time() - start

print("Time Taken : ", elapsed)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Time Taken :  779.4998204708099


In [None]:
imdb_features

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask', 'features'],
        num_rows: 35000
    })
    validation: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask', 'features'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask', 'features'],
        num_rows: 10000
    })
})

In [None]:
def _features_and_labels(split):
    """Return the "features" and "label" columns of one ``imdb_features`` split as NumPy arrays."""
    subset = imdb_features[split]
    return np.array(subset["features"]), np.array(subset["label"])


X_train, y_train = _features_and_labels("train")
X_val, y_val = _features_and_labels("validation")
X_test, y_test = _features_and_labels("test")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Alternative linear baseline on the same embedding features:
# clf = LogisticRegression(max_iter=1000)

# Random forests draw bootstrap samples and feature subsets at random; pin the seed
# so the accuracies printed below are reproducible across Restart & Run All.
clf = RandomForestClassifier(random_state=123)
clf.fit(X_train, y_train)

print("Training accuracy", clf.score(X_train, y_train))
print("Validation accuracy", clf.score(X_val, y_val))
print("test accuracy", clf.score(X_test, y_test))

Training accuracy 1.0
Validation accuracy 0.8408
test accuracy 0.8324


In [None]:
"""
Training accuracy 0.8867714285714285
Validation accuracy 0.8834
test accuracy 0.8794


Training accuracy 1.0
Validation accuracy 0.8408
test accuracy 0.8324

"""

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK corpora needed below: the stop-word lists and WordNet
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, read from the corpus once and reused."""
    return frozenset(stopwords.words('english'))


# Preprocessing function
def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Normalize one review for bag-of-words style features.

    Steps: lowercase, drop every character that is neither alphabetic nor
    whitespace, split on whitespace, discard stop words, lemmatize the
    remaining words, and join them with single spaces.

    Args:
        text: The raw review string.
        stop_words: Optional container of words to discard. Defaults to NLTK's
            English stop-word list, which is loaded once and cached instead of
            being re-read for every word.
        lemmatizer: Optional object with a ``lemmatize(word)`` method. Defaults
            to a fresh ``WordNetLemmatizer``.

    Returns:
        The cleaned text as a single space-separated string (empty when no
        word survives the filtering).
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()

    # Lowercase
    text = text.lower()
    # Remove non-alphabetic characters (punctuation is deleted, not replaced by a space)
    text = ''.join(c for c in text if c.isalpha() or c.isspace())
    # Tokenization, stop-word removal and lemmatization
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

df_train['preprocessed_review'] = df_train['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
df_train

Unnamed: 0,index,text,label,preprocessed_review
0,0,"When we started watching this series on cable,...",1,started watching series cable idea addictive w...
1,0,Steve Biko was a black activist who tried to r...,1,steve biko black activist tried resist white m...
2,0,My short comment for this flick is go pick it ...,1,short comment flick go pick chance going posit...
3,0,"As a serious horror fan, I get that certain ma...",0,serious horror fan get certain marketing ploy ...
4,0,"Robert Cummings, Laraine Day and Jean Muir sta...",1,robert cummings laraine day jean muir star one...
...,...,...,...,...
34995,0,Frank Capra's creativity must have been just a...,0,frank capra creativity must spent time made fi...
34996,0,Just saw the film tonight in a preview and it'...,0,saw film tonight preview film kid improve add ...
34997,0,"If you love Japanese monster movies, you'll lo...",1,love japanese monster movie youll love action ...
34998,0,Because it came from HBO and based on the IMDb...,0,came hbo based imdb rating watched first seaso...


In [None]:
# Inputs (cleaned review text) and targets for the TF-IDF baseline.
x_tr, y_tr = df_train['preprocessed_review'], df_train['label']

# Cap the vocabulary at 5000 terms, then fit on and transform the training text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(x_tr)



Training accuracy 1.0


In [None]:
# clf = RandomForestClassifier()
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_tr)

print("Training accuracy", clf.score(X_train_tfidf, y_tr))

# Training accuracy alone says nothing about generalization. Score the held-out
# splits too, so this baseline is comparable with the embedding-based classifier.
# The vectorizer is only transform()-ed here: refitting it on validation/test
# text would leak held-out vocabulary statistics into the features.
for split_name, split_df in (("Validation", df_val), ("test", df_test)):
    X_split_tfidf = vectorizer.transform(split_df['text'].apply(preprocess_text))
    print(f"{split_name} accuracy", clf.score(X_split_tfidf, split_df['label']))

Training accuracy 0.9102
