In [1]:
import os.path as op

import numpy as np
import pandas as pd
import torch
import lightning as L
from lightning.pytorch.loggers import CSVLogger

from tools import download_dataset, load_dataset_into_to_dataframe, partition_dataset, plot_csv_logger, LightningModel

## Load dataset

In [2]:
download_dataset()

In [3]:
df = load_dataset_into_to_dataframe()

df.head()

100%|██████████| 50000/50000 [01:39<00:00, 502.33it/s] 

Class distribution:





Unnamed: 0,text,label
0,I went and saw this movie last night after bei...,1
0,Actor turned director Bill Paxton follows up h...,1
0,As a recreational golfer with some knowledge o...,1
0,"I saw this film in a sneak preview, and it is ...",1
0,Bill Paxton has taken the true story of the 19...,1


In [4]:
partition_dataset(df)

In [5]:
df_training = pd.read_csv("train.csv")
df_training.tail()

Unnamed: 0,index,text,label
34995,0,Frank Capra's creativity must have been just a...,0
34996,0,Just saw the film tonight in a preview and it'...,0
34997,0,"If you love Japanese monster movies, you'll lo...",1
34998,0,Because it came from HBO and based on the IMDb...,0
34999,0,"WARNING!!! SOME POSSIBLE PLOT SPOILERS, AS IF ...",0


In [6]:
np.bincount(df_training["label"])

array([17452, 17548], dtype=int64)

In [7]:
df_val = pd.read_csv("val.csv")
df_val.tail()

Unnamed: 0,index,text,label
4995,0,The Matador is a strange film. Its main charac...,1
4996,0,Not bad performances. Whoopi plays the wise/wa...,0
4997,0,I was surprised when I saw this film. I'd hear...,0
4998,0,When great director/actor combinations are tal...,0
4999,0,This show is non Stop hilarity. the first joke...,1


In [8]:
np.bincount(df_val["label"])

array([2542, 2458], dtype=int64)

In [9]:
df_test = pd.read_csv("test.csv")
df_test.tail()

Unnamed: 0,index,text,label
9995,0,Every generation fully believes it is living i...,0
9996,0,Possibly the most brilliant thing about Che: P...,1
9997,0,I was unsure of this movie before renting and ...,1
9998,0,"Just got out of an advance screening, and wow ...",1
9999,0,I sense out there a mix of confusion and varyi...,1


In [10]:
np.bincount(df_test["label"])

array([5006, 4994], dtype=int64)

## Bag of Words model

In [11]:
!pip install scikit-learn





In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
count_vect = CountVectorizer(
    lowercase=True,
    max_features=10000,
    stop_words="english"
)

In [14]:
count_vect.fit(df_training["text"])

In [15]:
count_vect.vocabulary_

{'started': 8515,
 'watching': 9725,
 'series': 7957,
 'cable': 1320,
 'idea': 4488,
 'hate': 4191,
 'character': 1544,
 'hold': 4339,
 'beautifully': 892,
 'developed': 2574,
 'understand': 9375,
 'react': 7196,
 'frustration': 3737,
 'fear': 3439,
 'greed': 4020,
 'temptation': 8974,
 'way': 9736,
 'viewer': 9574,
 'experiencing': 3280,
 'christopher': 1656,
 'learning': 5199,
 'br': 1151,
 'abuse': 188,
 'physically': 6608,
 'emotionally': 3046,
 'just': 4963,
 'read': 7199,
 'newspaper': 6088,
 'women': 9880,
 'tolerate': 9134,
 'behavior': 915,
 'dream': 2831,
 'house': 4418,
 'endless': 3074,
 'supply': 8779,
 'expensive': 3276,
 'things': 9036,
 'sure': 8791,
 'loving': 5426,
 'faithful': 3371,
 'husband': 4465,
 'maybe': 5640,
 'watch': 9719,
 'doesn': 2754,
 'matter': 5630,
 'times': 9104,
 'episode': 3140,
 'missed': 5813,
 'episodes': 3141,
 'sequence': 7950,
 'season': 7869,
 'late': 5151,
 'night': 6101,
 'commercials': 1874,
 'language': 5133,
 'reruns': 7427,
 'movie': 5

In [16]:
X_train = count_vect.transform(df_training["text"])
X_val = count_vect.transform(df_val["text"])
X_test = count_vect.transform(df_test["text"])

In [29]:
X_train.shape

(35000, 10000)

In [17]:
fc_vec = np.array(X_train[0].todense())[0]
fc_vec.shape

(10000,)

In [18]:
np.bincount(fc_vec)

array([9926,   67,    5,    0,    1,    0,    1], dtype=int64)

## Custom dataloader

In [19]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):

    def __init__(self, X, y):
        self._features = torch.tensor(X, dtype=torch.float32)
        self._labels = torch.tensor(y, dtype=torch.int64)

    def __getitem__(self, indx):
        x = self._features[indx]
        y = self._labels[indx]

        return x, y

    def __len__(self):
        return self._labels.shape[0]



In [20]:
train_ds = TextDataset(X_train.todense(), df_training["label"].values)

train_dl = DataLoader(
    dataset=train_ds,
    batch_size=32,
    shuffle=True,
)

In [21]:
val_ds = TextDataset(X_val.todense(), df_val["label"].values)

val_dl = DataLoader(
    dataset=val_ds,
    batch_size=32,
    shuffle=True,
)

In [22]:
test_ds = TextDataset(X_test.todense(), df_test["label"].values)

test_dl = DataLoader(
    dataset=test_ds,
    batch_size=32,
    shuffle=True,
)

In [23]:
for batch_idx, (features, class_labels) in enumerate(train_dl):
    break

## BoW Classifier

In [24]:
class LogisticRegression(torch.nn.Module):

    def __init__(self, num_features, num_classes):
        super().__init__()
        self._linear = torch.nn.Linear(num_features, num_classes)

    def forward(self, x):
        logits = self._linear(x)
        return logits


pt_model = LogisticRegression(num_features=10000, num_classes=2)

In [25]:
from lightning.pytorch.callbacks import ModelCheckpoint

callbacks = [
    ModelCheckpoint(save_top_k=1, mode="max", monitor="val_acc", save_last=True)
]

In [26]:
lt_model = LightningModel(model=pt_model, learning_rate=0.01)

In [27]:
trainer = L.Trainer(
    callbacks=callbacks,
    max_epochs=20,
    accelerator="auto",
    logger=CSVLogger(save_dir="logs/", name="lt_model"),
    deterministic=True
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [30]:
trainer.fit(model=lt_model, train_dataloaders=train_dl, val_dataloaders=val_dl)

D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\callbacks\model_checkpoint.py:630: Checkpoint directory logs/lt_model\version_0\checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type               | Params
-------------------------------------------------
0 | model     | LogisticRegression | 20.0 K
1 | train_acc | MulticlassAccuracy | 0     
2 | val_acc   | MulticlassAccuracy | 0     
3 | test_acc  | MulticlassAccuracy | 0     
-------------------------------------------------
20.0 K    Trainable params
0         Non-trainable params
20.0 K    Total params
0.080     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
D:\Programming Train\dl_lightning_ai_course\venv\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

OSError: [WinError 1314] Клиент не обладает требуемыми правами: 'logs/lt_model\\version_0\\checkpoints\\epoch=1-step=2188.ckpt' -> 'logs/lt_model\\version_0\\checkpoints\\last.ckpt'

In [None]:
plot_csv_logger(csv_path=f"{trainer.logger.log_dir}/metrics.csv")

In [None]:
trainer.test(model=lt_model, dataloaders=test_dl)