## Core Libraries

In [2]:
!pip uninstall -y numpy 
!pip install numpy==1.26.4 nltk --no-cache-dir
!pip install spacy stanza textblob
import nltk

!python -m spacy download en_core_web_sm

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m128.5 MB/s[0m eta [36m0:00:00[0m
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m310.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.15.3 which is incompatible.


In [3]:
import nltk
import stanza
import spacy
from textblob import TextBlob

In [4]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /usr/share/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /usr/share/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron

True

In [5]:
# text example
text = "Apple is looking at buying U.K. startup for $1 billion"

In [6]:
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

print("\n=== spaCy Example ===")
print("Tokens:", [token.text for token in doc])
print("Named Entities:", [(ent.text, ent.label_) for ent in doc.ents])
print("Part-of-Speech Tags:", [(token.text, token.pos_) for token in doc])


=== spaCy Example ===
Tokens: ['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion']
Named Entities: [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]
Part-of-Speech Tags: [('Apple', 'PROPN'), ('is', 'AUX'), ('looking', 'VERB'), ('at', 'ADP'), ('buying', 'VERB'), ('U.K.', 'PROPN'), ('startup', 'VERB'), ('for', 'ADP'), ('$', 'SYM'), ('1', 'NUM'), ('billion', 'NUM')]


In [8]:
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.sentiment import SentimentIntensityAnalyzer
text = "Apple is looking at buying U.K. startup for $1 billion, I love it"
tokens = word_tokenize(text)
tags = pos_tag(tokens)
ner_tree = ne_chunk(tags)
print("\n=== NLTK Example ===")
print("Tokens:", tokens)
print("POS Tags:", tags)
print("Named Entity Chunk Tree:")
print(ner_tree)
sia = SentimentIntensityAnalyzer()
print("Sentiment:", sia.polarity_scores(text))


=== NLTK Example ===
Tokens: ['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', ',', 'I', 'love', 'it']
POS Tags: [('Apple', 'NNP'), ('is', 'VBZ'), ('looking', 'VBG'), ('at', 'IN'), ('buying', 'VBG'), ('U.K.', 'NNP'), ('startup', 'NN'), ('for', 'IN'), ('$', '$'), ('1', 'CD'), ('billion', 'CD'), (',', ','), ('I', 'PRP'), ('love', 'VBP'), ('it', 'PRP')]
Named Entity Chunk Tree:
(S
  (GPE Apple/NNP)
  is/VBZ
  looking/VBG
  at/IN
  buying/VBG
  U.K./NNP
  startup/NN
  for/IN
  $/$
  1/CD
  billion/CD
  ,/,
  I/PRP
  love/VBP
  it/PRP)
Sentiment: {'neg': 0.0, 'neu': 0.724, 'pos': 0.276, 'compound': 0.6369}


In [9]:
blob = TextBlob(text)

print("\n=== TextBlob Example ===")
print("Tokens:", blob.words)
print("POS Tags:", blob.tags)
print("Sentiment (polarity, subjectivity):", blob.sentiment)



=== TextBlob Example ===
Tokens: ['Apple', 'is', 'looking', 'at', 'buying', 'U.K', 'startup', 'for', '1', 'billion', 'I', 'love', 'it']
POS Tags: [('Apple', 'NNP'), ('is', 'VBZ'), ('looking', 'VBG'), ('at', 'IN'), ('buying', 'VBG'), ('U.K.', 'NNP'), ('startup', 'NN'), ('for', 'IN'), ('1', 'CD'), ('billion', 'CD'), ('I', 'PRP'), ('love', 'VBP'), ('it', 'PRP')]
Sentiment (polarity, subjectivity): Sentiment(polarity=0.5, subjectivity=0.6)


## CUPY 

In [1]:
import cupy as cp
import numpy as np
import time

In [12]:
# on cpu 

n = 1_000_000
x_cpu = np.random.rand(n)
y_cpu = np.random.rand(n)

start = time.time()
cpu_result = np.sqrt(x_cpu ** 2 + y_cpu ** 2)
cpu_time = time.time() - start
print(f"CPU time: {cpu_time:.4f} seconds")

CPU time: 0.0081 seconds


In [13]:
x_gpu = cp.asarray(x_cpu)
y_gpu = cp.asarray(y_cpu)

cp.cuda.Stream.null.synchronize() 

start = time.time()
gpu_result = cp.sqrt(x_gpu ** 2 + y_gpu ** 2)

cp.cuda.Stream.null.synchronize() 
gpu_time = time.time() - start
print(f"GPU time: {gpu_time:.4f} seconds")

GPU time: 0.0007 seconds


In [15]:
result_back_to_cpu = cp.asnumpy(gpu_result)
print("Results match:", np.allclose(cpu_result, result_back_to_cpu))
print(f"Speedup: {cpu_time / gpu_time:.2f}x faster on GPU!")

Results match: True
Speedup: 11.61x faster on GPU!


### Exercise: what do you expect would happen if we increase n?

# Pytorch

In [1]:
!pip install --upgrade torch torchtext --index-url https://download.pytorch.org/whl/cu126

Looking in indexes: https://download.pytorch.org/whl/cu126
Collecting torch
  Downloading https://download.pytorch.org/whl/cu126/torch-2.8.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchtext
  Downloading https://download.pytorch.org/whl/torchtext-0.17.0%2Bcpu-cp311-cp311-linux_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading https://download.pytorch.org/whl/cu126/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading https://download.pytorch.org/whl/cu126/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [3]:
import torch
from torch import nn
from torchtext.data.utils import get_tokenizer
from collections import Counter

In [4]:
#  Dataset
train_data = [
    ("I love this movie", 1),
    ("This film was great", 1),
    ("Amazing performance and good story", 1),
    ("Absolutely fantastic movie", 1),
    ("Bad movie, I hated it", 0),
    ("Terrible plot and poor acting", 0),
    ("Worst film I have ever seen", 0),
    ("I did not like this movie", 0),
]
test_data = [
    ("Good acting and nice story", 1),
    ("I really disliked the film", 0),
]

In [6]:
# load tokenizer from torchtext
tokenizer = get_tokenizer("basic_english")

# counter is a dictionary that counts words, so its keys represent a unique set of words
counter = Counter()
for text, _ in train_data:
    counter.update(tokenizer(text))

# Map words to IDs
itos = ["<unk>"] + sorted(counter.keys())
print(itos)
stoi = {word: i for i, word in enumerate(itos)}
print(stoi)
def encode(text):
    tokens = tokenizer(text)
    return torch.tensor([stoi.get(t, 0) for t in tokens], dtype=torch.long)

print("✅ Vocabulary size:", len(stoi))
print("🗒️ Sample vocab:", list(stoi.items())[:10])

['<unk>', ',', 'absolutely', 'acting', 'amazing', 'and', 'bad', 'did', 'ever', 'fantastic', 'film', 'good', 'great', 'hated', 'have', 'i', 'it', 'like', 'love', 'movie', 'not', 'performance', 'plot', 'poor', 'seen', 'story', 'terrible', 'this', 'was', 'worst']
{'<unk>': 0, ',': 1, 'absolutely': 2, 'acting': 3, 'amazing': 4, 'and': 5, 'bad': 6, 'did': 7, 'ever': 8, 'fantastic': 9, 'film': 10, 'good': 11, 'great': 12, 'hated': 13, 'have': 14, 'i': 15, 'it': 16, 'like': 17, 'love': 18, 'movie': 19, 'not': 20, 'performance': 21, 'plot': 22, 'poor': 23, 'seen': 24, 'story': 25, 'terrible': 26, 'this': 27, 'was': 28, 'worst': 29}
✅ Vocabulary size: 30
🗒️ Sample vocab: [('<unk>', 0), (',', 1), ('absolutely', 2), ('acting', 3), ('amazing', 4), ('and', 5), ('bad', 6), ('did', 7), ('ever', 8), ('fantastic', 9)]


In [7]:
# simplest model ever
class SentimentModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, 2)
        #self.act = nn.Tanh()
        #self.act = nn.Sigmoid()
        #self.act = nn.Relu()
        
    def forward(self, x):
        embedded = self.embedding(x)
        pooled = embedded.mean(dim=0)
        return self.fc(pooled)

In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentimentModel(len(stoi), embed_dim=16).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(30):
    total_loss = 0
    for text, label in train_data:
        x = encode(text).to(device)
        y = torch.tensor([label]).to(device)
        optimizer.zero_grad()
        pred = model(x)
        loss = criterion(pred.unsqueeze(0), y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: Loss = {total_loss:.4f}")

Epoch 1: Loss = 6.1441
Epoch 2: Loss = 5.1717
Epoch 3: Loss = 4.5803
Epoch 4: Loss = 4.0272
Epoch 5: Loss = 3.4839
Epoch 6: Loss = 2.9501
Epoch 7: Loss = 2.4414
Epoch 8: Loss = 1.9795
Epoch 9: Loss = 1.5824
Epoch 10: Loss = 1.2575
Epoch 11: Loss = 1.0014
Epoch 12: Loss = 0.8042
Epoch 13: Loss = 0.6537
Epoch 14: Loss = 0.5387
Epoch 15: Loss = 0.4503
Epoch 16: Loss = 0.3814
Epoch 17: Loss = 0.3271
Epoch 18: Loss = 0.2837
Epoch 19: Loss = 0.2484
Epoch 20: Loss = 0.2195
Epoch 21: Loss = 0.1954
Epoch 22: Loss = 0.1752
Epoch 23: Loss = 0.1581
Epoch 24: Loss = 0.1434
Epoch 25: Loss = 0.1307
Epoch 26: Loss = 0.1197
Epoch 27: Loss = 0.1100
Epoch 28: Loss = 0.1015
Epoch 29: Loss = 0.0940
Epoch 30: Loss = 0.0873


In [17]:
# eval
model.eval()
with torch.no_grad():
    for text, true_label in test_data:
        x = encode(text).to(device)
        output = model(x)
        pred = torch.argmax(output).item()
        print(f"{text!r} → Prediction: {'Positive ' if pred==1 else 'Negative '} (True: {true_label})")


'Good acting and nice story' → Prediction: Positive  (True: 1)
'I really disliked the film' → Prediction: Negative  (True: 0)


### Exercise, use an actual dataset instead of the dummy dataset given here and try to see how many epochs it would take to converge

## Datasets and Dataloaders

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

np.random.seed(0)
x = np.linspace(0, 10, 100)
y = 2 * x + 1 + np.random.normal(0, 1, size=x.shape)
df = pd.DataFrame({"x": x, "y": y})

# 2️⃣ Create a custom Dataset that reads from the DataFrame
class LinearDFDataset(Dataset):
    def __init__(self, dataframe):
        self.df = dataframe

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        x = torch.tensor(self.df.iloc[idx]["x"], dtype=torch.float32).unsqueeze(0)
        y = torch.tensor(self.df.iloc[idx]["y"], dtype=torch.float32).unsqueeze(0)
        return x, y

# 3️⃣ Create dataset and dataloader
dataset = LinearDFDataset(df)
dataloader = DataLoader(dataset, batch_size=10, shuffle=True)

# 4️⃣ Iterate through the dataloader
for batch_idx, (x_batch, y_batch) in enumerate(dataloader):
    print(f"Batch {batch_idx+1}:")
    print("x:", x_batch.flatten())
    print("y:", y_batch.flatten())
    print()


Batch 1:
x: tensor([7.1717, 6.9697, 5.9596, 1.8182, 2.2222, 5.4545, 5.6566, 9.6970, 3.0303,
        8.5859])
y: tensor([15.4724, 14.9913, 12.5565,  4.9494,  6.3089, 11.8809, 12.3796, 20.4044,
         7.2156, 20.0676])

Batch 2:
x: tensor([8.2828, 2.3232, 9.1919, 0.0000, 6.0606, 6.8687, 6.4646, 1.4141, 3.1313,
        9.0909])
y: tensor([18.0313,  4.9043, 20.6063,  2.7641, 12.4488, 13.8301, 14.1067,  4.2721,
         7.6408, 18.7786])

Batch 3:
x: tensor([5.7576, 9.4949, 2.5253, 4.1414, 2.9293, 8.0808, 9.5960, 9.8990, 9.7980,
        7.4747])
y: tensor([12.8176, 20.3463,  4.5961,  7.8628,  8.3279, 15.9965, 20.8985, 20.9249,
        22.3818, 16.3518])

Batch 4:
x: tensor([ 2.6263,  4.4444, 10.0000,  0.8081,  6.6667,  0.7071,  9.2929,  4.9495,
         7.3737,  4.6465])
y: tensor([ 6.2983,  9.3792, 21.4020,  2.5129, 12.7031,  2.2628, 19.7941, 10.6862,
        14.5126,  9.0401])

Batch 5:
x: tensor([2.4242, 2.7273, 2.8283, 1.3131, 7.2727, 0.5051, 3.2323, 6.7677, 5.3535,
        0.2020])
y

In [20]:
model = torch.nn.Linear(1, 1)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(10):
    for x_batch, y_batch in dataloader:
        y_pred = model(x_batch)
        loss = criterion(y_pred, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 1.3138
Epoch 2, Loss: 1.4318
Epoch 3, Loss: 0.7695
Epoch 4, Loss: 1.3125
Epoch 5, Loss: 1.5634
Epoch 6, Loss: 2.2426
Epoch 7, Loss: 1.4676
Epoch 8, Loss: 1.6393
Epoch 9, Loss: 1.1516
Epoch 10, Loss: 0.8592


### Exercise: Use the dataset and dataloader objects on an NLP dataset

## HuggingFace

In [1]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2025-10-05 01:20:18.111759: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759627218.297512      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759627218.348261      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [3]:
texts = [
    "I love this movie!",
    "It was an average experience.",
    "I hate this film so much."
]

inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)


In [5]:
for text, pred in zip(texts, predictions):
    print(f"Text: {text}")
    print(f"Predicted Sentiment: {pred.item() + 1} ")

Text: I love this movie!
Predicted Sentiment: 5 
Text: It was an average experience.
Predicted Sentiment: 3 
Text: I hate this film so much.
Predicted Sentiment: 1 
