# Question 2

In [1]:
# Supresses NonCritical Warnings of Tensorflow
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [2]:
# This code imports the TensorFlow library and then enables memory growth for GPU devices, if any are available.
# Important for some runtime errors during model execution
import tensorflow as tf

# Enable memory growth
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [3]:
## !pip install evaluate

In [4]:
import requests
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
import evaluate

## Question 2.1

### Create your own dataset for text classification. It should contain at least 1000 words in total and at least two categories with at least 100 examples per category.

Created the dataset by scraping the web.

In [5]:
headers = ({'User Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', 'Accept-Language':'en-US, en;q=0.5'})

In [6]:
url = 'https://newsapi.org/v2/top-headlines'

In [7]:
webpage = requests.get(url, headers=headers)

In [8]:
webpage # should get response [200]

<Response [401]>

In [9]:
# set the request parameters
params = {
    'category': 'technology',
    'language': 'en',
    'pageSize': 100,
    'apiKey': 'e30743332dd1426eb170927023ba09d7'
}

In [10]:
response = requests.get(url, params=params)

In [11]:
#response.content

In [12]:
type(response.content)

bytes

In [13]:
articles = response.json()['articles']

In [14]:
Tech_titles = [article['title'].split(' - ')[0].split(' | ')[0] for article in articles]

In [15]:
for title in Tech_titles:
    print(title)

Beeper Mini's iMessage integration is on the fritz
PUBG Mobile guide to avoid dying (December 2023)
Apple's iPhone and Watch Product Design Chief to Leave in Shake-Up
Den of Wolves
Minecraft – Jurassic World DLC – Nintendo Switch
Round 2: We test the new Gemini-powered Bard against ChatGPT
Apple Wallet ticket sharing is becoming more like NameDrop
GTA Vice City map vs GTA 6 map: Everything known so far from leaks and trailer
A brain without a body can now be kept alive for hours, thanks to new tech
Hideo Kojima Teams With Jordan Peele for Upcoming Horror Game ‘OD,’ Announces His ‘Death Stranding’ Docu Will Stream on Disney+
Google’s AI-assisted NotebookLM note-taking app is now open to users in the US
Google faces controversy over edited Gemini AI demo video
Chimney for modern homes: Here are top 10 options to consider before buying one
The Game Awards 2023: All The Biggest Announcements
Battlegrounds Mobile India (BGMI): Mid-range guide (December 2023)
This brand made an $1,100 edible

In [16]:
len(Tech_titles)

100

In [17]:
# set the request parameters
params = {
    'category': 'business',
    'language': 'en',
    'pageSize': 100,
    'apiKey': 'e30743332dd1426eb170927023ba09d7'
}

# send the request and get the response
response = requests.get(url, params=params)

# get the 'articles' list from the response JSON data
articles = response.json()['articles']

# extract the titles from the articles aslo splitting the source from titles
Business_titles = [article['title'].split(' - ')[0].split(' | ')[0] for article in articles]

# print the titles
for title in Business_titles:
    print(title)

Stock Faithful Ride $7 Trillion Rally as Market Timing Backfires
Why automakers are turning to hybrids in the middle of the industry's EV transition
SmileDirectClub Rescue Deal Falls Apart, Company to Liquidate
DHL Express strike
Starbucks Says It Wants Union Talks, Agreements in 2024
Exxon Mobil is one of most oversold names, could be primed for bounce
Paramount stock surges more than 14% as sale chatter mounts
S&P 500 is little changed, on pace to snap 5-week win streak as comeback rally pauses: Live updates
New McDonald's spinoff restaurant CosMc's officially opens in a Chicago suburb
Bulk Deals: Softbank exits Zomato, GQG Partners buys Rs 1,671.5 cr shares in GMR Airports Infra
Monitoring panel clears directors for Rel Naval board
New laws taking effect in Minnesota on Jan. 1, 2024
Elon Musk Throws Fresh Barbs at Disney's Bob Iger
Kawasaki W175 Street Launched In India At Rs. 1.35 Lakh
Paramount, RH, Carrier Global, Lululemon, First Solar, and More Market Movers
Stocks gain, Treasu

In [18]:
len(Business_titles)

100

In [19]:
data_list = []
for title in Tech_titles:
    data_list.append({'sentence': str(title), 'label': 0}) 

for title in Business_titles:
    data_list.append({'sentence': str(title), 'label': 1}) 

In [20]:
data = pd.DataFrame(data_list)
print(data)

                                              sentence  label
0    Beeper Mini's iMessage integration is on the f...      0
1     PUBG Mobile guide to avoid dying (December 2023)      0
2    Apple's iPhone and Watch Product Design Chief ...      0
3                                        Den of Wolves      0
4     Minecraft – Jurassic World DLC – Nintendo Switch      0
..                                                 ...    ...
195           Off-Duty Pilot, Joseph Emerson, Released      1
196  Nifty 50, Sensex today: What to expect from st...      1
197  X begins rolling out Grok, its ‘rebellious’ ch...      1
198  House Republicans have once again voted to poi...      1
199  Stock Market Today: All You Need To Know Going...      1

[200 rows x 2 columns]


In [21]:
data = shuffle(data, random_state=987654321)
data.head()

Unnamed: 0,sentence,label
155,The biggest study of ‘greedflation’ yet looked...,1
175,Apple's strategic pivot: A quarter of world's ...,1
124,The self-checkout reversal is growing,1
11,Google faces controversy over edited Gemini AI...,0
23,For some reason Yamaha is launching a 5-disk C...,0


In [22]:
X_train = data.drop('label', axis=1)
y_train = data['label']

In [23]:
len(X_train)

200

In [24]:
len(y_train)

200

## Question 2.2

### Split the dataset into training (at least 160 examples) and test (at least 40 examples) sets.

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=40, random_state=987654321)

In [26]:
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

160
160
40
40


## Question 2.3

### Fine tune a pretrained language model capable of generating text (e.g., GPT) that you can take from the Hugging Face Transformers library with the dataset your created (this tutorial could be very helpful: https://huggingface.co/docs/transformers/training).

In [27]:
import tensorflow as tf
tf.keras.backend.clear_session()
tf.random.set_seed(987654321)
np.random.seed(987654321)

In [28]:
## !pip install --upgrade transformers

**Using Albert-base-v2**

In [29]:
# initializing a tokenizer and a pre-trained model for sequence classification using the ALBERT-base-v2 architecture
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

model = TFAutoModelForSequenceClassification.from_pretrained("albert-base-v2")

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
X_train = dict(tokenizer([str(i) for i in X_train['sentence']], return_tensors='np', padding=True))
X_test = dict(tokenizer([str(i) for i in X_test['sentence']], return_tensors='np', padding=True))

In [31]:
model.compile(optimizer=Adam(1e-5))
model.fit(X_train, y_train,epochs=4, batch_size=80)



Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x17d618c50>

In [32]:
preds = model.predict(X_test)["logits"]



In [33]:
y_pred = np.argmax(preds, axis=1)

### Report the test accuracy.

In [34]:
metric = evaluate.load('accuracy')
metric.compute(predictions=y_pred, references=np.array(y_test))

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.775}

### Discuss what could be done to improve accuracy.

We can see here that when we compile the model with 4 epochs, the training loss is reduced at each step. When we calculate the accuracy, we get 77.5% accuracy. If we increase the epochs, we can potentially obtain better accuracy, but there is also a risk of overfitting the model. Hence, we will test the model with epoch = 20 to further evaluate its performance. However, we can add regularization techniques like dropout to prevent overfitting. We can also experiment with different batch sizes. A smaller batch size may allow the model to generalize better.

In [35]:
tf.keras.backend.clear_session()
tf.random.set_seed(987654321)
np.random.seed(987654321)

In [36]:
model.compile(optimizer=Adam(1e-5))
model.fit(X_train, y_train,epochs=20, batch_size=80)



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x28ade4610>

In [37]:
preds = model.predict(X_test)["logits"]



In [38]:
y_pred = np.argmax(preds, axis=1)

In [39]:
metric = evaluate.load('accuracy')
metric.compute(predictions=y_pred, references=np.array(y_test))

{'accuracy': 0.85}

After training the model with 20 epochs, we observed a decrease in the training loss at each step. The accuracy obtained was 85%, which is higher than the accuracy obtained with 4 epochs. However, we could have achieved even higher accuracy if we had used a larger model.

### Trying different models to gain a better understanding.

In [40]:
tf.keras.backend.clear_session()
tf.random.set_seed(987654321)
np.random.seed(987654321)

**Using GPT-J**
> Computationally expensive since the model requires around 24.2 GB of memory space to download.

> Too big to be used on regular hardware: wouldn't fit in RAM.

In [41]:
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

model = TFAutoModelForSequenceClassification.from_pretrained("EleutherAI/gpt-j-6B")

model.compile(optimizer=Adam(1e-5))
model.fit(X_train, y_train,epochs=20, batch_size=80)

preds = model.predict(X_test)["logits"]

y_pred = np.argmax(preds, axis=1)

metric = evaluate.load('accuracy')
metric.compute(predictions=y_pred, references=np.array(y_test))

tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

KeyboardInterrupt: 

**Using GPT-Sw3**
> Not released publicly: requires token access.

In [42]:
tokenizer = AutoTokenizer.from_pretrained("AI-Sweden/gpt-sw3-356m")

model = TFAutoModelForSequenceClassification.from_pretrained("AI-Sweden/gpt-sw3-356m")

model.compile(optimizer=Adam(1e-5))
model.fit(X_train, y_train,epochs=20, batch_size=80)

preds = model.predict(X_test)["logits"]

y_pred = np.argmax(preds, axis=1)

metric = evaluate.load('accuracy')
metric.compute(predictions=y_pred, references=np.array(y_test))

OSError: AI-Sweden/gpt-sw3-356m is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`