In [1]:
# get the evaluate library.
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
# import libraries. 

import numpy as np
import pandas as pd 
import transformers
import tensorflow as tf 
import evaluate 
from sklearn.model_selection import train_test_split
import datasets

In [3]:
# load dataset.

business_data = pd.read_csv('/kaggle/input/news-articles-classification-dataset-for-nlp-and-ml/business_data.csv')
education_data = pd.read_csv('/kaggle/input/news-articles-classification-dataset-for-nlp-and-ml/education_data.csv')
entertainment_data = pd.read_csv('/kaggle/input/news-articles-classification-dataset-for-nlp-and-ml/entertainment_data.csv')
sports_data = pd.read_csv('/kaggle/input/news-articles-classification-dataset-for-nlp-and-ml/sports_data.csv')
technology_data = pd.read_csv('/kaggle/input/news-articles-classification-dataset-for-nlp-and-ml/technology_data.csv')

# merging all data + generating category codes.
full_df = pd.concat([business_data, education_data, entertainment_data, sports_data, technology_data], axis = 0)
full_df.drop(columns=['headlines','description','url'], inplace = True)
full_df['label'] = full_df['category'].astype('category').cat.codes
full_df.head()

Unnamed: 0,content,category,label
0,"Sitharaman, the first full-time woman finance ...",business,0
1,The merger of Tata group’s budget airlines Air...,business,0
2,The Air India group plans to induct one aircra...,business,0
3,Indian exporters have asked the central govern...,business,0
4,The Air India group plans to induct one aircra...,business,0


In [4]:
# code for classification model. 

# dataset.
df_classification = full_df.drop(columns='category')
train_cdf, test_cdf = train_test_split(df_classification, test_size=0.2)
ds = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(train_cdf),
    'test': datasets.Dataset.from_pandas(test_cdf)
})

# model.
model = "google-bert/bert-base-uncased"
tokenizer = transformers.AutoTokenizer.from_pretrained(model)

# tokenizer function.
def tokenize(dataset):
    return tokenizer(dataset["content"], truncation=True, max_length=256)

# tokenize_ds.
tokenized_ds = ds.map(tokenize)
tokenized_ds

# optimizer.
batch_size = 16
num_epochs = 3
batches_per_epoch = len(tokenized_ds["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = transformers.create_optimizer(init_lr=3e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

# model instance.
id_label = {0: "BUSINESS", 1: "EDUCATION", 2: "ENTERTAINMENT", 3: "SPORTS", 4: "TECHNOLOGY"}
label_id = {"BUSINESS": 0, "EDUCATION": 1, "ENTERTAINMENT": 2, "SPORTS": 3, "TECHNOLOGY": 4}
model_instance = transformers.TFAutoModelForSequenceClassification.from_pretrained(model, num_labels=5, label2id=label_id, id2label=id_label)
model_instance.compile(optimizer=optimizer)

# to tf dataset format. 
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)
train_set = model_instance.prepare_tf_dataset(
    tokenized_ds["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)
validation_set = model_instance.prepare_tf_dataset(
    tokenized_ds["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

# metrics. 
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

metric_callback = transformers.keras_callbacks.KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=validation_set)

# training.
callbacks = [metric_callback]
model_instance.fit(train_set, epochs=3, validation_data=validation_set, callbacks=callbacks)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7d978bd77220>

In [5]:
# model_instance.save("POC_classification_model_1.keras")

In [6]:
# summarization model. 

summarizer = transformers.pipeline("summarization", model="google-t5/t5-base")
full_df.iloc[0,:]['content']

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda:0


'Sitharaman, the first full-time woman finance minister of the country, has presented five full budgets since July 2019 and will present an interim or vote-on-account budget next week.\nWith the presentation of the interim budget on February 1, Sitharaman will surpass the records of her predecessors like Manmohan Singh, Arun Jaitley, P Chidambaram, and Yashwant Sinha, who had presented five budgets in a row.\nDesai, as finance minister, had presented five annual budgets and one interim budget between 1959-1964. The interim budget 2024-25 to be presented by Sitharaman on February 1, will be a vote-on-account that will give the government authority to spend certain sums of money till a new government comes to office after the April-May general elections.\nADVERTISEMENT\nAs the Parliamentary elections are due, Sitharaman’s interim budget may not contain any major policy changes. Speaking at an industry event last month, Sitharaman had ruled out any “spectacular announcement” in the interi

In [7]:
summarizer(full_df.iloc[0,:]['content'])

[{'summary_text': 'the interim budget 2024-25 will be presented by sitharaman on February 1 . it will be a vote-on-account that will give the government authority to spend certain sums of money until a new government comes to office after the April-May general elections .'}]

In [8]:
import torch

In [9]:
full_df.iloc[3,:]['content']

'Indian exporters have asked the central government to help facilitate more credit as freight rates have jumped nearly 300 per cent due to the disruption in the Red Sea route forcing global shipping lines to take longer trade routes, which is ultimately affecting exports of low value items such as Basmati rice.\nIncreasing attacks on ships sailing in the Red Sea region since November 2023 have forced shippers to consider the alternative, longer route past the Cape of Good Hope, which has not only stretched delivery time by 15 to 20 days, but also increased the transit cost substantially because of incremental freight rates and insurance premium.\nFederation of Indian Export Organisations (FIEO) Director General Ajay Sahai told The Indian Express that freight costs have surged by 300 per cent as global shipping lines are taking the Cape of Good Hope route, which is why exporters have sought more credit to match the rising cost of shipments to Europe.\nADVERTISEMENT\nMeanwhile, ratings a

In [10]:
# text generation model. 

model = transformers.AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct", 
     return_dict=True,
     torch_dtype=torch.float16,
     device_map="auto",
     trust_remote_code=True,
)
tokenizer = transformers.AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
messages = [
    {"role": "user", "content": full_df.iloc[3,:]['content'] + "When was this text generated"},
]
gen = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = gen(prompt, max_new_tokens=120, do_sample=True)
print(outputs[0]["generated_text"])

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Device set to use cuda:0
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
`get_max_cache()` is deprecated for all Cache classes. Use `get_max_cache_shape()` instead. Calling `get_max_cache()` will raise error from v4.48


<|user|>
Indian exporters have asked the central government to help facilitate more credit as freight rates have jumped nearly 300 per cent due to the disruption in the Red Sea route forcing global shipping lines to take longer trade routes, which is ultimately affecting exports of low value items such as Basmati rice.
Increasing attacks on ships sailing in the Red Sea region since November 2023 have forced shippers to consider the alternative, longer route past the Cape of Good Hope, which has not only stretched delivery time by 15 to 20 days, but also increased the transit cost substantially because of incremental freight rates and insurance premium.
Federation of Indian Export Organisations (FIEO) Director General Ajay Sahai told The Indian Express that freight costs have surged by 300 per cent as global shipping lines are taking the Cape of Good Hope route, which is why exporters have sought more credit to match the rising cost of shipments to Europe.
ADVERTISEMENT
Meanwhile, ratin