In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from datasets import Dataset,concatenate_datasets,DatasetDict
from transformers import AutoTokenizer

In [None]:
# Read the Hugging Face access token from the Kaggle secrets client and expose
# it through the HF_TOKEN environment variable, so the token itself is never
# written into the notebook.
import os
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# "HF_ACC_TOK" is the label under which the secret is looked up.
os.environ["HF_TOKEN"] = user_secrets.get_secret("HF_ACC_TOK")

In [None]:
# Load the Reuters headlines CSV from the Kaggle input directory into `df`.
df=pd.read_csv('/kaggle/input/financial-news-headlines/reuters_headlines.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,Headlines,Time,Description
0,TikTok considers London and other locations fo...,Jul 18 2020,TikTok has been in discussions with the UK gov...
1,Disney cuts ad spending on Facebook amid growi...,Jul 18 2020,Walt Disney has become the latest company to ...
2,Trail of missing Wirecard executive leads to B...,Jul 18 2020,Former Wirecard chief operating officer Jan M...
3,Twitter says attackers downloaded data from up...,Jul 18 2020,Twitter Inc said on Saturday that hackers were...
4,U.S. Republicans seek liability protections as...,Jul 17 2020,A battle in the U.S. Congress over a new coron...


In [None]:
# Show column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32770 entries, 0 to 32769
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Headlines    32770 non-null  object
 1   Time         32770 non-null  object
 2   Description  32770 non-null  object
dtypes: object(3)
memory usage: 768.2+ KB


In [None]:
# Count missing values per column.
df.isnull().sum()

Headlines      0
Time           0
Description    0
dtype: int64

In [None]:
# Keep an untouched copy of the loaded data; the working frame `df` is
# re-derived from this copy in the next cell.
df_original=df.copy()

In [None]:
# Keep only the text pair used downstream: the headline (tokenized later as
# the target) and the description (tokenized later as the model input).
# Columns are selected by name rather than by position so that a change in
# the CSV column order cannot silently pick the wrong field, and the explicit
# .copy() documents that `df` is an independent frame which later cells may
# modify without affecting `df_original`.
df = df_original[["Headlines", "Description"]].copy()

# Text Analysis

## Text Length - Characters

In [None]:
# Character count of every text, one result column per column of `df`.
df_text_length = pd.DataFrame(
    {column: df[column].str.len() for column in df.columns}
)

In [None]:
# Summary statistics of the per-text character counts.
df_text_length.describe()

Unnamed: 0,Headlines,Description
count,32770.0,32770.0
mean,65.290571,213.985383
std,10.675765,40.179914
min,20.0,23.0
25%,58.0,191.0
50%,65.0,219.0
75%,72.0,239.0
max,117.0,500.0


## Average Word Length

In [None]:
def _mean_word_length(text):
    """Return the mean length of the whitespace-separated words in `text`.

    Returns 0 when `text` contains no words, so an empty string does not
    produce a NaN from averaging an empty list.
    """
    words = text.split()
    if not words:
        return 0
    return np.mean([len(word) for word in words])


# Average word length of every text, one result column per column of `df`.
df_avg_word_length = pd.DataFrame(
    {column: df[column].apply(_mean_word_length) for column in df.columns}
)

In [None]:
# Summary statistics of the per-text average word length.
df_avg_word_length.describe()

Unnamed: 0,Headlines,Description
count,32770.0,32770.0
mean,5.496406,5.265801
std,0.79032,0.468275
min,3.166667,3.692308
25%,4.923077,4.95122
50%,5.428571,5.235294
75%,6.0,5.541667
max,10.75,18.32


## Word Count

In [None]:
# Number of tokens NLTK's word_tokenize produces for every text, one result
# column per column of `df`.
df_word_count = pd.DataFrame(
    {
        column: df[column].apply(lambda text: len(word_tokenize(text)))
        for column in df.columns
    }
)

In [None]:
# Summary statistics of the per-text token counts.
df_word_count.describe()

Unnamed: 0,Headlines,Description
count,32770.0,32770.0
mean,11.289136,37.694507
std,2.279351,7.293172
min,3.0,6.0
25%,10.0,33.0
50%,11.0,38.0
75%,13.0,42.0
max,22.0,99.0


## Sentence Count

In [None]:
# Number of sentences NLTK's sent_tokenize finds in every text, one result
# column per column of `df`.
df_sentence_count = pd.DataFrame(
    {
        column: df[column].apply(lambda text: len(sent_tokenize(text)))
        for column in df.columns
    }
)

In [None]:
# Summary statistics of the per-text sentence counts.
df_sentence_count.describe()

Unnamed: 0,Headlines,Description
count,32770.0,32770.0
mean,1.009155,1.024138
std,0.098395,0.178484
min,1.0,1.0
25%,1.0,1.0
50%,1.0,1.0
75%,1.0,1.0
max,4.0,8.0


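# Directive

Each description is prefixed with the instruction `entitle: ` so that the input text itself states the task: generate a headline for the description.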

In [None]:
# Prepend the task instruction "entitle: " (and append the single trailing
# space used by the original format) to every description.
# The prefixed column is rebuilt from the pristine `df_original` text rather
# than from `df` itself, so re-running this cell cannot stack the prefix
# ("entitle: entitle: ..."). The column is addressed by name instead of by
# position so the assignment cannot land on the wrong field.
df["Description"] = df_original["Description"].map(
    lambda text: f'entitle: {text} '
)

In [None]:
# Preview the frame after the "entitle: " prefix has been added.
df.head()

Unnamed: 0,Headlines,Description
0,TikTok considers London and other locations fo...,entitle: TikTok has been in discussions with t...
1,Disney cuts ad spending on Facebook amid growi...,entitle: Walt Disney has become the latest co...
2,Trail of missing Wirecard executive leads to B...,entitle: Former Wirecard chief operating offi...
3,Twitter says attackers downloaded data from up...,entitle: Twitter Inc said on Saturday that hac...
4,U.S. Republicans seek liability protections as...,entitle: A battle in the U.S. Congress over a ...


# HuggingFace Dataset

In [None]:
# Convert the pandas frame (headlines + prefixed descriptions) into a
# Hugging Face Dataset.
hf_dataset = Dataset.from_pandas(df)

In [None]:
# Hold out 25% of the rows as the test split; the fixed seed keeps the split
# reproducible across runs.
hf_dataset = hf_dataset.train_test_split(test_size=0.25, seed=42)
hf_dataset_train, hf_dataset_test = hf_dataset['train'], hf_dataset['test']

In [None]:
# Carve the validation split out of the training rows: 20% of what remained
# after the test hold-out, with the same fixed seed for reproducibility.
_train_val_split = hf_dataset_train.train_test_split(test_size=0.2, seed=42)
hf_dataset_train, hf_dataset_val = (
    _train_val_split['train'],
    _train_val_split['test'],
)

In [None]:
# Bundle the three splits into a single DatasetDict keyed by split name.
# Note that `hf_dataset` is rebound here from the earlier split result to
# this three-way dictionary.
hf_dataset = DatasetDict({
    "train":hf_dataset_train,
    "val":hf_dataset_val,
    "test":hf_dataset_test,})

In [None]:
# Checkpoint whose tokenizer is used to encode the text.
model_name="google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Truncation limits handed to the tokenizer as max_length: the first for the
# descriptions (inputs), the second for the headlines (targets).
max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of description/headline pairs.

    Args:
        examples: Batch mapping that provides the "Description" texts
            (model inputs) and the "Headlines" texts (targets).

    Returns:
        The tokenizer output for the descriptions, with the token ids of
        the headlines stored under the "labels" key.
    """
    # Encode the descriptions, truncated to max_input_length.
    model_inputs = tokenizer(
        list(examples["Description"]),
        max_length=max_input_length,
        truncation=True,
    )

    # Encode the headlines through text_target, truncated to max_target_length.
    target_encoding = tokenizer(
        text_target=examples["Headlines"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split; batched=True passes batches of examples (rather than
# single rows) to preprocess_function.
tokenized_dataset = hf_dataset.map(preprocess_function, batched=True)

In [None]:
# Upload the tokenized splits to the Hugging Face Hub repository named below.
# NOTE(review): presumably authenticated through the HF_TOKEN environment
# variable set near the top of the notebook — confirm.
tokenized_dataset.push_to_hub("Ankonbh/Financial-News-Headlines-Reuters")