# Imports

In [7]:
import torch
import torch.nn as nn 
import pandas as pd
from torch.nn import functional as F
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

import os

# Hyperparameters

In [60]:
# Number of samples per batch; passed as `batch_size` to the DataLoaders.
BATCH_SIZE = 32
# Presumably the number of context words on each side of the target word.
# NOTE(review): the windowing cell defines its own lowercase `context_size`;
# confirm which of the two is meant to be authoritative.
CONTEXT_SIZE = 3
# Directory (relative path) that is scanned for the .txt article files.
DATA_PATH = '../business'

# Load data

In [8]:
# Names of all entries directly inside DATA_PATH whose name ends with '.txt'
file_list = [entry for entry in os.listdir(DATA_PATH) if entry.endswith('.txt')]

In [9]:
# Read each listed file in full; `texts` stays aligned index-for-index with `file_list`
texts = []
for name in file_list:
    with open(os.path.join(DATA_PATH, name), 'r', encoding='utf-8') as handle:
        texts.append(handle.read())

# One row per file: its name and its complete raw contents
data = {'File Name': list(file_list), 'Text': texts}
df = pd.DataFrame(data)


In [10]:
# Preview the first rows to check that file names and contents were loaded
df.head()

Unnamed: 0,File Name,Text
0,289.txt,UK economy facing 'major risks'\n\nThe UK manu...
1,504.txt,Aids and climate top Davos agenda\n\nClimate c...
2,262.txt,Asian quake hits European shares\n\nShares in ...
3,276.txt,India power shares jump on debut\n\nShares in ...
4,510.txt,Lacroix label bought by US firm\n\nLuxury good...


In [11]:
# Work on a copy so the raw `df` is left untouched and this cell can be re-run safely
test_df = df.copy()

# Remove the first line (title) from the 'Text' column: split once on the first
# newline and keep everything after it. `n` is passed by keyword because
# pandas >= 2.0 only accepts it as a keyword argument.
test_df['Text'] = test_df['Text'].str.split('\n', n=1).str[1]

# Remove the remaining newline characters (spaces and other whitespace are kept).
# NOTE(review): lines are joined with no separator, so the words on either side
# of a line break are fused together (e.g. "said.The") - confirm this is intended.
test_df['Text'] = test_df['Text'].str.replace(r'\n', '', regex=True)

print(test_df["Text"].iloc[0])

The UK manufacturing sector will continue to face "serious challenges" over the next two years, the British Chamber of Commerce (BCC) has said.The group's quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years. The rise came despite exchange rates being cited as a major concern. However, the BCC found the whole UK economy still faced "major risks" and warned that growth is set to slow. It recently forecast economic growth will slow from more than 3% in 2004 to a little below 2.5% in both 2005 and 2006.Manufacturers' domestic sales growth fell back slightly in the quarter, the survey of 5,196 firms found. Employment in manufacturing also fell and job expectations were at their lowest level for a year."Despite some positive news for the export sector, there are worrying signs for manufacturing," the BCC said. "These results reinforce our concern over the sector's persistent inability to sustain recovery." The outlook

# Create windows

In [12]:
# Take the first cleaned article as a small worked example and display it
ex = test_df["Text"].iloc[0]
ex

'The UK manufacturing sector will continue to face "serious challenges" over the next two years, the British Chamber of Commerce (BCC) has said.The group\'s quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years. The rise came despite exchange rates being cited as a major concern. However, the BCC found the whole UK economy still faced "major risks" and warned that growth is set to slow. It recently forecast economic growth will slow from more than 3% in 2004 to a little below 2.5% in both 2005 and 2006.Manufacturers\' domestic sales growth fell back slightly in the quarter, the survey of 5,196 firms found. Employment in manufacturing also fell and job expectations were at their lowest level for a year."Despite some positive news for the export sector, there are worrying signs for manufacturing," the BCC said. "These results reinforce our concern over the sector\'s persistent inability to sustain recovery." The out

In [13]:
# Number of context words taken on each side of the target word
context_size = 3

# Split on whitespace once up front rather than inside every comprehension iteration
words = ex.split()

# A target word needs `context_size` neighbours on both sides, so only
# len(words) - 2 * context_size positions are usable (none if the text is too short).
# Windows and labels share this count so they stay aligned index-for-index.
n_samples = max(len(words) - 2 * context_size, 0)

# Each window is the `context_size` words before the target followed by the
# `context_size` words after it; the target itself is left out.
windows = [
    words[i: i + context_size] + words[i + context_size + 1: i + 2 * context_size + 1]
    for i in range(n_samples)
]
# The label is the word that sits in the middle of the corresponding window
labels = [words[i + context_size] for i in range(n_samples)]

In [14]:
# Show every context window next to the word it is paired with
for context_words, target_word in zip(windows, labels):
    print(f"When the INPUT is: {context_words}, the OUTPUT is: {target_word}")
    print("---------------------\n")

When the INPUT is: ['The', 'UK', 'manufacturing', 'will', 'continue', 'to'], the OUTPUT is: sector
---------------------

When the INPUT is: ['UK', 'manufacturing', 'sector', 'continue', 'to', 'face'], the OUTPUT is: will
---------------------

When the INPUT is: ['manufacturing', 'sector', 'will', 'to', 'face', '"serious'], the OUTPUT is: continue
---------------------

When the INPUT is: ['sector', 'will', 'continue', 'face', '"serious', 'challenges"'], the OUTPUT is: to
---------------------

When the INPUT is: ['will', 'continue', 'to', '"serious', 'challenges"', 'over'], the OUTPUT is: face
---------------------

When the INPUT is: ['continue', 'to', 'face', 'challenges"', 'over', 'the'], the OUTPUT is: "serious
---------------------

When the INPUT is: ['to', 'face', '"serious', 'over', 'the', 'next'], the OUTPUT is: challenges"
---------------------

When the INPUT is: ['face', '"serious', 'challenges"', 'the', 'next', 'two'], the OUTPUT is: over
---------------------

When the 

In [24]:
def get_data(data_path):
    """Load every ``.txt`` file in ``data_path`` and return the cleaned article bodies.

    For each file the first line (the title) is dropped and every remaining
    newline character is removed.

    Args:
        data_path: Directory to scan (non-recursively) for files whose name
            ends with ``.txt``.

    Returns:
        pandas.Series named ``"Text"`` with one entry per file, in the order
        returned by ``os.listdir``. A file that contains no newline at all
        yields a missing value, because nothing follows its first line.
    """
    # Use the argument rather than a module-level constant so the function
    # works for any directory the caller passes in.
    file_list = [f for f in os.listdir(data_path) if f.endswith('.txt')]
    data = {'File Name': [], 'Text': []}

    for file_name in file_list:
        file_path = os.path.join(data_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        data['File Name'].append(file_name)
        data['Text'].append(content)

    df = pd.DataFrame(data)

    # Remove the first line (title): split once on the first newline and keep
    # the remainder. `n` must be a keyword argument on pandas >= 2.0.
    df['Text'] = df['Text'].str.split('\n', n=1).str[1]

    # Remove the remaining newline characters (other whitespace is kept).
    # NOTE(review): lines are joined with no separator, so words on either
    # side of a line break are fused - confirm this is intended.
    df['Text'] = df['Text'].str.replace(r'\n', '', regex=True)

    return df["Text"]
    

In [18]:
# Load the pretrained tokenizer for the "bert-base-cased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Creating a Text Dataset

In [59]:
class TextDataset(Dataset):
    """Map-style dataset pairing a context window with its target word.

    Args:
        windows: Sequence of context windows. Each window is either a list of
            words or an already-joined string.
        labels: Sequence of target words; entry ``i`` belongs to ``windows[i]``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=False,
            padding=False, add_special_tokens=False, return_tensors="pt")``;
            its result must expose an ``"input_ids"`` entry.
    """

    def __init__(self, windows, labels, tokenizer):
        self.windows = windows
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples, defined by the number of windows."""
        return len(self.windows)

    def __getitem__(self, index):
        """Tokenize and return the window/label pair stored at ``index``.

        Returns:
            dict with keys ``"window"`` and ``"label"``, each holding the
            ``"input_ids"`` produced by the tokenizer for that text.
        """
        window = self.windows[index]
        label = self.labels[index]

        # A list of words handed to the tokenizer is interpreted as a batch of
        # separate sequences; with padding disabled, words that split into
        # different numbers of sub-tokens cannot be packed into one tensor.
        # Joining the words makes the window a single sequence instead.
        if not isinstance(window, str):
            window = " ".join(window)

        # Tokenize the text.
        # NOTE(review): nothing is padded, so items may differ in length; the
        # default DataLoader collation cannot stack such items - pad to a fixed
        # length or supply a custom collate_fn before batching.
        tokenized_window = self.tokenizer(window, truncation=False, padding=False, add_special_tokens=False, return_tensors="pt")["input_ids"]
        tokenized_label = self.tokenizer(label, truncation=False, padding=False, add_special_tokens=False, return_tensors="pt")["input_ids"]

        return {
            "window": tokenized_window,
            "label": tokenized_label
        }

In [None]:
# Wrap the training and validation splits in TextDataset objects.
# NOTE(review): X_train, y_train, X_val and y_val are not defined in the cells
# visible here - confirm the cell that creates the split exists and runs first.
train_ds = TextDataset(X_train, y_train, tokenizer)
val_ds = TextDataset(X_val, y_val, tokenizer)

In [45]:
# Create data loaders
# Training samples are shuffled; validation samples keep their original order.
# Both loaders use the BATCH_SIZE constant for their batch size.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

tensor([[ 101, 1109, 1993, 5863, 4291, 1209, 2760, 1106, 1339,  102]])