# Imports

In [3]:
import torch
import torch.nn as nn 
import pandas as pd
from torch.nn import functional as F
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import os

# Hyperparameters

In [4]:
BATCH_SIZE = 32
CONTEXT_SIZE = 3
DATA_PATH = '../business'

# Load data

In [5]:
file_list = [f for f in os.listdir(DATA_PATH) if f.endswith('.txt')]
data = {'File Name': [], 'Text': []}

for file_name in file_list:
    file_path = os.path.join(DATA_PATH, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        data['File Name'].append(file_name)
        data['Text'].append(content)

df = pd.DataFrame(data)
df.head()

Unnamed: 0,File Name,Text
0,001.txt,Ad sales boost Time Warner profit\n\nQuarterly...
1,002.txt,Dollar gains on Greenspan speech\n\nThe dollar...
2,003.txt,Yukos unit buyer faces loan claim\n\nThe owner...
3,004.txt,High fuel prices hit BA's profits\n\nBritish A...
4,005.txt,Pernod takeover talk lifts Domecq\n\nShares in...


In [6]:
# Remove the first line (title) from the 'Text' column
test_df = df.copy()
test_df['Text'] = test_df['Text'].str.split('\n', 1).str[1]

# Remove all white spaces from the 'Text' column
test_df['Text'] = test_df['Text'].str.replace(r'\n', '', regex=True)

  test_df['Text'] = test_df['Text'].str.split('\n', 1).str[1]


# Create windows

In [7]:
def get_data(data_path):
    file_list = [f for f in os.listdir(DATA_PATH) if f.endswith('.txt')]
    data = {'File Name': [], 'Text': []}

    for file_name in file_list:
        file_path = os.path.join(DATA_PATH, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            data['File Name'].append(file_name)
            data['Text'].append(content)

    df = pd.DataFrame(data)

    # Remove the first line (title) from the 'Text' column
    df['Text'] = df['Text'].str.split('\n', 1).str[1]
    
    # Remove all white spaces from the 'Text' column
    df['Text'] = df['Text'].str.replace(r'\n', '', regex=True)

    return df["Text"]
    

In [15]:
def create_windows_dataframe(data, context_size):
    all_windows_str = []  # Modified to store windows as strings
    all_labels = []

    for index, row in data.iterrows():
        input_sequence = row["Text"].split()

        num_windows = len(input_sequence) - 2 * context_size

        for i in range(num_windows):
            window = input_sequence[i: i + context_size] + input_sequence[i + context_size + 1: i + 2 * context_size + 1]
            window_str = " ".join(window)  # Convert the window list to a string
            label = input_sequence[i + context_size]
            all_windows_str.append(window_str)  # Append the window string
            all_labels.append(label)

    # Create a pandas DataFrame from the lists
    windows_df = pd.DataFrame({
        'windows': all_windows_str,  # Use the modified list containing window strings
        'labels': all_labels
    })

    return windows_df

# Assuming you have a DataFrame named 'test_df' with a column named 'Text'
windows_dataframe = create_windows_dataframe(test_df, CONTEXT_SIZE)

windows_dataframe.head()


Unnamed: 0,windows,labels
0,Quarterly profits at media giant TimeWarner,US
1,profits at US giant TimeWarner jumped,media
2,at US media TimeWarner jumped 76%,giant
3,US media giant jumped 76% to,TimeWarner
4,media giant TimeWarner 76% to $1.13bn,jumped


In [9]:
# Split the data into train and val
X, y = windows_dataframe["windows"], windows_dataframe["labels"]
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)

X_train = X_train.to_list()
X_val = X_val.to_list()
y_train = y_train.to_list()
y_val = y_val.to_list()

# Creating a Text Dataset

In [10]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

class TextDataset(Dataset):
    def __init__(self, windows, labels, tokenizer):
        self.windows = windows
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.windows)
    
    def __getitem__(self, index):
        window = self.windows[index]
        label = self.labels[index]

        # Tokenize the text
        tokenized_window = self.tokenizer(window, truncation=False, padding=False, add_special_tokens=False, return_tensors="pt")["input_ids"]
        tokenized_label = self.tokenizer(label, truncation=False, padding=False, add_special_tokens=False, return_tensors="pt")["input_ids"]

        return {
            "window": tokenized_window,
            "label": tokenized_label
        }

In [11]:
train_ds = TextDataset(X_train, y_train, tokenizer)
val_ds = TextDataset(X_val, y_val, tokenizer)

In [12]:
# Create data loaders
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
next(iter(train_loader))

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).