# Imports

In [1]:
import torch
import torch.nn as nn 
import pandas as pd
from torch.nn import functional as F
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader

import os

# Load data

In [2]:
# Relative path of the directory that holds the raw article .txt files.
directory_path = '../business'
# os.listdir() returns entries in arbitrary order; sort so that row order
# (and everything later selected by position) is reproducible across runs.
file_list = sorted(f for f in os.listdir(directory_path) if f.endswith('.txt'))

In [3]:
# Read every listed file once, collecting names and contents in parallel lists.
file_names, texts = [], []

for file_name in file_list:
    file_path = os.path.join(directory_path, file_name)
    with open(file_path, encoding='utf-8') as handle:
        texts.append(handle.read())
    # Record the name only after the read succeeded so both lists stay aligned.
    file_names.append(file_name)

# One row per file: its name and its full raw text (title line included).
data = {'File Name': file_names, 'Text': texts}
df = pd.DataFrame(data)


In [4]:
# Preview the first rows of the loaded frame (file name + raw text).
df.head()

Unnamed: 0,File Name,Text
0,001.txt,Ad sales boost Time Warner profit\n\nQuarterly...
1,002.txt,Dollar gains on Greenspan speech\n\nThe dollar...
2,003.txt,Yukos unit buyer faces loan claim\n\nThe owner...
3,004.txt,High fuel prices hit BA's profits\n\nBritish A...
4,005.txt,Pernod takeover talk lifts Domecq\n\nShares in...


In [5]:
# Work on a copy so the raw frame `df` is left untouched.
test_df = df.copy()

# Remove the first line (title) from the 'Text' column: split once on the
# first newline and keep what follows. `n` is passed by keyword because
# passing it positionally triggers a FutureWarning on older pandas and a
# TypeError on pandas 2.x.
test_df['Text'] = test_df['Text'].str.split('\n', n=1).str[1]


# Remove the remaining newline characters from the 'Text' column. Other
# whitespace is kept; lines are joined without any separator.
test_df['Text'] = test_df['Text'].str.replace(r'\n', '', regex=True)

print(test_df["Text"].iloc[0])

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed br

  test_df['Text'] = test_df['Text'].str.split('\n', 1).str[1]


# Create windows

In [6]:
# Use the first cleaned article body as a worked example for the windowing below.
ex = test_df["Text"].iloc[0]
ex

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL\'s existing customers for high-speed

In [7]:
# Number of context words taken on each side of the target word.
context_size = 3

# Split once instead of re-splitting the whole text for every window.
words = ex.split()

# A target needs `context_size` words on both sides, so only
# len(words) - 2 * context_size positions qualify. Windows and labels are built
# over the same range so that windows[i] always pairs with labels[i].
num_windows = len(words) - 2 * context_size
windows = [words[i: i + context_size] + words[i + context_size + 1: i + 2 * context_size + 1] for i in range(num_windows)]
labels = [words[i + context_size] for i in range(num_windows)]

In [9]:
def get_data(data_path):
    """Load every ``.txt`` file under ``data_path`` and return the cleaned bodies.

    Args:
        data_path: Directory containing the ``.txt`` files to read.

    Returns:
        pandas.Series named ``"Text"`` with one entry per file, in sorted
        file-name order. Each entry is the file content with its first line
        (the title) dropped and all newline characters removed. A file that
        contains no newline has no body after the title and yields a missing
        value.
    """
    # Read from the directory passed in, not the notebook-level
    # `directory_path`. Sorting makes the row order deterministic, because
    # os.listdir() returns entries in arbitrary order.
    file_list = sorted(f for f in os.listdir(data_path) if f.endswith('.txt'))
    data = {'File Name': [], 'Text': []}

    for file_name in file_list:
        file_path = os.path.join(data_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            data['File Name'].append(file_name)
            data['Text'].append(content)

    df = pd.DataFrame(data)

    # Remove the first line (title) from the 'Text' column. `n` must be passed
    # by keyword: positional use is deprecated and rejected by pandas 2.x.
    df['Text'] = df['Text'].str.split('\n', n=1).str[1]

    # Remove the remaining newline characters (other whitespace is kept).
    df['Text'] = df['Text'].str.replace(r'\n', '', regex=True)

    return df["Text"]
    

In [12]:
def create_windows_dataframe(data, context_size):
    """Build (context window, centre word) pairs for every text in ``data``.

    Each row's ``"Text"`` value is split on whitespace. For every word that has
    ``context_size`` words on both sides, the surrounding words (the centre
    word itself excluded) form one window and the centre word is its label.

    Args:
        data: DataFrame with a ``"Text"`` column of strings.
        context_size: Number of words taken on each side of the centre word.

    Returns:
        DataFrame with a ``windows`` column (lists of ``2 * context_size``
        words) and a ``labels`` column (the matching centre words). Texts
        shorter than ``2 * context_size + 1`` words contribute no rows.
    """
    span = 2 * context_size + 1
    pairs = []

    for _, row in data.iterrows():
        tokens = row["Text"].split()

        for start in range(len(tokens) - 2 * context_size):
            centre = start + context_size
            # Words before the centre plus words after it; the centre is skipped.
            context = tokens[start:centre] + tokens[centre + 1:start + span]
            pairs.append((context, tokens[centre]))

    # Unzip the collected pairs into the two output columns.
    return pd.DataFrame({
        'windows': [context for context, _ in pairs],
        'labels': [target for _, target in pairs],
    })

# Build the windows table from the 'Text' column of `test_df`, using
# 3 context words on each side of every target word, then preview it.
context_size = 3
windows_dataframe = create_windows_dataframe(test_df, context_size)

windows_dataframe.head()


Unnamed: 0,windows,labels
0,"[Quarterly, profits, at, media, giant, TimeWar...",US
1,"[profits, at, US, giant, TimeWarner, jumped]",media
2,"[at, US, media, TimeWarner, jumped, 76%]",giant
3,"[US, media, giant, jumped, 76%, to]",TimeWarner
4,"[media, giant, TimeWarner, 76%, to, $1.13bn]",jumped


In [18]:
# Load the pretrained tokenizer published as "bert-base-cased"; it is used
# below to turn article text into token ids.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Creating a Text Dataset

In [57]:
class TextDataset(Dataset):
    """Dataset yielding context windows and centre-token labels for one text.

    Each text is tokenized to ``seq_len`` token ids (truncated or padded by the
    tokenizer call). Every position that has ``context_size`` ids on both sides
    becomes one example: the surrounding ids form the window and the id at the
    centre is the label.

    Args:
        data: Indexable collection of texts (e.g. a list or a pandas Series).
        tokenizer: Callable invoked as ``tokenizer(text, max_length=seq_len,
            truncation=True, padding="max_length", return_tensors="pt")`` that
            returns a mapping whose ``"input_ids"`` entry is a 2-D tensor; its
            first row is used.
        seq_len: Length every text is truncated/padded to.
        context_size: Number of token ids taken on each side of the centre.
    """

    def __init__(self, data, tokenizer, seq_len, context_size):
        self.data = data
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.context_size = context_size

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"windows": (N, 2 * context_size), "labels": (N,)}`` for one text.

        ``N`` is ``max(len(ids) - 2 * context_size, 0)``; both tensors are empty
        when the tokenized text is too short to hold a single full window.
        """
        # Samplers hand out positions, so look up by position: plain [] on a
        # pandas Series is label-based and breaks on a non-default index.
        if hasattr(self.data, "iloc"):
            text = self.data.iloc[index]
        else:
            text = self.data[index]

        # Tokenize the text
        tokenized_data = self.tokenizer(text, max_length=self.seq_len, truncation=True, padding="max_length", return_tensors="pt")["input_ids"]

        # Create windows and labels. The tokenizer output carries a leading
        # batch dimension, so take its first row; unpacking the 2-D tensor
        # directly into two names raised ValueError.
        windows, labels = self._make_windows(tokenized_data[0], self.context_size)

        return {
            "windows": windows,
            "labels": labels
        }

    @staticmethod
    def _make_windows(token_ids, context_size):
        """Split a 1-D id tensor into stacked context windows and centre labels."""
        span = 2 * context_size + 1
        if token_ids.shape[0] < span:
            # Too short for even one window: return correctly shaped empties.
            return (
                token_ids.new_empty((0, 2 * context_size)),
                token_ids.new_empty((0,)),
            )

        # Every run of `span` consecutive ids, shape (N, span).
        spans = token_ids.unfold(0, span, 1)
        labels = spans[:, context_size].clone()
        # Drop the centre column to keep only the surrounding context.
        windows = torch.cat((spans[:, :context_size], spans[:, context_size + 1:]), dim=1)
        return windows, labels
        

In [35]:
# Load the cleaned article bodies and take the entry labelled 0 as a sample.
all_data = get_data(directory_path)
data = all_data[0]

# Encode the sample to exactly 10 token ids (truncated or padded) and keep
# only the ids from the tokenizer output.
encoded = tokenizer(
    data,
    max_length=10,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
tokenized_data = encoded["input_ids"]

In [45]:
# Show the token ids produced for the sample text.
tokenized_data

tensor([[ 101, 1109, 1993, 5863, 4291, 1209, 2760, 1106, 1339,  102]])

In [52]:
def create_windows(data, context_size):
    """Slide a context window over every row of a 2-D tensor of token ids.

    For each position that has ``context_size`` ids on both sides, the
    surrounding ids (centre excluded) are concatenated into one window and the
    centre id becomes its label.

    Args:
        data: Tensor indexed as ``data[row]`` with ``data.shape[0]`` rows.
        context_size: Number of ids taken on each side of the centre.

    Returns:
        Tuple ``(all_windows, all_labels)``: a list of 1-D tensors of length
        ``2 * context_size`` and a list of 0-d tensors, in matching order.
    """
    all_windows, all_labels = [], []

    for row in range(data.shape[0]):
        sequence = data[row]
        window_count = len(sequence) - 2 * context_size

        # A distinct index name keeps the row index from being shadowed.
        for start in range(window_count):
            centre = start + context_size
            left = sequence[start:centre]
            right = sequence[centre + 1:centre + context_size + 1]
            all_windows.append(torch.cat((left, right), dim=0))
            all_labels.append(sequence[centre])

    return all_windows, all_labels


# Window the sample token ids using the notebook-level `context_size`,
# then print both resulting lists.
all_windows, all_labels = create_windows(tokenized_data, context_size)
print("All Windows:", all_windows)
print("All Labels:", all_labels)

All Windows: [tensor([ 101, 1109, 1993, 4291, 1209, 2760]), tensor([1109, 1993, 5863, 1209, 2760, 1106]), tensor([1993, 5863, 4291, 2760, 1106, 1339]), tensor([5863, 4291, 1209, 1106, 1339,  102])]
All Labels: [tensor(5863), tensor(4291), tensor(1209), tensor(2760)]


In [53]:
# Display the list of context-window tensors.
all_windows

[tensor([ 101, 1109, 1993, 4291, 1209, 2760]),
 tensor([1109, 1993, 5863, 1209, 2760, 1106]),
 tensor([1993, 5863, 4291, 2760, 1106, 1339]),
 tensor([5863, 4291, 1209, 1106, 1339,  102])]

In [54]:
# Display the list of centre-token label tensors.
all_labels

[tensor(5863), tensor(4291), tensor(1209), tensor(2760)]