# Data Extraction and Processing

## Option 1 - Already Processed Data (through LinkedIn article)

In [24]:
import os
import requests


# Folder that receives the downloaded corpus.
enhanced_path = 'data/enhanced'
os.makedirs(enhanced_path, exist_ok=True)

file_name = "shakespeare.txt"
# Look for the cached copy at the path it is actually written to. Testing the
# bare file name checked the working directory instead, so the corpus was
# downloaded again on every run.
corpus_path = os.path.join(enhanced_path, file_name)
if not os.path.isfile(corpus_path):
    url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as the corpus.
    response.raise_for_status()

    # Explicit encoding so the saved file does not depend on the platform default.
    with open(corpus_path, 'w', encoding='utf-8') as f:
        f.write(response.text)

In [27]:
# Prefer the copy inside `enhanced_path` (where the download cell saves it) and
# fall back to the older 'data/' location if only that one exists.
corpus_path = os.path.join(enhanced_path, file_name)
if not os.path.isfile(corpus_path):
    corpus_path = os.path.join('data', file_name)

with open(corpus_path, 'r', encoding='utf-8') as f:
    texts = f.readlines()

test_portion = 0.1
idx = int(len(texts) * (1 - test_portion))
# The test data should start from a valid block (not in the middle of a
# conversation), so advance the split point to the next blank line. The bounds
# check avoids an IndexError when no blank line follows the initial index.
while idx < len(texts) and texts[idx] != '\n':
    idx += 1

test_texts = texts[idx:]
train_texts = texts[:idx]
print(len(train_texts), len(test_texts))

# Write both splits with an explicit encoding, matching how the corpus is read.
with open(os.path.join(enhanced_path, 'test.txt'), 'w', encoding='utf-8') as f:
    f.write(''.join(test_texts))

with open(os.path.join(enhanced_path, 'train.txt'), 'w', encoding='utf-8') as f:
    f.write(''.join(train_texts))

36009 3991


## Option 2 - Manually download the raw Kaggle dataset and process it

In [1]:
import pandas as pd

# Load the raw CSV into a DataFrame and print its column / dtype / non-null summary.
raw_csv_path = '../data/raw/Shakespeare_data.csv'
data = pd.read_csv(raw_csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111396 entries, 0 to 111395
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Dataline          111396 non-null  int64  
 1   Play              111396 non-null  object 
 2   PlayerLinenumber  111393 non-null  float64
 3   ActSceneLine      105153 non-null  object 
 4   Player            111389 non-null  object 
 5   PlayerLine        111396 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 5.1+ MB


In [2]:
# Print the frame's dimensions, then display the last three of its first five rows.
print(data.shape)
first_rows = data.head(5)
first_rows.tail(3)

(111396, 6)


Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [13]:
# Hold out the trailing share of rows as the test set; row order is kept (no shuffle).
test_portion = 0.1
train_fraction = 1 - test_portion
train_acts = int(len(data) * train_fraction)  # number of leading rows kept for training
train, test = data.iloc[:train_acts], data.iloc[train_acts:]
print(train.shape, test.shape)

(100256, 6) (11140, 6)


In [16]:
# Save both splits as CSV files, leaving the DataFrame index out of the output.
for split_frame, split_path in ((test, '../data/test.csv'), (train, '../data/train.csv')):
    split_frame.to_csv(split_path, index=False)

## Option 3 - Process the raw txt (with tokenizer splits)

In [5]:
import os
# Move the working directory up one level; the cells below use paths such as
# 'data/raw' relative to it.
# NOTE: this is not idempotent - every re-run of the cell moves up one more level.
os.chdir(os.pardir)

In [9]:
from transformers import GPT2Tokenizer
from sklearn.model_selection import train_test_split

# Collect the contents of every .txt file in the raw data folder.
text_data = []
data_dir = 'data/raw'
# os.listdir returns names in arbitrary order. Sorting keeps the concatenated
# corpus - and therefore the train/test boundary - identical across runs and
# machines.
for filename in sorted(os.listdir(data_dir)):
    if filename.endswith('.txt'):
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            text_data.append(text)

# Join the files into one corpus and split it into GPT-2 tokens.
corpus_text = '\n'.join(text_data)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenized_text = tokenizer.tokenize(corpus_text)
# shuffle=False keeps token order: the leading tokens become the training set
# and the trailing ones the test set. random_state has no effect without
# shuffling; it is kept only so the call matches the original.
train_text, test_text = train_test_split(tokenized_text, test_size=0.1, random_state=42, shuffle=False)


output_dir = 'data/processed'
os.makedirs(output_dir, exist_ok=True)

# Turn each token list back into plain text and write it out.
with open(os.path.join(output_dir, 'train_data.txt'), 'w', encoding='utf-8') as file:
    file.write(tokenizer.convert_tokens_to_string(train_text))

with open(os.path.join(output_dir, 'test_data.txt'), 'w', encoding='utf-8') as file:
    file.write(tokenizer.convert_tokens_to_string(test_text))

In [31]:
# Group every line of the raw .txt files under the speaker header that precedes it.
data_dir = 'data/raw'
character_dialogues = {}
current_character = None

# Sorted so the order in which speakers are first seen (and hence the output
# order) does not depend on the arbitrary ordering of os.listdir.
for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith('.txt'):
        continue
    # Start each file without a speaker; otherwise its leading lines would be
    # attributed to the last speaker of the previously read file.
    current_character = None
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # Check if the line contains the character's name (e.g., "First Citizen:")
            # NOTE(review): any line ending with ":" is treated as a header, including
            # a spoken line that happens to end with a colon - confirm this is acceptable.
            if stripped.endswith(":"):
                current_character = stripped
                character_dialogues.setdefault(current_character, [])
            elif current_character:
                # Blank lines are stored too (as empty strings), as before.
                character_dialogues[current_character].append(stripped)

# Flatten to: header line, then all of that speaker's lines, for each speaker in turn.
formatted_dialogues = []
for character, dialogues in character_dialogues.items():
    formatted_dialogues.append(character)
    formatted_dialogues.extend(dialogues)

# `output_dir` is defined by the tokenizer cell earlier in this section.
with open(os.path.join(output_dir, 'formatted_dialogues.txt'), 'w', encoding='utf-8') as file:
    file.write('\n'.join(formatted_dialogues))

**You will end up with three different folders inside `data` ("raw", "enhanced" and "processed"): "raw" holds the Kaggle data, "enhanced" holds the data derived from the `input.txt` file on GitHub, and "processed" holds the output of the last method.**