<a href="https://colab.research.google.com/github/SanskarGithub07/FDS-Implementation/blob/main/fds_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Collection and Cleaning
- Collect the data from Kaggle; the Penn Treebank (PTB) dataset is used for our implementation of the model.
- Clean the data for better representation and model accuracy.

In [1]:
import pandas as pd
import numpy
import collections

In [2]:
def read_words(filename):
    """Read a text file and return its whitespace-separated tokens.

    Each newline is turned into a standalone ``<eos>`` token so that line
    boundaries survive tokenization. The file is decoded as UTF-8 and
    undecodable bytes are replaced rather than raising.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: Tokens in file order, with ``<eos>`` marking each line end.
    """
    with open(filename, "r", encoding="utf-8", errors="replace") as f:
        # Pad the marker with spaces: a line that does not end in whitespace
        # would otherwise fuse with it ("word<eos>next") and produce a bogus
        # token instead of a separate <eos>. split() collapses the extra
        # spaces, so lines that are already padded tokenize the same way.
        return f.read().replace("\n", " <eos> ").split()

In [3]:
def build_vocab(filename):
    """Build a word -> integer id vocabulary from a text file.

    Tokens come from ``read_words``. Ids are assigned in order of decreasing
    frequency, with ties broken alphabetically so the mapping is
    deterministic; the most frequent token gets id 0.

    Args:
        filename: Path of the text file to build the vocabulary from.

    Returns:
        dict[str, int]: Mapping from each distinct token to its id. Empty
        when the file contains no tokens.
    """
    counter = collections.Counter(read_words(filename))
    # Sort by (-count, word): most frequent first, alphabetical among ties.
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    # Enumerate the sorted pairs directly instead of unpacking
    # zip(*count_pairs), which raises ValueError when there are no tokens.
    return {word: i for i, (word, _) in enumerate(count_pairs)}

In [4]:
def file_to_word_ids(filename, word_to_id):
    """Encode the tokens of a text file as vocabulary ids.

    Tokens are read with ``read_words``. A token that is not a key of
    ``word_to_id`` is skipped, so the result can be shorter than the token
    stream.

    Args:
        filename: Path of the text file to encode.
        word_to_id: Mapping from token to id.

    Returns:
        list: Ids of the known tokens, in file order.
    """
    encoded = []
    for token in read_words(filename):
        # Out-of-vocabulary tokens are dropped rather than raising KeyError.
        if token in word_to_id:
            encoded.append(word_to_id[token])
    return encoded

In [5]:
# File names of the three Penn Treebank splits (relative paths).
train_path, valid_path, test_path = (
    "ptb.train.txt",
    "ptb.valid.txt",
    "ptb.test.txt",
)
# Echo the configured paths as the cell output.
(train_path, valid_path, test_path)

('ptb.train.txt', 'ptb.valid.txt', 'ptb.test.txt')

In [6]:
def load_ptb_dataset(train_file=None, valid_file=None, test_file=None):
    """Load the train/validation/test splits as lists of word ids.

    The vocabulary is built from the training file only and then used to
    encode all three splits.

    Args:
        train_file: Path of the training split. Defaults to the notebook-level
            ``train_path`` when omitted.
        valid_file: Path of the validation split. Defaults to ``valid_path``.
        test_file: Path of the test split. Defaults to ``test_path``.

    Returns:
        tuple: ``(train_data, valid_data, test_data, vocab_size, word_to_id)``
        where the three ``*_data`` entries are lists of ids, ``vocab_size`` is
        the number of vocabulary entries and ``word_to_id`` is the vocabulary
        mapping itself.
    """
    # Resolve defaults at call time so a no-argument call keeps reading the
    # notebook's current path variables, exactly as before.
    train_file = train_path if train_file is None else train_file
    valid_file = valid_path if valid_file is None else valid_file
    test_file = test_path if test_file is None else test_file

    word_to_id = build_vocab(train_file)

    train_data = file_to_word_ids(train_file, word_to_id)
    valid_data = file_to_word_ids(valid_file, word_to_id)
    test_data = file_to_word_ids(test_file, word_to_id)
    vocab_size = len(word_to_id)

    return train_data, valid_data, test_data, vocab_size, word_to_id

train_data, valid_data, test_data, vocab_size, word_to_id = load_ptb_dataset()

In [7]:
# One-line summary: vocabulary size and the length of each encoded split.
print(" ".join([
    f"Vocabulary size: {vocab_size}",
    f"Train data size: {len(train_data)}",
    f"Valid data size: {len(valid_data)}",
    f"Test data size: {len(test_data)}",
]))
# Peek at the first five (word, id) entries of the vocabulary.
print(f"\n\nSample word to id mapping: {list(word_to_id.items())[:5]}")

Vocabulary size: 10000 Train data size: 929589 Valid data size: 73760 Test data size: 82430


Sample word to id mapping: [('the', 0), ('<unk>', 1), ('<eos>', 2), ('N', 3), ('of', 4)]


In [8]:
def id_to_word(id_list, word_to_id=word_to_id):
    """Decode a sequence of ids back into their words.

    The reverse lookup table is rebuilt from ``word_to_id`` on every call.

    Args:
        id_list: Iterable of ids to decode.
        word_to_id: Mapping from word to id; defaults to the vocabulary that
            was bound to the global ``word_to_id`` when this function was
            defined.

    Returns:
        list: The word for each id, in input order.

    Raises:
        KeyError: If an id has no entry in the vocabulary.
    """
    # Invert the vocabulary: id -> word.
    reverse_vocab = dict((idx, word) for word, idx in word_to_id.items())
    decoded = []
    for token_id in id_list:
        decoded.append(reverse_vocab[token_id])
    return decoded

# Decode the first 100 training ids as a sanity check.
print(id_to_word(train_data[:100]))

['aer', 'banknote', 'berlitz', 'calloway', 'centrust', 'cluett', 'fromstein', 'gitano', 'guterman', 'hydro-quebec', 'ipo', 'kia', 'memotec', 'mlx', 'nahb', 'punts', 'rake', 'regatta', 'rubens', 'sim', 'snack-food', 'ssangyong', 'swapo', 'wachter', '<eos>', 'pierre', '<unk>', 'N', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', 'N', '<eos>', 'mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', 'the', 'dutch', 'publishing', 'group', '<eos>', 'rudolph', '<unk>', 'N', 'years', 'old', 'and', 'former', 'chairman', 'of', 'consolidated', 'gold', 'fields', 'plc', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'british', 'industrial', 'conglomerate', '<eos>', 'a', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of']


In [9]:
# Raw ids of the first 100 training tokens, for comparison with the decoded words.
print(train_data[:100])

[9970, 9971, 9972, 9974, 9975, 9976, 9980, 9981, 9982, 9983, 9984, 9986, 9987, 9988, 9989, 9991, 9992, 9993, 9994, 9995, 9996, 9997, 9998, 9999, 2, 9256, 1, 3, 72, 393, 33, 2133, 0, 146, 19, 6, 9207, 276, 407, 3, 2, 23, 1, 13, 141, 4, 1, 5465, 0, 3081, 1596, 96, 2, 7682, 1, 3, 72, 393, 8, 337, 141, 4, 2477, 657, 2170, 955, 24, 521, 6, 9207, 276, 4, 39, 303, 438, 3684, 2, 6, 942, 4, 3150, 496, 263, 5, 138, 6092, 4241, 6036, 30, 988, 6, 241, 760, 4, 1015, 2786, 211, 6, 96, 4]
