In [150]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [151]:
df = pd.read_csv("/home/darshan39/Downloads/100_Unique_QA_Dataset.csv")
# normalize column names and fill missing question/answer cells
df.columns = df.columns.str.strip()
if 'Question' in df.columns and 'Answer' in df.columns:
    df[['Question', 'Answer']] = df[['Question', 'Answer']].fillna('')
else:
    # show columns for debugging if expected columns not present
    print('Columns in dataframe:', df.columns.tolist())

Columns in dataframe: ['question', 'answer']


In [152]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [153]:
# Robust tokenizer that tolerates non-string inputs
import re
def tokenize(text):
    # ensure we work with a string and remove common punctuation
    if text is None:
        text = ''
    text = str(text)
    # replace question marks/apostrophes with space and remove other punctuation
    text = text.replace('?', ' ').replace("'", ' ')
    text = re.sub(r'[^0-9a-zA-Z\s]', ' ', text)
    return text.lower().strip().split()

In [154]:
tokenize("What's your name?")

['what', 's', 'your', 'name']

In [155]:
#vocalulary
vocab = {}
vocab = {'<UNK>': 0}

In [156]:
# Build vocabulary from dataframe rows (robust to missing columns/values)
def build_vocab_from_row(row):
    # Row can be a Series; use get with default empty string
    q_text = row.get('Question', '') if hasattr(row, 'get') else ''
    a_text = row.get('Answer', '') if hasattr(row, 'get') else ''
    q_tokens = tokenize(q_text)
    a_tokens = tokenize(a_text)
    for token in q_tokens + a_tokens:
        if token and token not in vocab:
            vocab[token] = len(vocab)

In [None]:
df.apply

In [162]:
# Build vocab by iterating over dataframe rows
for _, row in df.iterrows():
    build_vocab_from_row(row)

print('Vocab size:', len(vocab))

Vocab size: 1


In [158]:
def text_to_indices(text, vocab):
    indexed_text = []
    for token in tokenize(text):
        indexed_text.append(vocab.get(token, vocab.get('<UNK>', 0)))
    return indexed_text

In [161]:
text_to_indices("What is your name?", vocab)

[0, 0, 0, 0]

In [None]:
# Inspect dataframe structure to find why vocab remained empty
print('Columns:', df.columns.tolist())
print('\nDf preview:')
print(df.head(5).to_string(index=False))
if 'Question' in df.columns and 'Answer' in df.columns:
    print('\nSample Question types:', df['Question'].apply(lambda x: type(x)).value_counts().to_dict())
    print('Sample Answer types:', df['Answer'].apply(lambda x: type(x)).value_counts().to_dict())
    print('\nFirst Question raw repr:')
    print(repr(df['Question'].iloc[0]))
    print('\nFirst Answer raw repr:')
    print(repr(df['Answer'].iloc[0]))
else:
    print('\nExpected columns not found; printed columns above')