In [150]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [164]:
df = pd.read_csv("/home/darshan39/Downloads/100_Unique_QA_Dataset.csv")
# normalize column names and fill missing question/answer cells
df.columns = df.columns.str.strip()
if 'Question' in df.columns and 'Answer' in df.columns:
    # coerce to string and strip whitespace so tokenizer always receives strings
    df['Question'] = df['Question'].fillna('').astype(str).str.strip()
    df['Answer'] = df['Answer'].fillna('').astype(str).str.strip()
else:
    # show columns for debugging if expected columns not present
    print('Columns in dataframe:', df.columns.tolist())

Columns in dataframe: ['question', 'answer']


In [165]:
df.head()

Unnamed: 0,question,answer
0,What is the capital of France?,Paris
1,What is the capital of Germany?,Berlin
2,Who wrote 'To Kill a Mockingbird'?,Harper-Lee
3,What is the largest planet in our solar system?,Jupiter
4,What is the boiling point of water in Celsius?,100


In [166]:
# Robust tokenizer that tolerates non-string inputs
import re
def tokenize(text):
    # ensure we work with a string and remove common punctuation
    if text is None:
        text = ''
    text = str(text)
    # replace question marks/apostrophes with space and remove other punctuation
    text = text.replace('?', ' ').replace("'", ' ')
    text = re.sub(r'[^0-9a-zA-Z\s]', ' ', text)
    return text.lower().strip().split()

In [167]:
tokenize("What's your name?")

['what', 's', 'your', 'name']

In [168]:
# vocabulary initialization (ensure single source of truth)
vocab = {'<UNK>': 0}

In [169]:
# Build vocabulary from dataframe rows (robust to missing columns/values)
def build_vocab_from_row(row):
    # Row can be a Series; extract safely and coerce to string
    q_val = row.get('Question', '') if hasattr(row, 'get') else ''
    a_val = row.get('Answer', '') if hasattr(row, 'get') else ''
    q_text = '' if pd.isna(q_val) else str(q_val).strip()
    a_text = '' if pd.isna(a_val) else str(a_val).strip()
    q_tokens = tokenize(q_text)
    a_tokens = tokenize(a_text)
    for token in q_tokens + a_tokens:
        if token and token not in vocab:
            vocab[token] = len(vocab)

In [170]:
# (previous stray df.apply removed) - keep as a placeholder for future checks
print('df.apply placeholder — previous problematic call removed')

df.apply placeholder — previous problematic call removed


In [171]:
# Build vocab by iterating over dataframe rows
for _, row in df.iterrows():
    build_vocab_from_row(row)

print('Vocab size:', len(vocab))

Vocab size: 1


In [172]:
def text_to_indices(text, vocab):
    indexed_text = []
    for token in tokenize(text):
        indexed_text.append(vocab.get(token, vocab.get('<UNK>', 0)))
    return indexed_text

In [173]:
text_to_indices("What is your name?", vocab)

[0, 0, 0, 0]

In [None]:
# Quick inspection of dataframe to debug tokenization/vocab
print('Columns:', df.columns.tolist())
print('\nData sample:')
print(df.head(5).to_string(index=False))
if 'Question' in df.columns and 'Answer' in df.columns:
    print('\nTypes in Question column:', df['Question'].apply(lambda x: type(x)).value_counts().to_dict())
    print('Types in Answer column:', df['Answer'].apply(lambda x: type(x)).value_counts().to_dict())
    print('\nFirst question repr:', repr(df['Question'].iloc[0]))
    print('First answer repr:', repr(df['Answer'].iloc[0]))