In [1]:
import pandas as pd
import re

In [2]:
# Load the tab-separated corpus into a DataFrame.
df = pd.read_csv("dataset/premchand.tsv", sep="\t")

# Print the column / non-null / dtype summary of the loaded frame.
# Note: a notebook only auto-displays the LAST expression of a cell, so a row
# preview such as `df.head()` has to be the final line of its own cell (or be
# wrapped in `display(...)`); placed before `df.info()` its result is discarded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Author Name    167 non-null    object
 1   Type of Work   167 non-null    object
 2   Title of Work  167 non-null    object
 3   Text           167 non-null    object
dtypes: object(4)
memory usage: 5.3+ KB


In [3]:
# Use the "Text" column when the frame has one; otherwise fall back to the
# last column of the frame.
if "Text" in df.columns:
    text_col = "Text"
else:
    text_col = df.columns[-1]

# Drop missing entries, coerce what is left to str, and materialise a list.
raw_texts = (
    df[text_col]
    .dropna()
    .astype(str)
    .tolist()
)


In [4]:
def clean_text(t):
    """Return ``t`` with digits and decorative marks blanked out and whitespace normalised.

    Each pattern below is replaced by a single space, in order:

    1. runs of digits delimited by word boundaries (standalone numbers);
    2. any run of the Devanagari digits ०-९, including ones embedded in a word;
    3. the symbols ``»`` and the curly quotes ``“ ” ‘ ’``;
    4. any run of whitespace (spaces, tabs, newlines), which collapses it.

    Finally, leading and trailing whitespace is stripped.

    Args:
        t: The text to clean (a ``str``).

    Returns:
        The cleaned, single-line string.
    """
    # Order matters: whitespace is collapsed last so that the blanks introduced
    # by the earlier substitutions are merged as well.
    patterns = (
        r"\b\d+\b",
        r"[०१२३४५६७८९]+",
        r"[»“”‘’]",
        r"\s+",
    )
    cleaned = t
    for pattern in patterns:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

In [5]:
# Upper bound on the corpus size, in characters
# (~1.2M, a bit larger than Tiny Shakespeare).
MAX_CHARS = 1_200_000

# Clean every document, then join them with a blank line between documents.
cleaned_texts = list(map(clean_text, raw_texts))
final_text = "\n\n".join(cleaned_texts)

# Keep at most MAX_CHARS characters; slicing leaves a shorter text unchanged.
final_text = final_text[:MAX_CHARS]


In [6]:
# Persist the assembled corpus as UTF-8 text; the context manager closes the
# file handle even if the write fails.
with open("dataset/input.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.write(final_text)

# Report the size of the text that was written, in characters.
print(f"Total characters in cleaned text: {len(final_text)}")

Total characters in cleaned text: 1200000
