In [None]:
!pip install -q transformers datasets sentencepiece sacrebleu evaluate accelerate kaggle


from google.colab import drive
drive.mount('/content/drive')


# Create a working directory inside Drive so models/data persist
import os
WORKDIR = '/content/drive/MyDrive/mt_en_ur'
os.makedirs(WORKDIR, exist_ok=True)
print('Working dir:', WORKDIR)

In [None]:
import shutil
if os.path.exists('/content/kaggle.json'):
  os.makedirs('/root/.kaggle', exist_ok=True)
  shutil.copy('/content/kaggle.json','/root/.kaggle/kaggle.json')
  os.chmod('/root/.kaggle/kaggle.json', 0o600)


# Download the Kaggle dataset (Parallel Corpus for English-Urdu Language)
# dataset slug: zainuddin123/parallel-corpus-for-english-urdu-language


!kaggle datasets download -d zainuddin123/parallel-corpus-for-english-urdu-language -p {WORKDIR} --unzip


# After download, inspect files
!ls -la {WORKDIR}

In [None]:
import pandas as pd
import glob


def _rank_columns(columns, short, full):
  """Return the columns that look like they hold one language, best guess first.

  A column whose name equals `short` or `full` (case-insensitive) ranks first,
  then names containing `full`, then names merely containing `short`.
  Columns matching none of these are left out.
  """
  def score(col):
    name = str(col).strip().lower()
    if name in (short, full):
      return 0
    if full in name:
      return 1
    if short in name:
      return 2
    return None

  ranked = sorted((score(c), i) for i, c in enumerate(columns) if score(c) is not None)
  return [columns[i] for _, i in ranked]


# Locate the corpus files. Depending on how the archive is laid out they end up
# either directly in WORKDIR or one folder below it, so search both levels.
# Directories are skipped, as is the master CSV this notebook itself writes into
# WORKDIR in a later cell. Sorting makes the pick of csv_files[0] deterministic.
candidates = sorted(
    p
    for p in set(glob.glob(WORKDIR + '/*') + glob.glob(WORKDIR + '/*/*'))
    if os.path.isfile(p) and os.path.basename(p) != 'parallel_master.csv'
)
print('candidates:', candidates)


# Use the first extension (in order of preference) that has any files at all.
csv_files = []
for ext in ('.csv', '.tsv', '.txt'):
  csv_files = [p for p in candidates if p.lower().endswith(ext)]
  if csv_files:
    break


print('CSV files found:', csv_files)


if not csv_files:
  raise FileNotFoundError('No CSV/TSV/TXT found in dataset directory. Check files list above and update filename in cell.')


# The text files contain one sentence per line, with English and Urdu in separate files.
# We need to load both files and combine them into a single dataframe.
if len(csv_files) == 2 and all(f.lower().endswith('.txt') for f in csv_files):
  # Match on the file name only: a parent folder whose name contains both
  # 'english' and 'urdu' would otherwise make both lookups hit the same file.
  en_matches = [f for f in csv_files if 'english' in os.path.basename(f).lower()]
  ur_matches = [f for f in csv_files if 'urdu' in os.path.basename(f).lower()]
  if len(en_matches) != 1 or len(ur_matches) != 1 or en_matches == ur_matches:
    raise ValueError(f'Cannot tell which file is English and which is Urdu: {csv_files}')
  en_file, ur_file = en_matches[0], ur_matches[0]

  with open(en_file, 'r', encoding='utf-8') as f:
    en_lines = f.readlines()
  with open(ur_file, 'r', encoding='utf-8') as f:
    ur_lines = f.readlines()

  # Rows are paired by line number, so the two files must have the same length.
  if len(en_lines) != len(ur_lines):
    raise ValueError(f'Line count mismatch: {len(en_lines)} English vs {len(ur_lines)} Urdu lines')

  # Create dataframe from the two lists
  df = pd.DataFrame({'en': en_lines, 'ur': ur_lines})

else:
  # If not two text files, assume it's a single CSV/TSV with columns to identify
  data_path = csv_files[0]
  sep = '\t' if data_path.lower().endswith('.tsv') else ','
  # on_bad_lines='skip' (pandas >= 1.3) replaces error_bad_lines=False,
  # which was removed in pandas 2.0.
  df = pd.read_csv(data_path, encoding='utf-8', sep=sep, on_bad_lines='skip')

  if len(df.columns) < 2:
    raise ValueError(f'Expected at least two columns in {data_path}, found: {list(df.columns)}')

  # You need to identify which columns are English and Urdu; common names: 'english','urdu' or 'en','ur'
  # Attempt common guesses
  possible_src = _rank_columns(df.columns, 'en', 'english')
  possible_tgt = _rank_columns(df.columns, 'ur', 'urdu')
  print('possible_src', possible_src)
  print('possible_tgt', possible_tgt)


  src_col = possible_src[0] if possible_src else df.columns[0]
  # Never pick the same column for both sides (e.g. 'source' contains 'ur').
  tgt_options = [c for c in possible_tgt if c != src_col]
  tgt_col = tgt_options[0] if tgt_options else next(c for c in df.columns if c != src_col)
  print('Using columns:', src_col, tgt_col)


  # Keep only those two columns and drop NA rows
  df = df[[src_col, tgt_col]].dropna()
  df.columns = ['en','ur']


print(df.shape)
print(df.columns)
print(df.head())

In [None]:
import re
def clean_text_en(s):
  """Normalise one English sentence.

  The value is converted to a string, lower-cased, stripped of leading and
  trailing whitespace, and every run of whitespace (including newlines) is
  collapsed to a single space.

  Args:
    s: The raw cell value. Non-string values are coerced with str();
      None and float NaN are treated as missing.

  Returns:
    The cleaned string, or '' for a missing value.
  """
  # Missing values become '' instead of the literal text 'None'/'nan'.
  if s is None or (isinstance(s, float) and s != s):
    return ''
  # Coerce before calling any str method so numeric cells do not crash.
  s = str(s).lower().strip()
  return re.sub(r"\s+", " ", s)

def clean_text_ur(s):
  """Normalise one Urdu sentence.

  The value is converted to a string, lower-cased, stripped of leading and
  trailing whitespace, and every run of whitespace (including newlines) is
  collapsed to a single space. Lower-casing only affects cased characters,
  such as Latin letters embedded in the sentence.

  Args:
    s: The raw cell value. Non-string values are coerced with str();
      None and float NaN are treated as missing.

  Returns:
    The cleaned string, or '' for a missing value.
  """
  # Missing values become '' instead of the literal text 'None'/'nan'.
  if s is None or (isinstance(s, float) and s != s):
    return ''
  # Coerce first: calling .lower() on the raw value fails for non-strings.
  s = str(s).lower().strip()
  return re.sub(r"\s+", " ", s)

# Build a cleaned copy of the corpus; `df` itself is left untouched.
_df = df.assign(
    en=df['en'].map(clean_text_en),
    ur=df['ur'].map(clean_text_ur),
)


# Keep only the pairs where neither side is empty after cleaning
has_text = _df['en'].ne('') & _df['ur'].ne('')
_df = _df[has_text]
print('Pairs after cleaning:', len(_df))


# Write the cleaned pairs to a single master CSV, then preview the first rows
_master = os.path.join(WORKDIR,'parallel_master.csv')
_df.to_csv(_master, index=False)
print('Saved master csv to', _master)
_df.head(10)

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 10% for test, then 0.1111 of the remaining 90% (about 10% of the
# total) for validation. The fixed random_state makes the split repeatable.
train_val, test_df = train_test_split(_df, test_size=0.10, random_state=42)
train_df, val_df = train_test_split(train_val, test_size=0.1111, random_state=42) # ~80/10/10


print('train, val, test sizes:', len(train_df), len(val_df), len(test_df))


# Write every split as two line-aligned plain-text files: <name>.en and <name>.ur.
# The files are written by hand instead of with Series.to_csv because to_csv
# applies CSV quoting: a sentence containing a comma or a double quote would be
# saved wrapped in quotes, which corrupts the plain-text corpus.
for name,dfpart in [('train',train_df),('valid',val_df),('test',test_df)]:
  en_path = os.path.join(WORKDIR,f'{name}.en')
  ur_path = os.path.join(WORKDIR,f'{name}.ur')
  for path, column in [(en_path, 'en'), (ur_path, 'ur')]:
    with open(path, 'w', encoding='utf-8', newline='\n') as fh:
      # Collapsing whitespace guarantees exactly one sentence per line, so the
      # two files stay aligned even if a sentence still contains a newline.
      fh.writelines(' '.join(str(sentence).split()) + '\n' for sentence in dfpart[column])
  print('Saved:', en_path, ur_path)