### Install Libraries and Imports

In [1]:
# Install the contextualized topic model library
!pip install -U contextualized_topic_models

!pip install pyldavis
!pip install wget
!pip install head
!nvidia-smi

# Setup Hindi for analysis
!pip install indic-nlp-library==0.81
!pip install stopwordsiso
!pip install inltk
!pip install regex
!pip install urduhack

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mSun May 14 22:55:52 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.50                 Driver Version: 531.79       CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1660         On | 00000000:2B:00.0  On |                  N/A |
|  0%   51C    P8             

## Data

**Building PMIndia Corpus**

The code below builds a parallel corpus from the speeches published on the website of the Indian Prime Minister (www.pmindia.gov.in).

For every language, we combine the individual speech documents into a single corpus. The datasets are downloaded from the PMIndia monolingual release hosted on the [Statistical Machine Translation](https://data.statmt.org/pmindia/v1/monolingual/) site (statmt.org).

*Downloading Dataset*

In [2]:
# Imports
import urllib
import wget
import pandas as pd
import os
import pickle

In [3]:
# Build directory to store speech files.
# os.makedirs(..., exist_ok=True) is portable across platforms and does not
# error out when the directory already exists (e.g. when the cell is re-run).
os.makedirs("parallel_speeches", exist_ok=True)

In [4]:
# Download PMIndia Datasets
FILES_DIR = os.getcwd() # REPLACE WITH YOUR DIRECTORY IF YOU PREFER DOWNLOADING IN SPECIFIC DIRECTORY
LINK = "https://drive.google.com/u/0/uc?id=1IqH2XQFw1XHPT2Sh_Yz3LnrMEVqS8oef&export=download"  # LINK TO PMINDIA FILE LOCATION
# The index file is fetched into the current working directory; it is read
# below as rows of "<lang>,<link>".
FILE_PATH = os.getcwd() + "/" + wget.download(LINK) # PATH TO DOWNLOADS

pmindia_links = pd.read_csv(FILE_PATH, sep = ",",  names = ["lang", "link"], engine = "python" )['link']

# wget only treats `out` as a target directory when that directory exists,
# so make sure it does before downloading.
os.makedirs(FILES_DIR, exist_ok=True)

# Download every archive *into FILES_DIR*. Previously the files were always
# saved to the working directory while the stored paths were prefixed with
# FILES_DIR, so changing FILES_DIR produced paths to files that did not exist.
# With `out` set to a directory, wget.download returns "<FILES_DIR>/<filename>".
pmindia_list = [wget.download(link.strip(" "), out = FILES_DIR) for link in pmindia_links]

100% [..........................................................................] 2725404 / 2725404

In [5]:
# Files stored in content directory
# Data will need to be re-downloaded if a session closes
''' Following script will download parallel corpus into new directory named parallel_speeches.
Each folder in parallel_speeches contains pmindia speeches in indic language identified by their ISO code.

For e.g.
parallel_speeches/as/ contains speeches in Assamese.
parallel_speeches/hi contains speeches in Hindi.
'''
import tarfile
import os

FILES_DIR = os.getcwd() # REPLACE WITH DIRECTORY OF FILES
STORE_FILES_HERE = os.getcwd() + "/parallel_speeches" # STORE EXTRACTED FILES HERE

for fname in pmindia_list:
    # The context manager guarantees the archive handle is closed even if
    # extraction fails part-way through.
    # NOTE(security): extractall() trusts the member paths stored in the
    # archive; only run this on archives from a source you trust.
    with tarfile.open(fname, "r:gz") as tar: # unzip file
        tar.extractall(STORE_FILES_HERE)

    # The two characters right before ".tgz" are used as the folder name
    # (the language code).
    ext_at = fname.rfind('.tgz')
    foldername = fname[ext_at - 2: ext_at] # get folder name

    # Every archive unpacks into a folder called "split"; rename it so the
    # next archive does not collide with it.
    os.rename(STORE_FILES_HERE + "/split", STORE_FILES_HERE + "/" + foldername) # rename default split folder to language name

*Get Parallel Speeches*

In [6]:
import glob # SEEK FILES FROM ABOVE FOR DOWNLOAD
# FOLDERS_DIR = os.getcwd() + "/parallel_speeches"  # DIRECTORY FOR SAVING PARALLEL SPEECHES

# Build a dictionary keyed by the last two characters of each language folder
# (the ISO language name) whose value is the sorted list of that folder's .txt files.
SPEECHES_IN_LANGS = {}
for languagefolder in glob.glob(STORE_FILES_HERE + "/*"):
    txt_paths = list(glob.glob(languagefolder + "/*.txt"))
    txt_paths.sort()
    # The first three entries of the sorted listing are skipped.
    # NOTE(review): the reason for dropping exactly three files is not stated in the notebook - confirm.
    SPEECHES_IN_LANGS[languagefolder[-2:]] = txt_paths[3:]

In [7]:
# Get file names to find common test corpus (set of files common between all languages)
#
# STRIP_INDEX is the number of leading characters to drop from a speech path
# so that only the bare file name remains (i.e. everything up to and including
# the separator after the language folder).
# It is derived from the directory part of the path instead of searching for
# the substring 'as': that search matched the first "as" anywhere in the
# absolute path, so an unrelated parent folder containing "as" gave a wrong offset.
# Language folders all live under the same parent and have two-character
# names, so the same offset applies to every language.
STRIP_INDEX = len(os.path.dirname(SPEECHES_IN_LANGS['as'][0])) + 1

# Bare file names per language, in the same order as SPEECHES_IN_LANGS[lang]
filenames = {
  lang: [item[STRIP_INDEX:] for item in speech_paths]
  for lang, speech_paths in SPEECHES_IN_LANGS.items()
}

*Combine Parallel Speeches Into Corpus*

In [8]:
# Setting seed for reproducibility
import random
random.seed(210)

# File names that exist for every language.
# The intersection is sorted before sampling: iteration order of a set of
# strings can change from one Python process to the next (string hashing is
# randomized per process), so sampling from list(set) was not reproducible
# across kernel restarts even with a fixed seed.
common_files = sorted(set.intersection(*map(set, filenames.values())))

# Select 800 random corpus
sample_corpus = sorted(random.sample(common_files, 800))
pd.DataFrame(sample_corpus, columns = ["Speech File Name"]).head()

Unnamed: 0,Speech File Name
0,15th-edition-of-pravasi-bharatiya-diwas-inaugu...
1,ambassador-ms-nikki-haley-united-states-perman...
2,anganwadi-workers-from-across-the-country-call...
3,asha-representatives-from-across-the-country-c...
4,beneficiaries-of-pradhan-mantri-mudra-yojana-s...


In [9]:
# NORMALIZE HINDI TEXTS
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize.indic_tokenize import trivial_tokenize
import urduhack 

def normalize(sent, lang = 'hi'):
  """Normalize a piece of text with the Indic NLP normalizer for `lang`.

  Args:
    sent: the text to normalize.
    lang: language code handed to IndicNormalizerFactory.get_normalizer
      (defaults to 'hi').

  Returns:
    Whatever the selected normalizer's `normalize` returns for `sent`.
  """
  # A new factory and normalizer are created on every call.
  return IndicNormalizerFactory().get_normalizer(lang).normalize(sent)

2023-05-14 22:57:28.162713: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [10]:
# Combine selected 800 sample speeches into one document for each lang
# 04/24 fix: decode issues, speech file now opened as binary rb
#
# For every language this builds two lists of speech strings:
#   parallel_speeches[lang] - speeches whose file name is in sample_corpus (test set)
#   train_speeches[lang]    - all remaining speeches (train set)

parallel_speeches = {}    # STORE TEST SPEECHES
train_speeches = {}       # STORE TRAIN SPEECHES

# Set lookup instead of scanning the sample list once per speech file
sample_names = set(sample_corpus)

for lang, speeches in SPEECHES_IN_LANGS.items():
  each_lang = []         # list of speeches for one language at a time
  just_train = []

  for speech_file in speeches:                          # access list of files in speeches for one language at a time
    with open(speech_file, 'rb') as raw_file:           # read file as bytes
      # each speech file becomes one string; undecodable bytes are dropped
      speech = " ".join([line.decode("utf-8", errors="ignore") for line in raw_file])

    if lang != 'en' and lang != 'ur':                   # normalize each speech per language
      # Keep the normalized text: the return value used to be discarded, so
      # only the Hindi train speeches were ever normalized (by a separate
      # follow-up pass that is no longer needed).
      speech = normalize(speech, lang)

    if speech_file[STRIP_INDEX:] in sample_names:
      each_lang.append(speech)                          # append string version of speech file
    else:
      just_train.append(speech)                         # add to train set

  parallel_speeches[lang] = each_lang                   # add list of speeches for every language
  train_speeches[lang] = just_train                     # add to train set

In [12]:
# Persist both corpora as pickle objects for training.
# Each (file name, dictionary) pair is written with pickle.dump in turn.
for pkl_name, speeches_by_lang in (
    ('train_speeches.pkl', train_speeches),
    ('parallel_speeches.pkl', parallel_speeches),
):
    with open(pkl_name, 'wb') as pkl_file:
        pickle.dump(speeches_by_lang, pkl_file)

In [14]:
import json

# Also save both corpora as indented JSON text files.
# ensure_ascii=False writes the Indic text as real UTF-8 characters; with the
# default (True) every non-ASCII character is emitted as a \uXXXX escape,
# which makes the files unreadable and much larger. The parsed JSON content
# is the same either way.
with open('train_speeches.txt', 'w', encoding='utf-8') as file:
    json.dump(train_speeches, file, ensure_ascii=False, indent=4)

with open('parallel_speeches.txt', 'w', encoding='utf-8') as file:
    json.dump(parallel_speeches, file, ensure_ascii=False, indent=4)

### *Split Parallel Corpus Into Test and Train*

In [15]:
# Imports
import pandas as pd
from pprint import pprint
import re

# Put the Hindi and English train speeches into single-column DataFrames
hindi_unprep = pd.DataFrame(list(train_speeches['hi']))
english_unprep = pd.DataFrame(list(train_speeches['en']))

# Each row is one speech; show the first five of each language
print(hindi_unprep.head(5))
print(english_unprep.head(5))

# Non-null counts per column, as text
hindi_count = str(hindi_unprep.count())
english_count = str(english_unprep.count())

# n = 4003
print("\nHindi count:" + hindi_count + "\n")
# n = 4489
print("English count:" + english_count + "\n")

                                                   0
0  नागरिकों से स्वच्छाग्रही बनने और स्वच्छ भारत ब...
1  श्री सोमनाथ न्यास के न्यासियों की 116वीं बैठक ...
2  प्रधानमंत्री श्री नरेन्द्र मोदी की अध्यक्षता म...
3  प्रधानमंत्री श्री नरेन्द्र मोदी ने आज ही के दि...
4  · खूंटी की जिला अदालत में छत पर लगने वाले सौर ...
                                                   0
0  PM Calls upon citizens to become Swachhagrahis...
1  The first 10 months of Prime Minister Narendra...
2  BRICS in Africa: Collaboration for Inclusive G...
3  The 116th meeting of the trustees of Shri Somn...
4  Deendayal Upadhyaya Gram Jyoti Yojana\n The Un...

Hindi count:0    4003
dtype: int64

English count:0    4489
dtype: int64

