In [None]:
import pandas as pd
import re

def preprocess_hindi_text(text):
    r"""Strip social-media noise from a Hindi (Devanagari) post.

    Removes, in this order: URLs, hashtags, @mentions, digits and
    punctuation/symbols; then collapses whitespace runs to a single space and
    trims both ends.

    Combining marks (Unicode categories Mn/Mc/Me: Devanagari vowel signs,
    virama, anusvara, nukta, ...) are preserved. ``\w`` does not match them,
    so a plain ``[^\w\s]`` substitution would delete them and corrupt every
    word that carries a vowel sign or a conjunct.

    Args:
        text: The raw post. ``None`` and float NaN (pandas' missing value)
            yield ``''``; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        str: The cleaned text.
    """
    import unicodedata  # local import: only needed by this function

    # Missing cells have nothing to clean (NaN is the only float != itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove hashtags and the text following them
    text = re.sub(r'#\S+', '', text)

    # Remove mentions/tags like @username
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    def _keep_if_mark(match):
        # Of the characters that are neither word nor space characters, keep
        # only combining marks; everything else is punctuation/symbols.
        char = match.group()
        return char if unicodedata.category(char).startswith('M') else ''

    # Remove punctuation while keeping the combining marks
    text = re.sub(r'[^\w\s]', _keep_if_mark, text)

    # Additional cleanup: collapse multiple spaces and trim both ends
    return re.sub(r'\s+', ' ', text).strip()

# Load the Hindi training split from its Excel workbook
# (point the path at your own workbook if it lives elsewhere).
dataset = pd.read_excel(io='constraint_Hindi_Train.xlsx')

# Clean every entry of the 'Post' column; the result is stored in a new
# column so the raw text stays available for comparison.
dataset['preprocessed_text'] = dataset['Post'].apply(preprocess_hindi_text)

# Write raw + cleaned columns to a new workbook, leaving out the row index.
dataset.to_excel(excel_writer='preprocessed_dataset.xlsx', index=False)

# Show raw vs. cleaned text side by side for the first few rows.
preview_columns = ['Post', 'preprocessed_text']
print(dataset.loc[:, preview_columns].head())


                                                Post  \
0  मेरे देश के हिन्दु बहुत निराले है। कुछ तो पक्क...   
1  सरकार हमेशा से किसानों की कमाई को बढ़ाने के लि...   
2  सुशांत ने जो बिजनेस डील 9 जून को की थी, वो डील...   
3  @prabhav218 साले जेएनयू छाप कमिने लोग हिन्दुओं...   
4  #unlock4guidelines - अनलॉक-4 के लिए गाइडलाइन्स...   

                                   preprocessed_text  
0  मर दश क हनद बहत नरल ह कछ त पकक रम भकत ह और कछ ...  
1  सरकर हमश स कसन क कमई क बढन क लए नईनई सकम लत रह...  
2  सशत न ज बजनस डल जन क क थ व डल दपश क सशत क हतय ...  
3  सल जएनय छप कमन लग हनदओ क यह कहत ह क सवधन सबक ब...  
4  अनलक क लए गइडलइनस जर सतबर स दशभर म मटर सव शर ह...  


## Installing the Python library that wraps the IndicXlit model

In [None]:
import tensorflow as tf

# Check for GPUs through the physical-device list. Memory growth has to be
# configured before the GPUs are initialised, so the devices are queried
# without first calling tf.test.gpu_device_name().
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    print("GPU is available")
    # Let TensorFlow allocate GPU memory on demand instead of reserving it all
    # up front. The setting is applied to every visible GPU so that it is
    # consistent across devices.
    for gpu in physical_devices:
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as err:
            # Raised when the devices were already initialised
            # (for example when this cell is re-run in a live kernel).
            print(f"Could not enable memory growth: {err}")
else:
    print("GPU is not available")

GPU is available


In [None]:
# installing library
# for thorough documentation: https://pypi.org/project/ai4bharat-transliteration/
!pip install ai4bharat-transliteration

Collecting ai4bharat-transliteration
  Downloading ai4bharat_transliteration-1.1.3-py3-none-any.whl (32 kB)
Collecting pydload (from ai4bharat-transliteration)
  Downloading pydload-1.0.9-py2.py3-none-any.whl (16 kB)
Collecting flask-cors (from ai4bharat-transliteration)
  Downloading Flask_Cors-4.0.0-py2.py3-none-any.whl (14 kB)
Collecting gevent (from ai4bharat-transliteration)
  Downloading gevent-23.9.1-cp310-cp310-manylinux_2_28_x86_64.whl (6.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses (from ai4bharat-transliteration)
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ujson (from ai4bharat-transliteration)
  Downloading ujson-5.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

## Import the module for transliteration engine

In [None]:
# model support the following languages : [as, bn, brx, gom, gu, hi, kn, ks, mai, ml, mni, mr, ne, or, pa, sa, sd, si, ta, te, ur]
# importing ai4bharat transliteration module
from ai4bharat.transliteration import XlitEngine

## Using word Transliteration

- `beam_width` sets the beam-search size; a larger value improves accuracy but increases time/compute. (Default: 4)
- `topk` limits the output to the given number of top results. (Default: 4)
- `rescore` re-ranks the suggestions using a dictionary. (Default: True)

#### En-Indic conversion

In [None]:
# Initialize the English -> Hindi multilingual model; because rescore=True,
# the rescoring dictionaries are loaded as well.
e = XlitEngine("hi", beam_width=10, rescore=True, src_script_type = "en")

# translit_word works on ONE word per call: given the phrase "How are you" it
# produced 'how are यू', i.e. only the final token was converted. A single
# word is used here; multi-word text belongs to translit_sentence.
out = e.translit_word("namaste", topk=1)
print(out)

Initializing Multilingual model for transliteration


Loading dicts into RAM: 100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


{'hi': ['how are यू']}


#### Indic-En conversion

In [None]:
# Initialize the Indic -> English multilingual model. No language list is
# given to the constructor; the source language is passed with each call.
# rescore=False is passed, i.e. dictionary rescoring is switched off here.
e = XlitEngine( beam_width=4, rescore=False, src_script_type = "indic")

# Transliterate a Hindi word ('hi'), keeping only the top candidate
out = e.translit_word("भारत", 'hi', topk=1)
print(out)

# Transliterate a Gujarati word ('gu'), asking for up to five candidates
out = e.translit_word("ગુજરાત", 'gu', topk=5)
print(out)

Initializing Multilingual model for transliteration
['bhaarat']
['gujaraat', 'gujarat', 'goojarat', 'gujraat']


## word Transliteration without rescoring

#### En-Indic conversion

In [None]:
# English -> Hindi engine with dictionary rescoring switched off (rescore=False)
e = XlitEngine("hi", beam_width=4, rescore=False, src_script_type = "en")
# Ask for up to five candidate transliterations of a single English word
out = e.translit_word("one", topk=5)
print(out)

Initializing Multilingual model for transliteration
{'hi': ['ओने', 'ओन', 'ओनी', 'ओणे']}


#### Indic-En conversion

In [None]:
# Initialize the Indic -> English multilingual model with a wider beam
# (beam_width=10) and dictionary rescoring switched off (rescore=False)
e = XlitEngine( beam_width=10, rescore=False, src_script_type = "indic")

# Transliterate a Hindi word ('hi'), asking for up to five candidates
out = e.translit_word("भारत", 'hi', topk=5)
print(out)

Initializing Multilingual model for transliteration
['bhaarat', 'bharat', 'bharath', 'bharata', 'bhaarut']


#### En-Indic conversion

In [None]:
# English -> Telugu ('te') and Marathi ('mr') engine; rescore is left at its default
e = XlitEngine(["te", 'mr'], beam_width=10, src_script_type = "en")
# Sentence-level call: the whole input (numerals included) is passed in one string
out = e.translit_sentence("102 VAnakkam ulagam")
print(out)

Initializing Multilingual model for transliteration


Loading dicts into RAM: 100%|██████████| 2/2 [00:20<00:00, 10.22s/it]


{'mr': '१०२ वणक्कम उलगम', 'te': '౧౦౨ వణక్కం ఉలగం'}


#### Indic-En conversion

In [None]:
# Indic -> English engine; no language list is given to the constructor
e = XlitEngine( beam_width=4, src_script_type = "indic")
# Sentence-level call; the source language ('te', Telugu) is passed with the text
out = e.translit_sentence("వణక్కం ఉలగం", 'te')
print(out)

Initializing Multilingual model for transliteration


Loading dicts into RAM: 100%|██████████| 1/1 [00:00<00:00,  8.66it/s]


vanakkam ulagam


## Transliterating into multiple languages

In [None]:
# Pass a list of languages for multiple-language transliteration
e = XlitEngine(["ta", "ml"], beam_width=6, src_script_type = "en")
# Leave the language argument empty or use "all" to load all available languages
# e = XlitEngine("all")

# No language given: the word is transliterated with every loaded language
out = e.translit_word("amma", topk=3)
print(out)

out = e.translit_sentence("hello world")
print(out)

## Specify the language code to get only that language's result
## (the code is the second positional argument of translit_word)
out = e.translit_word("amma", "ml", topk=5)
print(out)

Initializing Multilingual model for transliteration


Loading dicts into RAM:   0%|          | 0/2 [00:00<?, ?it/s]

## Transliteration for all available languages

In [None]:
# No language list is passed to the constructor, so all available languages are loaded.
# Loading all the language dictionaries would require 8-10 GB of RAM.
e = XlitEngine(beam_width=10, src_script_type = "en")
# Transliterate one English sentence
out = e.translit_sentence("Hello World")
print(out)

Initializing Multilingual model for transliteration


Loading dicts into RAM:  14%|█▍        | 3/21 [00:06<00:36,  2.01s/it]

In [None]:
!pip install tdqm

In [None]:
import pandas as pd
from ai4bharat.transliteration import XlitEngine
from tqdm import tqdm

# Number of rows handled per progress-bar tick (rows are still transliterated
# one at a time; the batch only controls progress granularity)
batch_size = 32  # You can adjust this value as needed

# Language code of the transliteration target (Hindi)
target_lang = "hi"

# Initialize the English -> Hindi transliteration engine
e = XlitEngine(target_lang, beam_width=10, rescore=True, src_script_type="en")

# Input and output locations
input_file_path = 'TWITTER+youtube(ML Project).csv'
output_file_path = 'output_dataset.csv'

# Load the dataset.
# NOTE(review): latin1 decodes any byte sequence without raising, so a UTF-8
# file would be silently garbled here -- confirm the file's real encoding.
df = pd.read_csv(input_file_path, encoding='latin1')


def _transliterate_cell(value):
    """Return the transliteration of one 'Tweet' cell as plain text.

    Missing values (None/NaN) and blank strings yield '' instead of being
    stringified (e.g. to "nan") and sent to the engine. Any other non-string
    value is converted with str() first.
    """
    if pd.isna(value):
        return ''
    text = value if isinstance(value, str) else str(value)
    if not text.strip():
        return ''
    result = e.translit_sentence(text)
    # When no language code is passed the engine may return a
    # {lang_code: text} dict; unwrap it so the column (and the CSV) holds
    # plain text rather than the dict itself.
    if isinstance(result, dict):
        result = result.get(target_lang, '')
    return result


# One progress-bar tick per batch (ceiling division)
total_batches = (len(df) + batch_size - 1) // batch_size

# Results are collected positionally and assigned once at the end, so the
# outcome does not depend on the DataFrame's index labels.
transliterated_rows = []

# The context manager closes the bar even if transliteration raises.
with tqdm(total=total_batches, position=0, leave=True) as progress_bar:
    for batch_start in range(0, len(df), batch_size):
        batch = df['Tweet'].iloc[batch_start:batch_start + batch_size]
        transliterated_rows.extend(_transliterate_cell(tweet) for tweet in batch)

        # Update the progress bar
        progress_bar.update(1)
        progress_bar.set_description(f"Progress: {progress_bar.n}/{total_batches}")

# New column with the transliterated text, aligned row by row with 'Tweet'
df['transliterated_text'] = transliterated_rows

# Save the updated dataset to a new .csv file
df.to_csv(output_file_path, index=False)

Initializing Multilingual model for transliteration


Loading dicts into RAM: 100%|██████████| 1/1 [00:07<00:00,  7.65s/it]
Progress: 178/178: 100%|██████████| 178/178 [2:34:09<00:00, 51.96s/it]
