In [29]:
import re
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Function to detect Hindi words more accurately
def is_hindi_word(word):
    # Define common Hindi characters and patterns
    hindi_vowels = "aāiīuūṛeēoōaiau"
    hindi_consonants = "kghcjñṭḍnpbmśṣsṛḷvyh"  # Basic consonant set
    hindi_characters = hindi_vowels + hindi_consonants

    # Check if the word contains at least 50% Hindi characters
    hindi_char_count = sum(1 for char in word.lower() if char in hindi_characters)
    return hindi_char_count > 0.5 * len(word)

# Function to preprocess Hinglish text
def preprocess_hinglish(hinglish_sentence):
    words = hinglish_sentence.split()
    processed_words = []

    for word in words:
        if is_hindi_word(word):  # If the word is likely Hindi in Latin script
            try:
                # Transliterate to Devanagari script
                devanagari_word = transliterate(word, sanscript.ITRANS, sanscript.DEVANAGARI)
                processed_words.append(devanagari_word)
            except Exception:
                # If transliteration fails, keep the original word
                processed_words.append(word)
        else:
            # Keep English words as they are
            processed_words.append(word)

    # Join back into a sentence
    return " ".join(processed_words)

# Example Hinglish text
hinglish_sentence = "Mujhe coding karna pasand hai, aur machine learning mein interest hai."
processed_sentence = preprocess_hinglish(hinglish_sentence)

print("Original Sentence:", hinglish_sentence)
print("Processed Sentence:", processed_sentence)


Original Sentence: Mujhe coding karna pasand hai, aur machine learning mein interest hai.
Processed Sentence: ंउझे चोदिन्ग् कर्न पसन्द् है, और् मचिने लेअर्निन्ग् मेइन् इन्तेरेस्त् है।


In [30]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Improved function to preprocess Hinglish text
def preprocess_hinglish(hinglish_sentence):
    def clean_word(word):
        # Normalize text: lowercase, strip punctuation, and fix common issues
        word = word.lower()
        word = word.strip(".,!?")  # Remove trailing punctuation
        return word

    words = hinglish_sentence.split()
    processed_words = []

    for word in words:
        cleaned_word = clean_word(word)
        try:
            # Transliterate to Devanagari script
            devanagari_word = transliterate(cleaned_word, sanscript.ITRANS, sanscript.DEVANAGARI)
            processed_words.append(devanagari_word)
        except Exception:
            # If transliteration fails, keep the original word
            processed_words.append(cleaned_word)

    # Join back into a sentence
    return " ".join(processed_words)

# Example Hinglish text
hinglish_sentence = "Mujhe coding karna pasand hai, aur machine learning mein interest hai."
processed_sentence = preprocess_hinglish(hinglish_sentence)

print("Original Sentence:", hinglish_sentence)
print("Processed Sentence:", processed_sentence)


Original Sentence: Mujhe coding karna pasand hai, aur machine learning mein interest hai.
Processed Sentence: मुझे चोदिन्ग् कर्न पसन्द् है और् मचिने लेअर्निन्ग् मेइन् इन्तेरेस्त् है


In [31]:
pip install langdetect indic-transliteration



In [32]:
from langdetect import detect_langs
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Function to detect if the word is Hindi based on detection probability
def is_hindi_word(word):
    try:
        # Detect the probabilities for multiple languages
        lang_prob = detect_langs(word)

        # If the probability for Hindi is higher than 0.5 (or any threshold you prefer)
        if lang_prob.get('hi', 0) > 0.5:
            return True
        return False
    except Exception as e:
        return False

# Function to preprocess Hinglish text
def preprocess_hinglish(hinglish_sentence):
    words = hinglish_sentence.split()  # Tokenize sentence into words
    processed_words = []

    for word in words:
        # Clean word of any trailing punctuation
        cleaned_word = ''.join([char for char in word if char.isalnum() or char.isspace()])

        # Check if the word is Hindi or English
        if is_hindi_word(cleaned_word):  # If the word is detected as Hindi
            # Transliterate Hindi words to Devanagari script
            transliterated_word = transliterate(cleaned_word, sanscript.ITRANS, sanscript.DEVANAGARI)
            processed_words.append(transliterated_word)
        else:
            # Leave English words as they are
            processed_words.append(word)

    # Join the processed words back into a sentence
    return " ".join(processed_words)

# Example Hinglish text
hinglish_sentence = "Mujhe coding karna pasand hai, aur machine learning mein interest hai."
processed_sentence = preprocess_hinglish(hinglish_sentence)

print("Original Sentence:", hinglish_sentence)
print("Processed Sentence:", processed_sentence)


Original Sentence: Mujhe coding karna pasand hai, aur machine learning mein interest hai.
Processed Sentence: Mujhe coding karna pasand hai, aur machine learning mein interest hai.


In [33]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Predefined list of common Hindi words for this example
hindi_words_set = set([
    'mujhe', 'karna', 'pasand', 'hai', 'mein', 'aur', 'tum', 'kya', 'ho', 'kaise', 'sab', 'acha', 'jaldi', 'main'
])

# Function to preprocess Hinglish text
def preprocess_hinglish(hinglish_sentence):
    words = hinglish_sentence.split()  # Tokenize sentence into words
    processed_words = []

    for word in words:
        cleaned_word = ''.join([char for char in word if char.isalnum()])  # Clean punctuation
        if cleaned_word.lower() in hindi_words_set:
            # Transliterate to Devanagari
            transliterated_word = transliterate(cleaned_word, sanscript.ITRANS, sanscript.DEVANAGARI)
            processed_words.append(transliterated_word)
        else:
            # Keep English words as they are
            processed_words.append(word)

    # Join the processed words back into a sentence
    return " ".join(processed_words)

# Example Hinglish text
hinglish_sentence = "Mujhe coding karna pasand hai, aur machine learning mein interest hai."
processed_sentence = preprocess_hinglish(hinglish_sentence)

print("Original Sentence:", hinglish_sentence)
print("Processed Sentence:", processed_sentence)


Original Sentence: Mujhe coding karna pasand hai, aur machine learning mein interest hai.
Processed Sentence: ंउझे coding कर्न पसन्द् है और् machine learning मेइन् interest है


In [34]:
import string

# Predefined list of common Hindi words for this example
hindi_words_set = set([
    'mujhe', 'karna', 'pasand', 'hai', 'mein', 'aur', 'tum', 'kya', 'ho', 'kaise', 'sab', 'acha', 'jaldi', 'main'
])

# Function to count and categorize Hindi and English words
def count_and_categorize_words(sentence):
    # Tokenize the sentence into words
    words = sentence.split()
    hindi_count = 0
    english_count = 0
    hindi_words = []
    english_words = []

    # Iterate over each word
    for word in words:
        # Remove punctuation from the word
        cleaned_word = word.strip(string.punctuation).lower()  # Clean punctuation and convert to lowercase

        # Avoid empty words
        if not cleaned_word:
            continue

        # Check if it's a Hindi word or an English word
        if cleaned_word in hindi_words_set:
            hindi_count += 1
            hindi_words.append(word)  # Add original word with punctuation
        else:
            english_count += 1
            english_words.append(word)  # Add original word with punctuation

    return hindi_count, english_count, hindi_words, english_words

# Example Hinglish text
hinglish_sentence = "Mujhe coding karna pasand hai, aur machine learning mein interest hai."
hindi_count, english_count, hindi_words, english_words = count_and_categorize_words(hinglish_sentence)

# Print results
print(f"Original Sentence: {hinglish_sentence}")
print(f"Hindi Word Count: {hindi_count}")
print(f"English Word Count: {english_count}")
print(f"Hindi Words: {hindi_words}")
print(f"English Words: {english_words}")


Original Sentence: Mujhe coding karna pasand hai, aur machine learning mein interest hai.
Hindi Word Count: 7
English Word Count: 4
Hindi Words: ['Mujhe', 'karna', 'pasand', 'hai,', 'aur', 'mein', 'hai.']
English Words: ['coding', 'machine', 'learning', 'interest']
