In [3]:
import pandas as pd
import re
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize
from spellchecker import SpellChecker
from Levenshtein import distance as levenshtein_distance

In [2]:
!pip install indic-nlp-library
!pip install pyspellchecker
!pip install python-Levenshtein


Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected package

In [9]:
!wget -O hindi_dictionary.txt https://raw.githubusercontent.com/AI4Bharat/indicnlp_corpus/master/hi/hi_words.txt


--2025-09-07 11:06:58--  https://raw.githubusercontent.com/AI4Bharat/indicnlp_corpus/master/hi/hi_words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2025-09-07 11:06:58 ERROR 404: Not Found.



In [4]:
df = pd.read_excel("/content/Unique Words Data (1).xlsx")  # should contain one column 'word'
words = df['word'].astype(str).tolist()

In [5]:
factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer("hi")

In [6]:
def normalize_word(word):
    return normalizer.normalize(word.strip())

words = [normalize_word(w) for w in words]

In [7]:
spell = SpellChecker(language=None, case_sensitive=True)


In [10]:
with open("/content/hindi_dictionary.txt", "r", encoding="utf-8") as f:
    hindi_dict = set([normalize_word(w.strip()) for w in f.readlines()])


In [11]:
results = []
for w in words:
    if w in hindi_dict:
        results.append((w, "correct spelling"))
    else:
        # Approximate check with Levenshtein distance
        suggestions = [d for d in hindi_dict if levenshtein_distance(w, d) <= 2]
        if suggestions:
            results.append((w, "incorrect spelling"))
        else:
            # Assume rare words / loan words as correct
            results.append((w, "correct spelling"))


In [12]:
out_df = pd.DataFrame(results, columns=["word", "status"])
out_df.to_csv("hindi_spelling_check.csv", index=False)
print("Done! Output saved to hindi_spelling_check.csv")

Done! Output saved to hindi_spelling_check.csv
