In [17]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read in the download loop.
CHUNK_SIZE = 40960
# Comma-separated `directory:url` entries, one per dataset. Each URL is
# percent-encoded and carries signing parameters (X-Goog-Date=20240610T110022Z,
# X-Goog-Expires=259200), so it is presumably only valid for a limited window
# and has to be regenerated once downloads start failing.
DATA_SOURCE_MAPPING = 'spelling:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1310%2F2365%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240610%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240610T110022Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1de998989d533d1d36835dbdb904b694c6aa6133629cf4061b6d8ea5da0804336c6555835e0bea9ae4e07e3f6e2cd2c55620cab8713c7ef6d46e7d8a5defcdd4bdaa6e54bd214ac5868f40f1f1bab34428bc78e9ea7fbb3b77f80da2fb01ef3e56a1017fd8e92ec1afae3598a23ceb43e569519d1b9372585fceebe909ddfad3172656f58f4c338152c5d3455917cdfebaee4426b7cef709f05861358cdad9a601e43cee8e52b76acbedde4b67a807b54c5579a18972d9f8e5afc30b5847baf8e7c164ee06dbef6f32139c27fbb97dafcabe5f129e0ac1871be9224a8830eb449cc7146a334cfa040aa648fffb461dc896d16143a243d0ebfdd5e4f280beab62,a2data1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F616848%2F1102281%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240610%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240610T110022Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7ccf2c4c1adf950e2c22db1715b8fc7f0ba0d6fb8fae7f4399645fcac0a631053ceff42ab2899fd4c1cf1bd21520347786e02f40e21f0c97d2aaa832edac7eaddf3c9c4a029788e6e8ab0265d6c5dfb3a0ebd8641deaa33e153fb4f9b79bb03c64726afe570411f8b47a90feaecdd4acf35333d78461fa27027b80aa79d7a734d5efcadaa6bc608eefc34ed024917da90c4c0534319d30675fafead85e77f98aba61c5d851d603e63513f9aebab1f8da3737d2b746066dc97a5b6aff9189ca57c4dd03c254a40cf6b41eb6728fa6006a222cae437a858ca50fd8b31303ccda2aa67cab54f4dba9394267daa7178560d39ca28dbae5e0d6e47d185593be85d7c0'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured archive into a temporary file and unpack it into
# KAGGLE_INPUT_PATH/<directory>. A failed download is reported and skipped so the
# remaining data sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is `<directory>:<percent-encoded URL>`; split only on the first
    # colon so the separator can never be confused with anything in the URL part.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be missing; keep the raw value for the
            # message and only draw a progress bar when a usable size is known.
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = min(50, int(50 * dl / total_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name
            # and would otherwise be able to see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is not
                # overwritten for later iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


Downloading spelling, 2575411 bytes compressed
Downloaded and uncompressed: spelling
Downloading a2data1, 231943 bytes compressed
Downloaded and uncompressed: a2data1
Data source import complete.


In [18]:
import re
from collections import Counter

In [19]:
def words(text):
    """Return every run of word characters in `text`, lower-cased, in order of appearance."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

def _word_counts(path):
    """Return a Counter of the tokens that `words` extracts from the file at `path`."""
    # Use a context manager so the file handle is closed as soon as it has been
    # read instead of being left open until garbage collection.
    with open(path) as handle:
        return Counter(words(handle.read()))


# Token counts per source file.
sherlock = _word_counts('/kaggle/input/a2data1/sherlock.txt')
testset1 = _word_counts('/kaggle/input/a2data1/test.txt')
aspell = _word_counts('/kaggle/input/spelling/aspell.txt')
big = _word_counts('/kaggle/input/spelling/big.txt')
birkbeck = _word_counts('/kaggle/input/spelling/birkbeck.txt')
testset2 = _word_counts('/kaggle/input/spelling/spell-testset1.txt')
testset3 = _word_counts('/kaggle/input/spelling/spell-testset2.txt')
wikipedia = _word_counts('/kaggle/input/spelling/wikipedia.txt')

# WORDS is the vocabulary the corrector looks words up in; TEST is the combined
# token count of the three test files.
WORDS = sherlock + aspell + big + birkbeck + wikipedia
TEST = testset1 + testset2 + testset3


In [20]:
def P(word, N=sum(WORDS.values())):
    """Return the count of `word` in WORDS divided by N.

    N defaults to the sum of all counts in WORDS. Python evaluates that default
    once, when this `def` statement runs, so this cell has to be re-run after
    WORDS changes for the default to follow.
    """
    occurrences = WORDS[word]
    return occurrences / N


In [21]:
def correction(word):
    """Return the candidate spelling of `word` for which P is highest."""
    # Score each candidate with P and keep the best one; when several candidates
    # share the top score, the first one produced by iteration is returned.
    return max(candidates(word), key=P)


In [22]:
def candidates(word):
    """Return the possible corrections for `word`.

    The first non-empty result wins, tried in this order: the word itself if it
    is known, known words from edits1, known words from edits2, and finally a
    list holding just the unchanged word.
    """
    exact = known([word])
    if exact:
        return exact
    one_step = known(edits1(word))
    if one_step:
        return one_step
    two_steps = known(edits2(word))
    if two_steps:
        return two_steps
    return [word]


In [23]:
def known(words):
    """Return, as a set, the members of `words` that are present in WORDS."""
    return {entry for entry in words if entry in WORDS}


In [24]:
def edits1(word):
    """Return the set of strings produced by applying one edit to `word`.

    An edit deletes one character, swaps two adjacent characters, replaces one
    character with a lowercase ASCII letter, or inserts one such letter at any
    position. Replacing a letter with itself is included, so a non-empty `word`
    made of lowercase letters is part of its own result.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # Every way of cutting the word into (head, tail), empty ends included.
    cuts = [(word[:pos], word[pos:]) for pos in range(len(word) + 1)]

    results = set()
    # Deletions: drop the first character of the tail.
    results.update(head + tail[1:] for head, tail in cuts if tail)
    # Transpositions: swap the first two characters of the tail.
    results.update(head + tail[1] + tail[0] + tail[2:] for head, tail in cuts if len(tail) > 1)
    # Replacements: substitute the first character of the tail.
    results.update(head + ch + tail[1:] for head, tail in cuts if tail for ch in alphabet)
    # Insertions: put a letter between head and tail.
    results.update(head + ch + tail for head, tail in cuts for ch in alphabet)
    return results


In [25]:
def edits2(word):
    """Return a generator over the strings reached by applying edits1 twice to `word`.

    The result is lazy and may yield the same string more than once.
    """
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

In [26]:
# Spot check on a misspelling that needs two edits to reach 'corrected'
# (an extra 'r', and 'u' in place of 'e').
print(correction('corrrectud'))

corrected


In [27]:
def run_tests():
    """Run test_correction for each (misspelled word, expected correction) pair."""
    cases = (
        ('korrectud', 'corrected'),
    )
    for misspelled, expected in cases:
        test_correction(misspelled, expected)

In [28]:
def test_correction(input_word, expected_correction):
    """Assert that correction(input_word) equals expected_correction, then report success.

    Raises AssertionError with both the expected and the actual value on a mismatch.
    """
    actual = correction(input_word)
    failure_message = f"Expected: {expected_correction}, Got: {actual}"
    assert actual == expected_correction, failure_message
    print(f"Test Passed for '{input_word}': {actual}")


In [29]:
# Run the correction checks; a mismatch surfaces as an AssertionError in this cell.
run_tests()

Test Passed for 'korrectud': corrected


In [34]:
# Correct every distinct token in TEST exactly once. correction() can be slow when
# it has to fall back to the two-edit search, so the results are kept and reused
# for the file below instead of being computed a second time.
corrected_text = [correction(input_word) for input_word in TEST]

with open("corrected_test", "w") as file:
    # Each corrected word is followed by a single space, the last one included.
    file.write("".join(f"{corrected_word} " for corrected_word in corrected_text))


In [None]:
# Interactive demo: correct words typed by the user until they stop.
# The loop ends on an empty line, on end of input, or on an interrupt, so the
# cell can finish and the notebook can run to completion.
print("Submit an empty line to stop.")
while True:
    try:
        # The vocabulary is built from lower-cased text, so normalise the input
        # the same way before looking it up.
        user = input("Enter here any word: ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        break
    if not user:
        break
    print("Correct word: ", correction(user))

Enter here any word: komputer
Correct word:  computer
Enter here any word: preceise
Correct word:  precise
Enter here any word: passonate
Correct word:  passionate
Enter here any word: goood
Correct word:  good
Enter here any word: bettar
Correct word:  better
