In [4]:
import pandas as pd

# Load datasets
prompts_train = pd.read_csv("../data/prompts_train.csv")
summaries_train = pd.read_csv("../data/summaries_train.csv")

# Drop student_id column from summaries_train and summaries_test
summaries_train = summaries_train.drop(columns=['student_id'])
summaries_train = summaries_train[:500]
id_mapping = {id_val: idx for idx, id_val in enumerate(prompts_train['prompt_id'].unique())}

summaries_train['prompt_id'] = summaries_train['prompt_id'].replace(id_mapping)

In [None]:
# !pip install textblob pyspellchecker
# !pip install symspellpy
# !pip install autocorrect

In [5]:
from symspellpy import SymSpell, Verbosity
from autocorrect import Speller
from textblob import TextBlob
from spellchecker import SpellChecker
from tqdm import tqdm

tqdm.pandas()  # This initializes tqdm for pandas

# Initialize SymSpell
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = "path_to_frequency_dictionary.txt"  # Replace this with your frequency dictionary path
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)


# Initialize AutoCorrect
autocorrect_spell = Speller(lang='en')

def evaluate_textblob(text):
    tb = TextBlob(text)
    corrected_text = tb.correct().string
    errors = sum(1 for orig, corr in zip(text.split(), corrected_text.split()) if orig != corr)
    return errors

def evaluate_pyspellchecker(text):
    spell = SpellChecker()
    misspelled = spell.unknown(text.split())
    return len(misspelled)

def evaluate_symspell(text):
    # Get suggestions
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    # We return the difference between the original and suggested as errors
    errors = sum(1 for orig, corr in zip(text.split(), suggestions[0].term.split()) if orig != corr)
    return errors

def evaluate_autocorrect(text):
    corrected_text = autocorrect_spell(text)
    errors = sum(1 for orig, corr in zip(text.split(), corrected_text.split()) if orig != corr)
    return errors



In [6]:
# Apply the functions and monitor progress
summaries_train['textblob_errors'] = summaries_train['text'].progress_apply(evaluate_textblob)
summaries_train['pyspellchecker_errors'] = summaries_train['text'].progress_apply(evaluate_pyspellchecker)
summaries_train['symspell_errors'] = summaries_train['text'].progress_apply(evaluate_symspell)
summaries_train['autocorrect_errors'] = summaries_train['text'].progress_apply(evaluate_autocorrect)


100%|██████████| 500/500 [02:32<00:00,  3.28it/s]
100%|██████████| 500/500 [00:26<00:00, 18.98it/s]
100%|██████████| 500/500 [00:35<00:00, 14.04it/s]
100%|██████████| 500/500 [00:44<00:00, 11.13it/s]


In [7]:
summaries_train.to_csv("../data/summaries_train_with_spellcheck_errors.csv", index=False)

In [8]:
correlations = summaries_train[['textblob_errors', 'pyspellchecker_errors', 'symspell_errors', 'autocorrect_errors', 'wording', 'content']].corr()
print(correlations[['wording', 'content']])

                        wording   content
textblob_errors        0.411795  0.574385
pyspellchecker_errors  0.487511  0.732595
symspell_errors        0.373476  0.585821
autocorrect_errors     0.240791  0.426653
wording                1.000000  0.790635
content                0.790635  1.000000


In [9]:
summaries_train['text_length'] = summaries_train['text'].apply(len)
summaries_train['normalized_textblob_errors'] = summaries_train['textblob_errors'] / summaries_train['text_length']
summaries_train['normalized_pyspellchecker_errors'] = summaries_train['pyspellchecker_errors'] / summaries_train['text_length']
summaries_train['normalized_symspell_errors'] = summaries_train['symspell_errors'] / summaries_train['text_length']
summaries_train['normalized_autocorrect_errors'] = summaries_train['autocorrect_errors'] / summaries_train['text_length']


normalized_correlations_extended = summaries_train[['normalized_textblob_errors', 'normalized_pyspellchecker_errors', 'normalized_symspell_errors', 'normalized_autocorrect_errors', 'wording', 'content']].corr()
print(normalized_correlations_extended[['wording', 'content']])



                                   wording   content
normalized_textblob_errors       -0.028231 -0.075574
normalized_pyspellchecker_errors -0.070670 -0.002601
normalized_symspell_errors        0.181957  0.321365
normalized_autocorrect_errors    -0.107153 -0.112457
wording                           1.000000  0.790635
content                           0.790635  1.000000
