This notebook aims to find the most common spelling errors and annotate them. This step reduces the noise in the original dataset.

In [None]:
import os

# NUMEXPR_MAX_THREADS has to be in the environment *before* numexpr is first
# imported, because numexpr reads it at import time. pandas imports numexpr
# as an optional dependency when it is installed, so the variable is set
# here, ahead of every other import, instead of after them.
os.environ['NUMEXPR_MAX_THREADS'] = '45'

import pandas as pd
import re
import swifter
import numpy as np
from collections import Counter, defaultdict
from spellchecker import SpellChecker
from tqdm import tqdm
import pickle

In [None]:
# Read the pipe-separated tweet dataset and keep a handle on its "text" column,
# which is what the spell-checking pass below iterates over.
df = pd.read_csv(
    'dataset_tweet.csv',
    delimiter="|",
)
texts = df["text"]

# Search for unknown words

In [None]:
# Spell checker configured with language="fr"; the cells below use it to find
# the words of each tweet that it does not know.
spell = SpellChecker(language="fr")

In [None]:
# Every token the spell checker does not know, across all tweets. Duplicates
# are kept on purpose: the next cell counts occurrences to rank the words.
all_unk = []

# Both patterns are loop-invariant, so they are compiled once up front.
# mention_re: an @handle located at the start of the text or preceded by a
# character outside [a-zA-Z0-9-_.]; such handles are removed before checking.
mention_re = re.compile(r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_]+)")
# punct_re: anything that is neither a word character nor whitespace.
punct_re = re.compile(r'[^\w\s]')

for text in tqdm(texts):
    # Missing values would otherwise be stringified to "nan" and reported as
    # an unknown word.
    if pd.isna(text):
        continue
    text = str(text)
    text = text.replace("<url>", "")
    text = text.replace("\xa0", " ")
    text = mention_re.sub('', text)
    text = punct_re.sub(' ', text)
    # split() without arguments splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings. Splitting on " " only left
    # words separated by a newline glued together as a single bogus token.
    tokens = text.split()
    all_unk.extend(word for word in spell.unknown(tokens) if word)

In [None]:
# Number of top-ranked unknown words kept for manual annotation.
N_MOST_COMMON = 3000

# Rank unknown words by number of occurrences, most frequent first, and keep
# only the words (not the counts) of the top N_MOST_COMMON entries.
most_common = Counter(all_unk).most_common()
most_common_words = [word for word, _count in most_common[:N_MOST_COMMON]]

# NOTE: from here on all_unk holds each unknown word only once, so the
# occurrence counts are gone. Re-running this cell without first re-running
# the collection cell would rank a deduplicated list.
all_unk = list(set(all_unk))

# Persist the shortlist; the context manager guarantees the file is flushed
# and closed (the handle was previously left open).
with open("most_common_words", 'wb') as out_file:
    pickle.dump(most_common_words, out_file)

# Spelling correction dictionary

The next cells are a quick tool to annotate the most common unknown words. Leaving the input box empty skips the word (for example, the word coronavirus exists but is unknown to SpellChecker).

In [None]:
from IPython.display import clear_output

# Maps an unknown word to the correction typed for it; filled in by the
# interactive annotation loop that follows.
modifs = dict()

In [None]:
# Interactive annotation pass: show one frequent unknown word at a time, read
# a replacement from the input box, and store it in modifs. An empty answer
# means "skip this word" and records nothing.
for key in most_common_words:
    clear_output()
    print(key)
    correction = input()
    if not correction:
        continue
    modifs[key] = correction

In [None]:
# Persist the annotated corrections. The context manager guarantees the file
# is flushed and closed (the handle was previously left open).
with open("modifs", 'wb') as out_file:
    pickle.dump(modifs, out_file)