In [549]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np

import pandas as pd
import nltk
import re

from helpers import *

In [550]:
df = pd.read_excel(path_data + "IOW - 2 - Renamed Others.xlsx")

In [551]:
text_columns = [
    "WORD_association",
    "PERSON_association",
    "BRAND_association",   
    "SELF_association"
]

## 1 Operation: Lower and Strip

In [552]:
# lower and strip 

df[text_columns] = df[text_columns].apply(lambda x: x.str.lower().str.strip())

### 1.1 Remove: Entries with Contractions

These are likely to be sentences, hence not of interest to us

In [553]:
def remove_contractions(series):
    
    expr = "'t |'m |'re |'ve "

    invalid = series.str.contains(expr).fillna(False)

    print(series.name)
    print(series.loc[invalid].value_counts())
    print(" ")

    series.loc[invalid] = np.nan

    return series

df[text_columns] = df[text_columns].apply(remove_contractions)

WORD_association
Series([], Name: WORD_association, dtype: int64)
 
PERSON_association
reliable, dependable, won't let me down                                                                                                                                                                                                                        1
i'm  sorry, this is making my brain blank, possibly because it's not how the study was described and i can't switch my thinking. i can do impersonal word associations but making it personal seems to have exposed some sort of flaw in how my mind works.    1
i don't trust anyone                                                                                                                                                                                                                                           1
Name: PERSON_association, dtype: int64
 
BRAND_association
i don't trust brands                                                               

### 1.2 Remove: Entries with first person singular

Also typical in sentences

In [554]:
def remove_first_person(series):

    invalid = (" " + series).str.contains(" i ").fillna(False)

    print(series.name)
    print(series[invalid].value_counts())
    print(" ")

    series.loc[invalid] = np.nan

    return series

df[text_columns] = df[text_columns].apply(remove_first_person)

WORD_association
Series([], Name: WORD_association, dtype: int64)
 
PERSON_association
i trust mom                                                                                                                                                                                                                                                                                                                       1
i trust my wife. i trust her to raise our house                                                                                                                                                                                                                                                                                   1
i dont trust anyone                                                                                                                                                                                                                                                      

## 2 Operation: Replace alternative seperators with comma

In [555]:
def replace_alternate_with_comma(series):

    no_comma = ~series.str.contains(",").fillna(False)
    series.loc[no_comma] = series.loc[no_comma].str.replace(" ", ",").str.replace(";", ",")

    return series

df[text_columns] = df[text_columns].apply(replace_alternate_with_comma)

## 3 Operation: Replace special characters

In [565]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text) # Replace multiple spaces with one
    text = re.sub(r',+', ',', text) # replace multiple commas with one
    text = re.sub(r'[^a-zA-Z0-9&,:\-]', '', text) # remove special characters, allowing only & and , and : and -

    text = text.replace(",", " ").strip().replace(" ", ",") # Remove commas at the start and end of text

    return text

def clean_word(word):
    word = word.strip() # remove spaces at the start and end of word

    word = " ".join([part.strip() for part in word.split("-")]) # remove spaces between hyphenated words
    word = " ".join([part.strip() for part in word.split(":")]) # remove spaces between colonated words

    word = word.replace(" ", "-") # replace spaces with hyphens (for ngrams)

    return word

def clean(series):
    series = series.apply(clean_text)
    series = series.str.split(",").apply(lambda array: ",".join([clean_word(x) for x in array]))
    
    return series

df[text_columns] = df[text_columns].fillna("").apply(clean)

## 4 Operation: Find and fix typos

In [566]:
#pip install pyspellchecker

In [567]:
from spellchecker import SpellChecker

df_allwords = df[text_columns].fillna("")\
                .apply(lambda x: ",".join(x), axis=1)\
                    .str.split(",").explode()

spell = SpellChecker()

unknown_words = spell.unknown(df_allwords.value_counts().index)

# ngrams are marked as unknown, however the majority of errors are within unigrams, hence we will focus only on these for simplicity

unknown_unigrams = [word for word in unknown_words if len(word.split("-")) == 1] 

In [568]:
# Are there any preferred ones to fix?

df_allwords.value_counts()[unknown_unigrams]

                             134
honestkind                     1
amazonreliable                 1
bootsestablished               1
safemedicinal                  1
                            ... 
nationwidebuildingsociety      1
johnlewistrustworthy           1
judgementfree                  1
bluntandnormal                 1
ontime                         4
Length: 903, dtype: int64

In [569]:
# Note:
# 
# df_allwords.unique() list was fed to gpt-4 to get the correct spelling and copied into typos.json

In [570]:
typo_dict = read_json(path_json + "typos.json")

In [571]:
def correct_typo(series):

    def correct_typo_word(word):
        if word in typo_dict:
            return typo_dict[word]
        else:
            return word

    series = series.fillna("").apply(lambda x: apply_wordlist(x, correct_typo_word)).replace("", np.nan)
    
    return series

df[text_columns] = df[text_columns].apply(correct_typo)

In [None]:
## 5 Operation: Convert nouns to adjectives when possible (to avoid duplication)

In [572]:
# Convert some nouns to adjectives

# TODO

In [520]:
df.to_excel(path_data + "IOW - 3 - Cleaned Text.xlsx", index=False)