<a href="https://colab.research.google.com/github/Arteric-Jeff-Knight/collabs/blob/master/text_cleaner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run the first code block once to load libraries and define functions.  

Ignore any output that isn't an error. Process takes a minute or so, but is finished when you see:

<font color="green">✔ Download and installation successful</font><br>
You can now load the model via spacy.load('en_core_web_sm')


In [None]:
#@title <= Run first one time

# Install needed libraries
!pip install emoji contractions num2words

# First the easy stuff.
from google.colab import files
import ipywidgets as widgets
import io
import re, string, unicodedata  # Import Regex, string and unicodedata.
import numpy as np
import pandas as pd  # Import pandas.
import spacy
from datetime import datetime
import emoji
import contractions  # Import contractions library.
from num2words import num2words

# We are only using spacy to lemmatize the content and to get stop words
# Download the small English model, then load it with the parser and NER
# pipeline components disabled (see the `disable` list below).
spacy.cli.download('en_core_web_sm')
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Regular expressions compiled once at module level; normalize() reuses them.
# One or more digits immediately followed by st/nd/rd/th, e.g. "2nd".
compiled_ordinals = re.compile(r"\d+(?:st|nd|rd|th)")
# A single letter followed by a period and not preceded by a word character,
# e.g. each "X." in "U.S.A."; the letter is captured so the period can be dropped.
compiled_acronyms = re.compile(r"(?<!\w)([A-Za-z])\.")
# A dash directly between two digits (both digits are captured), e.g. "3-5".
compiled_dashes_to_to = re.compile('([0-9])-([0-9])')
# A space, a dash, then a digit (the digit is captured), e.g. " -5".
compiled_dashes_to_minus = re.compile(' -([0-9])')
# A hyphen directly between two letters (both letters are captured).
compiled_hyphens_to_spaces = re.compile('([A-Za-z])-([A-Za-z])')
# A slash directly between two digits (both digits are captured).
# NOTE(review): not referenced anywhere in the visible code.
compiled_slashes_to_divided_by = re.compile('([0-9])/([0-9])')
# Three slash-separated digit groups, e.g. "1/2/2020".
compiled_m_d_y = re.compile(r'\d+/\d+/\d+')
# Two or more consecutive whitespace characters.
compiled_just_one_space = re.compile(r"\s\s+")


# Define the normalization function
def normalize(content: str, configs: dict) -> str:
    """Return a cleaned, normalized version of *content*.

    The steps run in a fixed order so that predictable cases (emojis,
    contractions, dates, symbols, ordinals) are resolved before punctuation is
    stripped. Every step except symbol expansion and punctuation removal is a
    flag that defaults to enabled when its key is missing from *configs*.

    Flag values may be real booleans or the strings read from a CSV file:
    ``'False'``, ``'0'``, ``'no'``, ``'off'`` and the empty string all
    disable a step (case-insensitive).

    Args:
        content: The text to clean. ``None``/NaN (an empty CSV cell) is
            treated as an empty string; other non-strings are passed through
            ``str()``.
        configs: Settings keyed by ``normalize_*`` names. The keys
            ``normalize_expand_symbols`` and ``normalize_remove_punctuation``
            are added in place with their defaults when missing.

    Returns:
        The normalized text with runs of whitespace collapsed to one space and
        leading/trailing whitespace removed.
    """
    # Set default configs if they are missing (done in place, as before, so the
    # caller's dict shows the effective values afterwards).
    configs.setdefault('normalize_expand_symbols', _DEFAULT_EXPAND_SYMBOLS)
    configs.setdefault('normalize_remove_punctuation', _DEFAULT_REMOVE_PUNCTUATION)

    # Empty CSV cells arrive as NaN (a float); NaN is the only value != itself.
    if not isinstance(content, str):
        content = '' if content is None or content != content else str(content)

    # Convert emojis
    if _config_enabled(configs, 'normalize_convert_emojis'):
        content = emoji.demojize(content)
    # Remove non-ASCII
    if _config_enabled(configs, 'normalize_to_ascii'):
        content = unicodedata.normalize('NFKD', content).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Expand contractions
    if _config_enabled(configs, 'normalize_expand_contractions'):
        content = contractions.fix(content)
        content = content.replace(" w/ ", " with ")
        # @todo more expansions like afk?
    # Remove URLs
    if _config_enabled(configs, 'normalize_remove_urls'):
        content = re.sub(r"http\S+", "", content)
    # Remove leading RT_
    if _config_enabled(configs, 'normalize_remove_retweet'):
        content = re.sub("^[Rr][Tt] ", "", content)
    # Remove leading @somename
    if _config_enabled(configs, 'normalize_remove_tweeter'):
        content = re.sub("^@[^ ]*", "", content)
    # If requested, remove all usernames
    if _config_enabled(configs, 'normalize_remove_usernames'):
        content = re.sub("@[^ ]*", " ", content)
    # If requested, expand dates like m/d/y to strings.
    # It is impossible to tell whether 1/3 is a fraction or a date, so only
    # three-part dates are handled. Substituting per match (rather than
    # str.replace per unique date) keeps "1/2/2020" from rewriting the tail of
    # "11/2/2020".
    if _config_enabled(configs, 'normalize_expand_dates'):
        content = compiled_m_d_y.sub(_expand_date_match, content)
    # Expand some symbols
    symbols = _config_text(configs, 'normalize_expand_symbols', _DEFAULT_EXPAND_SYMBOLS)
    for symbol, word in (('%', ' percent '), ('=', ' equals '), ('@', ' at ')):
        if symbol in symbols:
            content = content.replace(symbol, word)
    # @todo more symbols
    # Replace all the ordinals; per-match substitution keeps "1st" from
    # rewriting the tail of "21st".
    if _config_enabled(configs, 'normalize_replace_ordinals'):
        content = compiled_ordinals.sub(_expand_ordinal_match, content)
    # Convert to lower case
    if _config_enabled(configs, 'normalize_convert_to_lower'):
        content = content.lower()
    # Get rid of remaining punctuation
    punctuation = _config_text(configs, 'normalize_remove_punctuation', _DEFAULT_REMOVE_PUNCTUATION)
    if punctuation:
        # If periods aren't preserved, drop the periods in acronyms
        # (so they don't get changed to spaces)
        if '.' not in punctuation:
            content = compiled_acronyms.sub(r'\1', content)
        # Update all hyphens between words to spaces
        content = compiled_hyphens_to_spaces.sub(r'\1 \2', content)
        # and all the number-number to "number to number"
        content = compiled_dashes_to_to.sub(r'\1 to \2', content)
        # Any remaining space-dash-number is probably a minus sign; keep the
        # leading space so "is -5" does not become "isminus 5".
        content = compiled_dashes_to_minus.sub(r' minus \1', content)
        # Get rid of anything left over
        content = re.sub(punctuation, " ", content)
    # Replace numbers with words (split on single spaces to KISS)
    if _config_enabled(configs, 'normalize_numbers_to_words'):
        content = ' '.join(_small_number_to_words(word) for word in content.split(' '))
    # Reduce multiple spaces
    return compiled_just_one_space.sub(" ", content).strip()


# Defaults applied by normalize() when the corresponding key is missing.
_DEFAULT_EXPAND_SYMBOLS = '%=@'
_DEFAULT_REMOVE_PUNCTUATION = '[^a-zA-Z0-9 ]'
# String spellings (lower-cased) that switch a setting off / on.
_FALSE_STRINGS = frozenset({'', 'false', '0', 'no', 'off'})
_TRUE_STRINGS = frozenset({'true', '1', 'yes', 'on'})


def _config_enabled(configs: dict, key: str) -> bool:
    """Return whether the boolean flag *key* is on (missing or NaN means on)."""
    if key not in configs:
        return True
    value = configs[key]
    if isinstance(value, str):
        return value.strip().lower() not in _FALSE_STRINGS
    if value is not None and value != value:  # NaN: empty CSV cell, treat as unset
        return True
    return bool(value)


def _config_text(configs: dict, key: str, default: str) -> str:
    """Return the text setting *key*, '' when switched off, *default* when merely 'true'."""
    value = configs.get(key, default)
    if isinstance(value, str):
        lowered = value.strip().lower()
        if lowered in _FALSE_STRINGS:
            return ''
        return default if lowered in _TRUE_STRINGS else value
    if value is None or value != value:  # None or NaN (empty CSV cell): do nothing
        return ''
    return default if value else ''


def _expand_date_match(match) -> str:
    """Spell out one m/d/y date match, e.g. '1/2/2020' -> 'January second 2020'."""
    text = match.group(0)
    # Month-first as documented (four-digit year before two-digit); day-first
    # is only a fallback for dates that cannot be month-first (e.g. 25/12/2020).
    for fmt in ('%m/%d/%Y', '%m/%d/%y', '%d/%m/%Y', '%d/%m/%y'):
        try:
            parsed = datetime.strptime(text, fmt)
        except ValueError:
            continue
        day = num2words(parsed.day, to="ordinal")
        return f"{parsed.strftime('%B')} {day} {parsed.year:04d}"
    # Not a valid calendar date: leave the text untouched.
    return text


def _expand_ordinal_match(match) -> str:
    """Spell out one numeric ordinal match, e.g. '2nd' -> 'second ' (trailing space kept)."""
    return num2words(int(match.group(0)[:-2]), to="ordinal") + ' '


def _small_number_to_words(word: str) -> str:
    """Spell out *word* when it is a whole number below one thousand, else return it as is."""
    if word.isdecimal():
        value = int(word)
        if value < 1000:
            return num2words(value)
    return word


def lemmatize(sentence: str):
    """Return *sentence* with each token replaced by its spaCy lemma.

    A token whose lemma starts with '-' keeps its original text instead
    (presumably to skip placeholder lemmas such as '-PRON-' -- verify against
    the spaCy version in use). Tokens are re-joined with single spaces and any
    ' - ' sequence is collapsed back to a bare hyphen.
    """
    lemmas = [
        token.text if token.lemma_[0] == '-' else token.lemma_
        for token in nlp(sentence)
    ]
    rejoined = " ".join(lemmas)
    return rejoined.replace(' - ', '-')


def split_df_into_data_and_configs(uploaded, defaults=None, config_name: str = 'config'):
    """Split an uploaded CSV into its data rows and its configuration.

    Rows whose first column equals *config_name* are configuration: the second
    column is the key and the third the value. Of the remaining rows, the
    first is taken as the header and the rest as data.

    Args:
        uploaded: Mapping of filename to raw CSV bytes; only the first entry
            is read.
        defaults: Starting configuration. It is updated IN PLACE and is the
            same object that is returned, so callers may keep using it.
        config_name: Marker value in the first column that flags a config row.

    Returns:
        A ``(data, configs)`` tuple: the data as a DataFrame with a fresh
        0..n-1 index, and the effective configuration dict.

    Raises:
        ValueError: If *uploaded* is empty or the file has no header row.
    """
    if defaults is None:
        defaults = {}
    if not uploaded:
        raise ValueError('No uploaded file to process')

    # Put uploaded file into dataframe
    filename = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[filename]), header=None)

    is_config = df[0] == config_name
    # A config row needs a key (column 1) and a value (column 2); a narrower
    # file cannot carry any, and indexing those columns would raise KeyError.
    if df.shape[1] >= 3:
        config_rows = df[is_config]
        defaults.update(dict(zip(config_rows[1], config_rows[2])))

    # Everything else that isn't a config, is data
    rows = df[~is_config].reset_index(drop=True)
    if rows.empty:
        raise ValueError(f'{filename} contains no header row')
    # The first remaining row holds the column names; the rest is the data.
    data = rows.iloc[1:].reset_index(drop=True)
    data.columns = rows.iloc[0]

    # Validate the column name configs
    data_column_list = list(data.columns)

    col_in = defaults.get('normalize_col_in')
    if col_in not in data_column_list:
        # Try the capitalized spelling, then fall back to the first column.
        capitalized = col_in.capitalize() if isinstance(col_in, str) else None
        if capitalized in data_column_list:
            defaults['normalize_col_in'] = capitalized
        else:
            defaults['normalize_col_in'] = data_column_list[0]

    defaults.setdefault('normalize_col_out', 'clean_text')
    defaults.setdefault('normalize_col_lem', 'lemmatized')
    defaults.setdefault('normalize_file_out', '-cleaned')
    defaults.setdefault('normalize_drop_dupes', True)

    # Values read from the CSV are strings; coerce the flag to a real bool.
    drop_dupes = defaults['normalize_drop_dupes']
    if isinstance(drop_dupes, str):
        drop_dupes = drop_dupes.strip().lower() not in ('', 'false', '0', 'no', 'off')
    elif not isinstance(drop_dupes, bool):
        # NaN (an empty CSV cell) is the only value != itself: keep the default.
        drop_dupes = True if drop_dupes != drop_dupes else bool(drop_dupes)
    defaults['normalize_drop_dupes'] = drop_dupes

    if 'normalize_output_filename' not in defaults:
        suffix = defaults['normalize_file_out']
        stem, extension = filename[:-4], filename[-4:]
        if extension.lower() == '.csv':
            # Insert the suffix just before the (case-preserved) extension.
            defaults['normalize_output_filename'] = f"{stem}{suffix}{extension}"
        else:
            # No .csv extension: append instead, so the upload is never overwritten.
            defaults['normalize_output_filename'] = f"{filename}{suffix}.csv"

    return data, defaults




# Next, run the following code block to upload a file for processing

If you want to process multiple files, start here for each one, no need to run the first block again and again.

Processing is mostly handled by default values, but if you need to override them, configuration is handled by passing values in the input file. See **How to Configure** below for instructions.

The code will ask you to choose a file to import and then it will download the results when it is finished.

In [None]:
# @title <= Run once for every file to clean

# Baseline settings; config rows in the uploaded file override them.
defaults = {
    'normalize_col_in': 'content',
    'normalize_col_out': 'clean_text',
    'normalize_col_lem': 'lemmatized',
    'normalize_file_out': '-cleaned',
    'normalize_drop_dupes': True,
}

# Ask for a file, then separate its config rows from its data rows.
df, configs = split_df_into_data_and_configs(files.upload(), defaults)

print('---------- configs ----------')
for key, value in configs.items():
    print('   ', key, ':', value)
print('-----------------------------')
display(df)

col_in = configs['normalize_col_in']
col_out = configs['normalize_col_out']
col_lem = configs['normalize_col_lem']

# Remove duplicate rows
num_rows = df.shape[0]
if configs['normalize_drop_dupes']:
    # Renumber afterwards so later positional lookups (0..n-1) stay valid.
    df = df.drop_duplicates(subset=col_in).reset_index(drop=True)
    print(f'Processing {df.shape[0]} rows after deleting {num_rows - df.shape[0]} duplicates:\n')
else:
    print(f'Processing {df.shape[0]} rows:\n')

# Apply the normalization function to the input column, then lemmatize the result
df[col_out] = df[col_in].map(lambda text: normalize(text, configs))
df[col_lem] = df[col_out].map(lemmatize)

print('Finished Processing\n')

print(f"Saving locally to {configs['normalize_output_filename']}\n")
# This code downloads the result to your local machine.
df.to_csv(configs['normalize_output_filename'], index=False)
files.download(configs['normalize_output_filename'])


In [None]:
#@title <= Optional: for debugging, list 5 random results
# Sample without replacement so no row is printed twice, and never ask for
# more rows than exist (an empty frame simply prints nothing).
sample_size = min(5, df.shape[0])
for idx in np.random.choice(df.shape[0], replace=False, size=sample_size):
    # idx is a position (0..n-1), so use iloc: label lookup with loc fails
    # when the index has gaps, e.g. after duplicates were dropped.
    row = df.iloc[idx]
    print('----------')
    print(row[configs['normalize_col_in']])
    print('-')
    print(row[configs['normalize_col_out']])
    print('-')
    print(row[configs['normalize_col_lem']])
    print()


# **About the Normalization Function**

### Here we define the steps to normalize the text:

Normalization is done in a specific order to try and resolve all the predictable cases before most of the punctuation is removed. 

- If `normalize_convert_emojis`, convert all emojis to alias string equivalents
    - e.g. 👍  becomes `:thumbs_up:`
    - Subsequent punctuation removal changes `:thumbs_up:` to "thumbs up"
- If `normalize_to_ascii`, convert Unicode to ASCII and then back again
    - This removes all emojis and accents and other garbage
    - Converts back to unicode because later operations expect it
- if `normalize_expand_contractions`, expand contractions
    - For consistent grammar, expand "it's" to "it is", etc.
    - Future removal of punctuation would change contractions to nonsense
- If `normalize_remove_urls`, Remove URLs
    - They are not words
- If `normalize_remove_retweet`, Remove 'RT ' from the start
    - Many tweets begin with "RT " for retweet
    - Even if string is not a tweet, "RT " is nonsense and can safely be eliminated.
- If `normalize_remove_tweeter`, Remove '@username: ' from start of string
    - Even tweets that aren't retweets begin with "@username: " which is garbage
- If `normalize_remove_usernames`, Remove all usernames beginning with '@'
    - If not removed they are handled like proper nouns, but punctuation removal can cause unintended consequences
- If `normalize_expand_dates`, change m/d/y style dates to strings
    - Month is converted to name
    - Date is converted to ordinal value (e.g. "2" becomes "second")
    - Year remains a number but is padded to four digits
- Replace any symbols found in `normalize_expand_symbols` with their word equivalent.
    - "%" becomes "percent"
    - "=" becomes "equals"
    - "@" becomes "at"
    - Dash and Minus are impossible to differentiate
- If `normalize_replace_ordinals`, change ordinals to strings
  - e.g. "2nd" becomes "second"
- If `normalize_convert_to_lower`, convert all letters to lower case
  - Prevents system from considering "Word" and "word" as two different words
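- If `normalize_remove_punctuation` is `False` do nothing, otherwise the value is a regular expression and everything it matches is replaced with a space (the default is a negated character class, so it lists the characters to **keep**)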
    - Defaults to `[^a-zA-Z0-9 ]`, anything that isn't a letter or a number is replaced with a space. 
    - Keep all numbers and letters (capitalization may be preserved)
    - Changing this configuration requires an understanding of [Regular Expressions](https://docs.python.org/3/howto/regex.html)
- If `normalize_numbers_to_words`, changes all numbers less than one thousand to their English equivalent
  - e.g. "42" becomes forty-two
    - Note this may reintroduce the hyphen!
- Change all runs of spaces to a single space and remove all leading and trailing spaces
  - Always necessary, no need to configure


# **How to Configure**

To add configurations to a file, put the value 'config' in the first column (no matter what the header) and the configuration key name in the second, and the value to be set in the third (anything beyond that can be ignored). During processing these rows will be separated from the data and not included in the returned file. For convenience, they can appear anywhere in the incoming file: before the headers, at the end, anywhere in between or even mixed among the data.

The possible configuration keys and their default values are:

| Key | Default | Notes |
|--------------|:-----------|:------|
| `normalize_col_in` | content | *The name of the column in the incoming file to process; if there is no exact match, the capitalized form (e.g. "Content") is tried next* |
| `normalize_col_out` | clean_text | *The name of column to add the processed content* |
| `normalize_col_lem` | lemmatized | *The name of column to add the lemmatized processed content* |
| `normalize_file_out` | -cleaned | *The text to add to the filename that is returned* |
| `normalize_output_filename` | | *If passed, this value overrides the value calculated by applying `normalize_file_out` to the uploaded filename* |
| `normalize_drop_dupes` | `True` | *If this is set to True, then rows that duplicate content are dropped* | 
| `normalize_convert_emojis` | `True` | *Converts emojis to word strings, if `False`, `normalize_to_ascii` eliminates them entirely* |
| `normalize_to_ascii` | `True` | *Reduces unicode to ASCII or eliminates character where no conversion is possible* |
| `normalize_expand_contractions` | `True` | *Changes all contractions to long form* |
| `normalize_remove_urls` | `True` | *Removes URLs* |
| `normalize_remove_retweet` | `True` | *Eliminates any leading 'RT ' characters* |
| `normalize_remove_tweeter` | `True` | *Removes the first @\<username> string* |
| `normalize_remove_usernames` | `True` | *Removes all @\<username> strings* |
| `normalize_expand_dates` | `True` | *Converts m/d/y to month and day names and numeric year* |
| `normalize_expand_symbols` | %=@ | *`False` or empty to do nothing, otherwise list of symbols to expand to words, e.g. % to 'percent'* |
| `normalize_replace_ordinals` | `True` | *Changes ordinals to words, e.g. 2nd to 'second'* |
| `normalize_convert_to_lower` | `True` | *Converts all text to lower case* |
| `normalize_remove_punctuation` | [^a-zA-Z0-9 ] | *Removes remaining punctuation* |
| `normalize_numbers_to_words` | `True` | *Changes numbers less than one thousand to text, e.g. 27 to 'twenty-seven'* |

- If the value in `normalize_col_in` does not match any header in the uploaded file, the content in the first column will be processed.
- If the value in `normalize_col_out` or `normalize_col_lem` matches an existing column, it will be overwritten.
- The value in `normalize_file_out` will be inserted between the base filename and the '.csv' of the uploaded file to create `normalize_output_filename` unless a value is passed as a configuration.


# **Sample Configuration to Copy and Paste**

No need to add them all to a file, just the ones you want to change:

```
config,normalize_col_in,content
config,normalize_col_out,clean_text
config,normalize_col_lem,lemmatized
config,normalize_file_out,-cleaned
config,normalize_output_filename,custom_filename.csv
config,normalize_drop_dupes,True
config,normalize_convert_emojis,True
config,normalize_to_ascii,True
config,normalize_expand_contractions,True
config,normalize_remove_urls,True
config,normalize_remove_retweet,True
config,normalize_remove_tweeter,True
config,normalize_remove_usernames,True
config,normalize_expand_dates,True
config,normalize_expand_symbols,"%=@"
config,normalize_replace_ordinals,True
config,normalize_convert_to_lower,True
config,normalize_remove_punctuation,"[^a-zA-Z0-9 ]"
config,normalize_numbers_to_words,True
```
