# Cleaning wkp_sorted files

- load the files into a pandas DataFrame
- remove section headers with a regular expression
- use Stanza and the NLTK stopword list to remove from the text every token that
   - contains a non-alphabetic character OR
   - has fewer than 3 characters OR
   - has a POS tag in ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE'] OR
   - is a stopword
- add the filtered (lemmatized) text to the DataFrame

 #### Load data into pandas dataframe

In [None]:
import pandas as pd
from sklearn.datasets import load_files
# Read every text file found under DATA_DIR with scikit-learn's load_files
# and keep the decoded documents next to their integer target index.
# Bytes that are not valid UTF-8 are substituted instead of raising.
DATA_DIR = "../10_clustering/wkp_sorted/"
data = load_files(DATA_DIR, decode_error="replace", encoding="utf-8")
df = pd.DataFrame({"text": data.data, "label": data.target})
df.head()

 #### Remove headers

In [None]:
import re
# Wiki-style section header: a run of '=', a space, the title (anything but
# '=' or a newline, so multi-word titles are covered), a space, a run of '='.
_HEADER_RE = re.compile(r"=+ [^=\n]+ =+")


def rm_headers(text):
    """Return *text* with wiki-style section headers removed.

    A header is a run of ``=`` signs, a space, a title, a space and another
    run of ``=`` signs, e.g. ``== History ==`` or ``=== See also ===``.
    Only the header itself is deleted; surrounding newlines are kept.

    Args:
        text: Document text, possibly containing section headers.

    Returns:
        The text with every header replaced by the empty string.
    """
    return _HEADER_RE.sub("", text)

# Strip the section headers from every document, then append the result to
# the dataframe as a new "clean_text" column.
no_headers = df["text"].map(rm_headers)

X = no_headers.rename("clean_text")
df = pd.concat(objs=[df, X], axis="columns")
df

 #### Remove tokens that
   - contain a non-alphabetic character OR
   - have fewer than 3 characters OR
   - have a POS tag in ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE'] OR
   - are stopwords

In [None]:
import stanza

# Fetch the English Stanza models
stanza.download("en")

# English pipeline limited to tokenisation, POS tagging and lemmatisation
nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma")

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus used by the token filter
nltk.download("stopwords")

In [None]:
#stopwords.words("english")

In [None]:
def clean_up(text, stop_words=None):
    """Return the lemmas of the tokens of *text* that survive filtering.

    The text is analysed with the module-level Stanza pipeline ``nlp``.
    A word is kept only when its surface form is purely alphabetic, is
    longer than two characters, has a universal POS tag outside the removal
    list and is not a stopword. Each kept word contributes its lemma.

    Args:
        text: Document text to analyse.
        stop_words: Optional iterable of lower-case stopwords. When omitted,
            NLTK's English stopword list is used.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        word passes the filter).
    """
    removal = {'ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE'}
    # Build the stopword set once per call: the previous code re-read the
    # NLTK list and scanned it linearly for every single token.
    if stop_words is None:
        stop_words = stopwords.words("english")
    stop_words = frozenset(stop_words)

    doc = nlp(text)
    text_out = []
    for sentence in doc.sentences:
        for word in sentence.words:
            token = word.text
            if not token.isalpha() or len(token) <= 2:
                continue
            if word.upos in removal:
                continue
            # Stopword lists are lower-case; lower the token so capitalised
            # stopwords (e.g. at the start of a sentence) are caught too.
            if token.lower() in stop_words:
                continue
            text_out.append(word.lemma)
    return " ".join(text_out)

In [None]:
# Lemmatize the header-free text: running on the raw 'text' column would
# discard the header removal and let section titles into the output.
lemmas = [clean_up(doc_text) for doc_text in df['clean_text']]

 #### Add filtered data to pandas dataframe

In [None]:
# Wrap the lemma strings in a named Series and pair it with the raw text.
# Only the "text" column is carried over; the other columns of df are
# not part of the resulting dataframe.
X = pd.Series(data=lemmas, name="out_text")
df = pd.concat(objs=[df["text"], X], axis="columns")
df