# Preprocessing on short text 

This notebook contains code to preprocess short text.
It performs the following steps:
* Expand contractions
* Correct spelling errors - ekphrasis `SpellCorrector`
* Remove punctuation - NLTK 
* Lower case - NLTK
* Remove numbers - NLTK
* Remove stop words - NLTK
* Lemmatisation - NLTK Wordnet

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer

import re

from nltk.corpus import wordnet as wn
import nltk

from ekphrasis.classes.spellcorrect import SpellCorrector
from wtc_functions import load_ftc_data, expanding, clean_text
import wtc_functions as wtc



Reading english - 1grams ...


In [2]:
# One-off environment setup (uncomment the pip line if NLTK is not installed).
# !pip install nltk
# Fetch every NLTK resource this notebook relies on, not only the tokenizer:
#   stopwords                  -> nltk.corpus.stopwords.words("english")
#   wordnet                    -> nltk.stem.WordNetLemmatizer
#   averaged_perceptron_tagger -> nltk.pos_tag
# NOTE(review): newer NLTK releases may expect differently named tagger /
# tokenizer packages - confirm against the installed NLTK version.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\a-lin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# English stop-word list from NLTK and a WordNet lemmatiser; both are read as
# globals by the cleaning functions defined further down.
stop_words = nltk.corpus.stopwords.words("english")             
lemmatiser = nltk.stem.WordNetLemmatizer()

In [4]:
# Inspect the stop words that the cleaning steps filter out of each comment.
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Loading the data 

In [None]:
# Load the data through the project helper imported from wtc_functions.
# "datapath" is presumably a placeholder for the real Excel path - set it before running.
# NOTE(review): `datadf` is rebound to a copy of `commentsdf` in the Preprocessing section.
datafile = "datapath" #excel file 
datadf = load_ftc_data(datafile)
datadf

In [None]:
### Load PCa data
# "datapath" is presumably a placeholder for the real Excel path - set it before running.
datafile = "datapath" #excel file 

# sheet_name=0 selects the first sheet of the workbook.
data = pd.read_excel(datafile, sheet_name=0)

In [None]:
# Reshape the survey export into long format: one row per free-text comment.
# Columns: patient ID, the free-text comment, and the question number (as a string).
# The seven columns that follow the first column of `data` are read as questions 1-7.

comments = []
indexes = []
questions = []

for question_no in range(1, 8):
    column = data.columns[question_no]
    answered = data[column].dropna()  # keep only the rows that have a comment
    patient_ids = data.loc[answered.index, "Supplied Member Number"]
    comments += list(answered)
    indexes += list(patient_ids)
    questions += [str(question_no)] * len(answered)

commentsdf = pd.DataFrame(
    {'Patient ID': indexes, 'Comments': comments, 'Question': questions},
    columns=['Patient ID', 'Comments', 'Question'],
)

commentsdf

# Preprocessing
Each processing step adds its output as a new column, so the effect of every step on the comments can be compared.

In [7]:
# Work on a copy so `commentsdf` keeps only the original columns;
# the processed columns are added to `datadf`.
datadf = commentsdf.copy()

In [None]:
## Expand contractions    
# wtc.expanding comes from the project module wtc_functions; its result is
# stored next to the raw text in a new column.
datadf['expanded_contractions'] = wtc.expanding(datadf['Comments'])
datadf.head()

## Remove punctuation and numbers, lowercase and lemmatise

In [9]:
# ekphrasis spell corrector for the "english" corpus; the cleaning code below
# calls it as sp.correct(word).
sp = SpellCorrector(corpus="english") 

Reading english - 1grams ...


In [None]:
def reg_words(comment):
    """Normalise confidentiality placeholders and the "G.P" abbreviation.

    The redaction markers "address removed" and "name removed" are joined with
    an underscore so that each becomes a single token (this keeps
    part-of-speech tagging stable), and "G.P" is collapsed to "GP".
    Matching is case-insensitive.

    Parameters
    ----------
    comment : str
        Free-text comment.

    Returns
    -------
    str
        The comment with the substitutions applied.
    """
    substitutions = (
        (r"address removed", "address_removed"),
        (r"name removed", "name_removed"),
        # The dot is escaped and the match is anchored on word boundaries so
        # that ordinary words such as "gap" are not rewritten to "GP".
        (r"\bG\.P\b", "GP"),
    )
    out = comment
    for pattern, replacement in substitutions:
        out = re.sub(pattern, replacement, out, flags=re.IGNORECASE)
    return out

# Same objects as created in the earlier setup cell, re-created here;
# presumably so that this cell can be run on its own.
stop_words = nltk.corpus.stopwords.words("english")            
lemmatiser = nltk.stem.WordNetLemmatizer()
# The spell corrector `sp` is built in the cell above, so it is not re-created here:
# sp = SpellCorrector(corpus="english") 


def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to the WordNet adjective, verb, noun
    and adverb constants respectively. Any other tag yields an empty string,
    which callers can compare against directly.
    """
    prefix_to_pos = (
        ("J", wn.ADJ),
        ("V", wn.VERB),
        ("N", wn.NOUN),
        ("R", wn.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return ""  # no WordNet equivalent for this tag
    
def token_text(comment):
    """Lemmatise a list of tokens using their part-of-speech tags.

    input: list of word tokens belonging to one comment
    output: list of lemmas, one per input token, in the same order

    The token "nhs" is passed through unchanged. Tokens whose tag has no
    WordNet equivalent are lemmatised with the lemmatiser's default POS.
    """
    lemmas = []
    for word, treebank_tag in nltk.pos_tag(comment):
        pos = get_wordnet_pos(treebank_tag)
        if word == "nhs":
            lemmas.append("nhs")
        elif pos != "":
            lemmas.append(lemmatiser.lemmatize(word, pos=pos))
        else:
            lemmas.append(lemmatiser.lemmatize(word))
    return lemmas

def clean_text(comment):
    """Clean a single comment and return its lemmatised tokens.

    Steps, in order:
    1. rewrite confidentiality markers such as "address removed" (wtc.reg_words)
    2. lowercase
    3. strip every character that is neither a word character nor whitespace
    4. split on whitespace; drop tokens that are not purely alphabetic
       (numbers, tokens containing digits or underscores) and stop words
    5. correct spelling
    6. lemmatise with WordNet, using POS tags (token_text)

    Parameters
    ----------
    comment : str
        One free-text comment.

    Returns
    -------
    list of str
        The lemmatised tokens of the comment.

    Note: this definition shadows the `clean_text` imported from
    wtc_functions at the top of the notebook.
    """
    row = wtc.reg_words(comment).lower()
    row = re.sub(r'[^\w\s]', '', row)

    words = [word for word in row.split() if word.isalpha() and word not in stop_words]

    # Tokens are already lower-case at this point, so one correct() call per
    # word gives the same result as comparing against correct(word.lower()).
    corrected = [sp.correct(word) for word in words]

    # Return the lemmatised tokens (not the merely spell-corrected ones), using
    # the all-POS lemmatiser rather than the noun/adjective-preserving variant.
    return token_text(corrected)


Reading english - 1grams ...


In [None]:
# Build a "stopwords_removed" column: normalise the placeholders in each raw
# comment, drop stop words (case-insensitively) and spell-correct what remains.
no_stopwords = []

comments1 = list(map(reg_words, datadf['Comments']))

for text in comments1:
    kept_words = []
    for w in text.split(" "):
        if w.lower() in stop_words:
            continue
        # keep the word as typed when the corrector already agrees with it
        kept_words.append(sp.correct(w) if sp.correct(w.lower()) != w else w)
    no_stopwords.append(" ".join(kept_words))

datadf['stopwords_removed'] = no_stopwords


In [None]:
# Preview the first rows, including the new 'stopwords_removed' column.
datadf.head()

## Nouns and adjectives only

In [None]:
def token_text_an(comment):
    """Lemmatise tokens, leaving nouns and adjectives in their original form.

    input: list of word tokens belonging to one comment
    output: list with one entry per input token, in the same order

    The token "nhs" and every token tagged as a noun or adjective are passed
    through unchanged. Other tokens are lemmatised, with their WordNet POS
    when the tag has one and with the lemmatiser's default otherwise.

    NOTE(review): the section heading says "nouns and adjectives only", but no
    token is dropped here - confirm whether other parts of speech should be
    filtered out.
    """
    untouched_pos = ('n', 'a')
    result = []
    for word, treebank_tag in nltk.pos_tag(comment):
        pos = get_wordnet_pos(treebank_tag)
        if word == "nhs" or pos in untouched_pos:
            result.append(word)
        elif pos == "":
            result.append(lemmatiser.lemmatize(word))
        else:
            result.append(lemmatiser.lemmatize(word, pos=pos))
    return result

# corpus_plus = ["nhs", "osteoporosis", "scolliosis", "leukaemia"]

def clean_text_an(comment):
    """Clean a single comment, keeping nouns and adjectives un-lemmatised.

    Steps, in order:
    1. rewrite confidentiality markers such as "address removed" (wtc.reg_words)
    2. lowercase
    3. strip every character that is neither a word character nor whitespace
    4. split on whitespace; drop tokens that are not purely alphabetic
       (numbers, tokens containing digits or underscores) and stop words
    5. correct spelling
    6. pass the tokens through token_text_an

    Parameters
    ----------
    comment : str
        One free-text comment.

    Returns
    -------
    list of str
        The processed tokens of the comment.
    """
    row = wtc.reg_words(comment).lower()
    row = re.sub(r'[^\w\s]', '', row)

    words = [word for word in row.split() if word.isalpha() and word not in stop_words]

    # Tokens are already lower-case at this point, so one correct() call per
    # word gives the same result as comparing against correct(word.lower()).
    corrected = [sp.correct(word) for word in words]

    return token_text_an(corrected)

In [None]:
# Run the noun/adjective-preserving cleaner over every expanded comment.
cleaned = list(map(clean_text_an, datadf['expanded_contractions']))
datadf['nouns_adj'] = cleaned
datadf

In [35]:
# Run the general cleaner over every expanded comment.
cleaned = list(map(clean_text, datadf['expanded_contractions']))
datadf['tokenised'] = cleaned