# Basic Text Processing

In [1]:
import nltk
# Download the NLTK data packages this notebook relies on.
# Re-running is cheap: packages that are already present are reported as up-to-date.
nltk.download('stopwords')                    # stop-word lists (read via nltk.corpus.stopwords below)
nltk.download('wordnet')                      # WordNet data (presumably what WordNetLemmatizer needs)
nltk.download('averaged_perceptron_tagger')   # tagger model (presumably what pos_tag needs)
nltk.download('punkt')                        # tokenizer model (presumably what word_tokenize needs)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davideposillipo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/davideposillipo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/davideposillipo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/davideposillipo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk import word_tokenize, pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn

import pandas as pd

In [3]:
# Load the annotated dreams table.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
df = pd.read_pickle("../../data/dreams_annotated.pkl")
# Disabled: shuffle all rows.
#df = df.sample(frac = 1)
# Cast every report to str so string methods such as .lower() can be applied to each row.
df["report"] = df["report"].astype(str)

In [None]:
# Use the first report as the running example for the cells below.
dream = df["report"].iloc[0]

In [None]:
# Display the raw text of the example report.
dream

In [None]:
# Lowercase the example report and split it into tokens: lower_tokens
lower_tokens = word_tokenize(dream.lower())

In [None]:
# Inspect the token list produced above.
print(lower_tokens)

In [None]:
# Import the lemmatizer and the stop-word corpus
# (both are already imported in the imports cell near the top of the notebook)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# English stop-word list; txt_preprocessing() further down reads this module-level name
nltk_stopw =stopwords.words('english') 

# Retain only purely alphabetic tokens (str.isalpha): alpha_only
alpha_only = [t for t in lower_tokens if t.isalpha()]
print('alpha_only\n',alpha_only,'\n')

# Remove all stop words: no_stops
no_stops = [t for t in alpha_only if t not in nltk_stopw]
print('no_stops\n',no_stops,'\n')

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# Lemmatize all tokens, without any POS hint, into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops] 
print('lemmatized\n',lemmatized,'\n')


## NLTK lemmatizer

In [None]:
# Lemmatize a handful of sample words without passing any POS hint
lista = ['coming','are','usefulness','better','worst','cars']
wordnet_lemmatizer = WordNetLemmatizer()
lista_lemm = [wordnet_lemmatizer.lemmatize(l) for l in lista]
print(lista_lemm) # notice it doesn't do much

# POS (part-of-speech) tagging is required to get better lemmas
from nltk.corpus import wordnet as wn
# WordNet POS codes
#wn.ADJ --> 'a'
#wn.VERB --> 'v'
#wn.NOUN --> 'n'
#wn.ADV --> 'r'
# without pos
print( wordnet_lemmatizer.lemmatize('worst') )
# with pos (adjective)
print( wordnet_lemmatizer.lemmatize('worst','a') )

# without pos
print( wordnet_lemmatizer.lemmatize('walking') )
# with pos (verb)
print( wordnet_lemmatizer.lemmatize('walking','v') )

# without pos
print( wordnet_lemmatizer.lemmatize('feet') ) # already works because the default pos is 'n'
# with pos (noun)
print( wordnet_lemmatizer.lemmatize('feet','n') )

In [None]:
# NOTE(review): 'seriously' is an adverb but is lemmatized here with the adjective code 'a';
# presumably this demonstrates a mismatched POS hint -- confirm, or use 'r' if an adverb lookup was meant.
print( wordnet_lemmatizer.lemmatize('seriously','a') )

In [None]:
# nltk.pos_tag(): assign a POS tag to every word

# This overwrites the no_stops list built from the example report.
no_stops = ['cars','are','quite','worst'] # comment this line out if you prefer to use the data from above
print( pos_tag(no_stops) ) 

# NOTE:
# the tags returned by pos_tag() do not follow the 'a','v','n','r' notation used by lemmatize(),
# so they have to be converted to the 'a','v','n','r' notation first

# this function matches the pos_tag() pos to the wordnet pos used by lemmatize()
def get_pos(pos):
    '''
    Translate a pos_tag() tag into the one-letter WordNet POS code used by lemmatize().

    The tag is matched on its first letter: 'J' -> 'a' (adjective), 'V' -> 'v' (verb),
    'N' -> 'n' (noun), 'R' -> 'r' (adverb). Any other tag falls back to 'n'.
    '''
    prefix_to_wordnet = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, wordnet_code in prefix_to_wordnet:
        if pos.startswith(prefix):
            return wordnet_code
    # no known prefix matched: fall back to noun
    return 'n'
        
# tag every token, then rewrite each tag in the WordNet notation
no_stops_with_pos = pos_tag(no_stops)
no_stops_with_pos_ok = [(token, get_pos(tag)) for token, tag in no_stops_with_pos]
print(no_stops_with_pos_ok,'\n')

# finally lemmatize, handing each (token, pos) pair to the lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = [wordnet_lemmatizer.lemmatize(*token_and_pos) for token_and_pos in no_stops_with_pos_ok]
print('original\n',no_stops,'\n')
print('lemmatized\n',lemmatized,'\n')

### A custom tokenizer

In [None]:
# Given an already-tokenized document, define a function that lemmatizes it

# Small sample document used to try the function at the end of this cell
no_stops = ['cars','are','quite','worst']
def get_pos(pos):
    '''
    Convert a pos_tag() tag to the WordNet POS letter accepted by lemmatize().

    Tags starting with 'J', 'V', 'N' or 'R' become 'a', 'v', 'n' or 'r' respectively;
    everything else becomes 'n', which is the default of the lemmatize method.
    '''
    candidates = (
        wordnet_letter
        for first_letter, wordnet_letter in (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
        if pos.startswith(first_letter)
    )
    return next(candidates, 'n')
        
def lemmatize(tokenized_doc):
    '''
    Lemmatize a tokenized document, using each token's POS tag as a hint.

    tokenized_doc: sequence of string tokens.
    Returns a list with one lemma per input token, in the same order.
    '''
    # tag first: pos_tag yields (token, tag) pairs
    tagged_tokens = pos_tag(tokenized_doc)

    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, raw_tag in tagged_tokens:
        # the lemmatizer only understands the WordNet letters, so convert the tag
        lemmas.append(wordnet_lemmatizer.lemmatize(token, get_pos(raw_tag)))
    return lemmas

# Try the function on the sample document defined at the top of this cell.
lemmatize(no_stops)
    

In [None]:
# Regex tokenization of the example report: a token is an ASCII letter followed by
# 2 to 14 ASCII letters/digits (3 to 15 characters in total), delimited by word boundaries.
RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b').tokenize(dream)

## Preprocessing pipeline

In [None]:
def get_pos(pos):
    '''
    Map a tag produced by nltk.pos_tag() to the POS code expected by
    nltk.WordNetLemmatizer(): 'a' (adjective), 'v' (verb), 'n' (noun) or 'r' (adverb).
    Only the first letter of the tag is inspected; unknown tags map to 'n'.
    '''
    # Nouns and every unrecognised tag share the same answer, so handle them up front.
    if not pos.startswith(('J', 'V', 'R')):
        return 'n'
    # The tag is known to start with J, V or R here.
    return {'J': 'a', 'V': 'v', 'R': 'r'}[pos[0]]

def txt_preprocessing(X, printa=False, stop_words=None):
    '''
    Clean a collection of raw texts and return them as space-joined lemma strings.

    Steps, applied to every text in X:
      1. lowercase
      2. regex tokenization: a token is an ASCII letter followed by 2-14 ASCII
         letters/digits, i.e. 3 to 15 characters; punctuation, special characters
         and shorter/longer words are dropped
      3. stop-word removal
      4. POS tagging with pos_tag()
      5. conversion of the tags with get_pos()
      6. POS-aware lemmatization with WordNetLemmatizer
      7. re-joining of the lemmas with single spaces

    X: iterable of strings.
    printa: when True, print the first document after every step.
    stop_words: optional iterable of words to drop; when None the module-level
        nltk_stopw list is used.

    Returns a list of strings, one per input text, in the same order.
    An empty X returns an empty list (also when printa is True).

    NOTE(review): tagging happens after the stop words are dropped, so the tagger
    sees a shortened token sequence; the order is kept as-is so outputs do not change.
    '''
    i = 0  # index of the document echoed when printa is True

    def _show(stage, trailing_blank=True):
        # Echo document i of the current stage, formatted like the per-step prints.
        if not printa:
            return
        if trailing_blank:
            print(stage[i], '\n')
        else:
            print(stage[i])

    # lowercase
    docs = [x.lower() for x in X]
    if not docs:
        # Nothing to process; also avoids indexing an empty list when printa is True.
        return docs
    _show(docs)

    # tokenize; the tokenizer is built once and reused for every document
    tokenizer = RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b')
    docs = [tokenizer.tokenize(doc) for doc in docs]
    _show(docs)

    # remove stop words; a set gives constant-time membership tests
    excluded = set(nltk_stopw if stop_words is None else stop_words)
    docs = [[token for token in doc if token not in excluded] for doc in docs]
    _show(docs)

    # POS-tag every document
    docs = [pos_tag(doc) for doc in docs]
    _show(docs)

    # map POS tags to work with nltk.WordNetLemmatizer()
    docs = [[(token, get_pos(tag)) for token, tag in doc] for doc in docs]
    _show(docs)

    # lemmatize; one lemmatizer instance serves all tokens
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token, pos) for token, pos in doc] for doc in docs]
    _show(docs)

    # reshape as a list of sentences: [['this','is','string','1'], ...] --> ['this is string 1', ...]
    docs = [" ".join(doc) for doc in docs]
    _show(docs, trailing_blank=False)

    return docs

In [None]:
# Run the pipeline on the example report, printing the result of every step.
txt_preprocessing([dream], printa=True)

#### Applying the pipeline to the dataset

In [None]:
# It takes a while...
# The pipeline is called once per row, each time on a one-element list.
# NOTE(review): txt_preprocessing() returns a list, so every "report_cleaned" cell holds a
# one-element list (['cleaned text']) rather than a bare string -- confirm this is intended.
df["report_cleaned"] = df["report"].apply(lambda x: txt_preprocessing([x]))

In [None]:
# Save the DataFrame to a new pickle file in the same directory as the input file.
df.to_pickle("../../data/dreams_annotated_cleaned.pkl")