# NLP Functions

In [1]:
# The methods and functions defined here are used later to pre-process texts
# and extract useful information.

In [2]:
import matplotlib.pyplot as plt
import demoji
import spacy 
import nltk 
import re
from wordcloud import WordCloud

In [3]:
#Download and install the spaCy language models used below

#Uncomment the following lines to re-execute the download and installation

#!python -m spacy download en_core_web_sm
#!python -m spacy download en_core_web_lg

In [4]:
#!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.0/en_core_web_lg-2.2.0.tar.gz

In [5]:
# Fetch the emoji code table that demoji uses to recognize emoji.
# NOTE(review): the indented echo of this call right below looks like the tail
# of a warning printed by this cell - presumably a deprecation notice from a
# newer demoji release; confirm whether this call is still required.
demoji.download_codes()

  demoji.download_codes()


In [8]:
##############################################################################

############################## FIRST STEP ####################################

##############################################################################


#Function for removing the most common types of emoji from text
def remove_emoji(text):
    """Return ``text`` with every emoji recognized by ``demoji`` removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which each emoji match has been replaced by the
        empty string.
    """
    # Replacing the keys of ``demoji.findall`` one at a time is order
    # dependent: if a short emoji that is also part of a longer
    # multi-codepoint sequence gets replaced first, fragments of the longer
    # sequence are left behind. ``demoji.replace`` substitutes whole matches
    # in a single pass, which avoids that problem.
    return demoji.replace(text, "")

#Removing particular sequences or special characters using regex:

    #Carriage returns and tabs
    #Special characters/symbols like @, ?, leftover emoji, and other metacharacters
    #Dates/times or numbers, money, percentages.
    #Bulleted lists with particular symbols, e.g.: o elem, - elem
    #Links, emails, particular sequences of characters
    
    #Numbers are not stripped on the fly because they may be part of a noun or a word such as 3D, MP3, MySQL2.4, Python3 or HTML5:
    #removing the number would lose the meaning of the word.

# Compiled once at import time. Alternatives are tried left to right at each
# position, so their order is significant and mirrors the original pattern.
_EXPRESSION_PATTERN = re.compile(
    "|".join(
        (
            # Line breaks and tabs, including "o" / "-" bullets at line start.
            r"\no\t",
            r"\n-",
            r"\n",
            r"\t",
            # A literal "[u]" followed by non-whitespace characters.
            r"\[u]\S+",
            # Individual symbol code points (bullets, stars, trademark sign,
            # zero-width joiner, private-use glyphs, ...).
            r"\u2022",
            r"\u2028",
            r"\u2605",
            r"\u25cf",
            r"\uf0b7",
            r"\u00ba",
            r"\u2122",
            r"\u270d",
            r"\u200d",
            r"\uf076",
            r"\uf0a7",
            "🔲",
            # Numeric expressions: "1.", "$30k", "50%", "1/2", "3+",
            # "10:30", "+39".
            r"\d[.]",
            r"[$]\d+\w*",
            r"\d+[%]",
            r"\d+/\d+",
            r"\d[+]",
            r"\d+[:]\d+",
            r"[+]\d+",
            # Stray symbols and a free-standing ampersand.
            r"[@?~]",
            r"\s&\s",
            r"\u25ba",
            # Links and e-mail addresses.
            r"https://\S+",
            r"http://\S+",
            r"\S+@\S+",
            # The dot is escaped so that only a real "www." prefix matches;
            # unescaped it also swallowed text such as "awww yeah".
            r"www\.\S+",
            # Escaped dot (previously "Nov 1" matched as well) and both
            # capitalizations, because callers may lowercase the text first.
            r"[Nn]o\. 1",
            r"<3",
        )
    )
)


def remove_expressions(text, debug=False):
    """Replace unwanted sequences in ``text`` with a single space each.

    Removed sequences include line breaks and tabs, bullet and decorative
    symbols, simple numeric expressions (money, percentages, fractions,
    times), links, e-mail addresses and a few fixed tokens such as ``<3``.
    Plain digits embedded in words (``Python3``, ``HTML5``) are left alone.

    Args:
        text: The string to clean.
        debug: When truthy, print the list of matched substrings first.

    Returns:
        ``text`` with every match replaced by ``" "``. Runs of whitespace are
        not collapsed here.
    """
    if debug:
        print(_EXPRESSION_PATTERN.findall(text))

    return _EXPRESSION_PATTERN.sub(" ", text)

#Used to remove special characters using regex
def clean_raw_text(text, debug=False):
    """Normalize raw text before linguistic processing.

    Steps, in order: lowercase the text, strip emoji with ``remove_emoji``,
    strip unwanted expressions with ``remove_expressions`` and collapse every
    run of whitespace into a single space.

    Args:
        text: The raw string to clean.
        debug: Forwarded to ``remove_expressions`` so that the matched
            expressions are printed when truthy. Previously the flag was
            accepted but silently ignored.

    Returns:
        The cleaned, lowercased string without leading or trailing whitespace.
    """
    lowered = text.lower()
    without_emoji = remove_emoji(lowered)
    without_expressions = remove_expressions(without_emoji, debug)

    # split() with no arguments drops all whitespace runs, including the
    # padding spaces introduced by the substitutions above.
    return " ".join(without_expressions.split())



##############################################################################

############################## SECOND STEP ###################################

##############################################################################



#Using spaCy to analyze the text and remove further unhelpful information derived from the
# structure of the sentences.


#Takes raw text as input and outputs pre-processed text using the spaCy library
# Lazily created spaCy pipeline shared by all process_text() calls. Loading
# "en_core_web_lg" is expensive, so it is done once instead of on every call.
_PROCESS_TEXT_NLP = None

# Part-of-speech tags whose tokens are dropped from the output.
_PROCESS_TEXT_SKIPPED_POS = frozenset({"PUNCT", "NUM", "SYM"})


def process_text(text, debug=False):
    """Lemmatize ``text`` with spaCy and drop uninformative tokens.

    A token is kept only if it is not a stop word, is not tagged as
    punctuation, number or symbol, and is not a "." that was tagged as a
    proper noun.

    Args:
        text: The (already cleaned) string to process.
        debug: Accepted for signature symmetry with the other helpers;
            currently unused.

    Returns:
        The lemmas of the kept tokens, each followed by a single space (so a
        non-empty result ends with a trailing space).
    """
    global _PROCESS_TEXT_NLP
    if _PROCESS_TEXT_NLP is None:
        _PROCESS_TEXT_NLP = spacy.load("en_core_web_lg")

    doc = _PROCESS_TEXT_NLP(text)

    # Tokenization - remove stop words and unwanted POS tags - lemmatization.
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop
        and token.pos_ not in _PROCESS_TEXT_SKIPPED_POS
        # Special case: a lone "." tagged as a proper noun.
        and not (token.text == "." and token.pos_ == "PROPN")
    ]

    # Join once instead of concatenating in the loop; the per-lemma trailing
    # space keeps the output byte-identical to the previous implementation.
    return "".join(f"{lemma} " for lemma in kept_lemmas)

In [9]:
#Function to call from outside      
def clean_text(text, debug=False):
    """Run the full cleaning pipeline on ``text``.

    The text is first normalized by ``clean_raw_text`` and then lemmatized
    and filtered by ``process_text``.

    Args:
        text: The raw string to clean.
        debug: When truthy, print the text before and after every stage. The
            flag is also passed on to both stages. Truthiness is used (rather
            than ``== True``) to match how ``remove_expressions`` tests it.

    Returns:
        The fully pre-processed string.
    """
    def _trace(title, value):
        # Emit one labelled snapshot of the text when debugging is enabled.
        if debug:
            print(title)
            print(value)

    _trace("Original text:", text)

    text = clean_raw_text(text, debug)
    _trace("\nCleaned text:", text)

    text = process_text(text, debug)
    _trace("\nFinal text after Spacy:", text)

    return text

In [10]:
#Function to plot a word cloud
def plotWordCloud(text):
    """Print the 20 most frequent tokens of ``text`` and show a word cloud.

    The text is tokenized with the ``en_core_web_lg`` spaCy pipeline. Token
    frequencies are computed with ``nltk.FreqDist``, and the cloud is built
    from the space-joined tokens with collocations disabled.

    Args:
        text: The string to visualize.

    Returns:
        None. The function prints to stdout and displays a matplotlib figure.
    """
    pipeline = spacy.load("en_core_web_lg")
    words = [tok.text for tok in pipeline(text)]

    # Frequency table of the raw token texts.
    top_twenty = nltk.FreqDist(words).most_common(20)
    print("\n")
    print(top_twenty)

    cloud_image = WordCloud(collocations=False).generate(" ".join(words))

    plt.figure(figsize=(10, 10))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.show()
