# Text Preprocessing

## Import Libraries

In [1]:
# for auto-reloading external modules
%load_ext autoreload
%autoreload 2

In [2]:
# Importing Libraries
import unidecode  # 1.7
import pandas as pd  # for life!
import re  # 1.5
import string  # Common string operations
import nltk  # 1.14
nltk.download('stopwords')
from nltk.tokenize import word_tokenize  # 1.12
from nltk.corpus import stopwords  # 1.12
from nltk.stem import WordNetLemmatizer # 1.14
from nltk.stem import RSLPStemmer # 1.14
nltk.download('rslp')
from autocorrect import Speller # 1.13
from bs4 import BeautifulSoup  # 1.4
# from nltk import word_tokenize
# import timeit
# import time
#from nltk.stem.api import StemmerI


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/haroldinho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /Users/haroldinho/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [3]:
# Read Dataset
df = pd.read_csv('../thermofeeler/data/encoded_df.csv')
print('Number of Data points: ', df.shape[0])
print('Number of Features: ', df.shape[1])
print('features: ', df.columns.values)

# Show Dataset
df.head()

Number of Data points:  159849
Number of Features:  3
features:  ['Unnamed: 0' 'tweet_text' 'encoded_sentiment']


Unnamed: 0.1,Unnamed: 0,tweet_text,encoded_sentiment
0,0,@dilsonramoslima #Fato Acho que o Roger é um b...,0
1,1,#NOVIDADE! @LATAM_BRA acaba de anunciar novo v...,0
2,2,Quando tem #novidade😆 tem @novafm103 na área! ...,0
3,3,@RiodeNojeira #Novidade Taí o sucesso dos filh...,0
4,4,"[Livro/Novidades] Segredos, uma história de Lu...",0


In [4]:
# Drop unnamed column
df = df.drop(columns='Unnamed: 0')
df.head(3)

Unnamed: 0,tweet_text,encoded_sentiment
0,@dilsonramoslima #Fato Acho que o Roger é um b...,0
1,#NOVIDADE! @LATAM_BRA acaba de anunciar novo v...,0
2,Quando tem #novidade😆 tem @novafm103 na área! ...,0


In [5]:
# This command tells information about the attributes of Dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159849 entries, 0 to 159848
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   tweet_text         159849 non-null  object
 1   encoded_sentiment  159849 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [6]:
# Shows statistics for every numerical column in our dataset.
df.describe()

Unnamed: 0,encoded_sentiment
count,159849.0
mean,0.0
std,0.816499
min,-1.0
25%,-1.0
50%,0.0
75%,1.0
max,1.0


## Check type of Dataframe attribute that has to processed


In [7]:
# Type of attribute "tweet_text"
type(df['tweet_text'][0])

str

In [8]:
# Type of attribute "encoded_sentiment"
type(df['encoded_sentiment'][0])

numpy.int64

In [9]:
tweet = df['tweet_text'][85442]
tweet

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/M2436o8y6E'

## Remove newlines & tabs

In [10]:
# a = 'This is her \\ first day at this place.\n Please,\t Be nice to her.\\n'

In [11]:
def remove_newlines_tabs(tweet):
    """  
    This function will remove all the occurrences of newlines, tabs, and combinations like: \\n, \\.
    
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" after removal of newlines, tabs, \\n, \\ characters.
        
    Example:
    Input : This is her \\ first day at this place.\n Please,\t Be nice to her.\\n
    Output : This is her first day at this place. Please, Be nice to her. 
    
    """
    # Replacing all the occurrences of \n,\\n,\t,\\ with a space.
    Formatted_tweet = tweet.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')
    return Formatted_tweet

    

In [12]:
print(remove_newlines_tabs(tweet))
print(f'this tweet has {len(tweet)} words')

q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/M2436o8y6E
this tweet has 279 words


## Strip Html Tags 
'< >'   for scraping

In [13]:
# b = 'This is a nice place to live. <IMG>'

In [14]:
def strip_html_tags(tweet):
    """ 
    This function will remove all the occurrences of html tags from the text.
    
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" after removal of html tags.
        
    Example:
    Input : This is a nice place to live. <IMG>
    Output : This is a nice place to live.  
    """
    # Initiating BeautifulSoup object soup.
    soup = BeautifulSoup(tweet, "html.parser")
    # Get all the text other than html tags.
    stripped_tweet = soup.get_text(separator=" ")
    return stripped_tweet

In [15]:
strip_html_tags(tweet)

'q lindo <3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/M2436o8y6E'

## Remove Links

In [16]:
# c = 'To know more about cats and food & website: catster.com  visit: https://catster.com//how-to-feed-cats'

In [17]:
def remove_links(tweet):
    """
    This function will remove all the occurrences of links.
    
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" after removal of all types of links.
        
    Example:
    Input : To know more about cats and food & website: catster.com  visit: https://catster.com//how-to-feed-cats
    Output : To know more about cats and food & website: visit:     
    
    """
    # Removing all the occurrences of links that starts with https
    remove_https = re.sub(r'http\S+', '', tweet)  # r'\1' --> Limits all the repeatation to only one character.
    # return remove_https                         
    
    # Remove all the occurrences of text that ends with .com
    remove_com = re.sub(r"\ [A-Za-z]*\.com", ' ', remove_https)
    return remove_com

In [18]:
remove_links(tweet)

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) '

## Remove WhiteSpaces

In [19]:
# d = 'How   are   you   doing   ?'

In [20]:
def remove_white_spaces(tweet):
    """ This function will remove 
        extra whitespaces from the text
        
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" after extra whitespaces removed .
        
    Example:
    Input : How   are   you   doing   ?
    Output : How are you doing ?     
        
    """
    pattern = re.compile(r'\s+')  # r'\1' --> Limits all the repeatation to only one character.
    Without_whitespace = re.sub(pattern, ' ', tweet)
    # There are some instances where there is no space after '?' & ')', 
    # So I am replacing these with one space so that It will not consider two words as one token.
    tweet = Without_whitespace.replace('?', ' ? ').replace(')', ') ')
    return tweet

In [21]:
remove_white_spaces(tweet)

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :)  https://t.co/M2436o8y6E'

## Remove Accented Characters

In [22]:
# e = 'Málaga, àéêöhello'

In [23]:
# Code for accented characters removal
def accented_characters_removal(tweet):
    """
    The function will remove accented characters from the 
    text contained within the Dataset.
       
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" with removed accented characters.
        
    Example:
    Input : Málaga, àéêöhello
    Output : Malaga, aeeohello    
        
    """
    # Remove accented characters from text using unidecode.
    # Unidecode() - It takes unicode data & tries to represent it to ASCII characters. 
    tweet = unidecode.unidecode(tweet)
    return tweet

In [24]:
accented_characters_removal(tweet)

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituicao que cuida de criancas que sofrem bullying, abuso e violencia. pegaram a historia dele e fizeram um projeto pra ajudar criancas :) https://t.co/M2436o8y6E'

## Case Conversion

In [25]:
# Code for text lowercasing
def lower_casing_text(tweet):
    """
    The function will convert text into lower case.
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
         value: text in lowercase
         
    Example:
    Input : The World is Full of Surprises!
    Output : the world is full of surprises!
    
    """
    # Convert text to lower case
    # lower() - It converts all upperase letter of given string to lowercase.
    tweet = tweet.lower()
    return tweet

In [26]:
lower_casing_text(tweet)

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/m2436o8y6e'

## Reduce repeated characters and punctuations

In [27]:
#f = 'Realllllllllyyyyy,        Greeeeaaaatttt   !!!!?....;;;;:)'

In [28]:
# Code for removing repeated characters and punctuations
def reducing_incorrect_char_repeatation(tweet):
    """
    This Function will reduce repeatition to two characters 
    for alphabets and to one character for punctuations.
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Finally formatted text with alphabets repeating to 
        two characters & punctuations limited to one repeatition 
        
    Example:
    Input : Realllllllllyyyyy,        Greeeeaaaatttt   !!!!?....;;;;:)
    Output : Reallyy, Greeaatt !?.;:)
    
    """
    # Pattern matching for all case alphabets# Pattern matching for all case alphabets
    Pattern_alpha = re.compile(r"([A-Za-z])\1{1,}", re.DOTALL)  
    # Limiting all the  repeatation to two characters.
    Formatted_tweet = Pattern_alpha.sub(r"\1\1", tweet)   # r'\1\1' --> It limits all the repeatation to two characters.
    # Pattern matching for all the punctuations that can occur
    Pattern_Punct = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}')  
    
    # Limiting punctuations in previously formatted string to only one.
    Combined_Formatted = Pattern_Punct.sub(r'\1', Formatted_tweet)
    
    # The below statement is replacing repeatation of spaces that occur more than two times with that of one occurrence.
    Final_Formatted = re.sub(' {2,}',' ', Combined_Formatted)  # {2,} --> It means to match for repeatation that occurs more than two times
    return Final_Formatted


In [29]:
reducing_incorrect_char_repeatation(tweet)

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https:/t.co/M2436o8y6E'

## Expand contraction words

In [30]:
# Laura ya tiene el mapeo de las palabras de contracción en portugues! ESTE ES UN TEST EN INGLES, FUNCIONA!

In [31]:
# g = "could've doesn't hadn't"

In [32]:
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",
}

# The code for expanding contraction words
def expand_contractions(tweet, contraction_mapping =  CONTRACTION_MAP):
    """expand shortened words to the actual form.
       e.g. don't to do not
    
       arguments:
            input_tweet: "text" of type "String".
         
       return:
            value: Text with expanded form of shorthened words.
        
       Example: 
       Input : ain't, aren't, can't, cause, can't've
       Output :  is not, are not, cannot, because, cannot have 
    
     """
    # Tokenizing text into tokens.
    list_Of_tokens = tweet.split(' ')

    # Checking for whether the given token matches with the Key & replacing word with key's value.
    
    # Check whether Word is in list_Of_tokens or not.
    for Word in list_Of_tokens: 
        # Check whether found word is in dictionary "Contraction Map" or not as a key. 
         if Word in CONTRACTION_MAP: 
                # If Word is present in both dictionary & list_Of_tokens, replace that word with the key value.
                list_Of_tokens = [item.replace(Word, CONTRACTION_MAP[Word]) for item in list_Of_tokens]
                
    # Converting list of tokens to String.
    String_Of_tokens = ' '.join(str(e) for e in list_Of_tokens) 
    return String_Of_tokens     

In [33]:
expand_contractions(tweet, contraction_mapping =  CONTRACTION_MAP)

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/M2436o8y6E'

## Remove special characters  

In [34]:
# h = 'Hello, K-a-j-a-l. Thi*s is $100.05 : the payment that you will recieve! (Is this okay?)'

In [35]:
# HAY DE REPASAR
def removing_special_characters(tweet):
    """Removing all the special characters except the one that is passed within 
       the regex to match, as they have important meaning in the text provided.
   
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Text with removed special characters that don't require.
        
    Example: 
    Input : Hello, K-a-j-a-l. Thi*s is $100.05 : the payment that you will recieve! (Is this okay?) 
    Output :  Hello, Kajal. This is $100.05 : the payment that you will recieve! Is this okay?
    
   """
    # The formatted text after removing not necessary punctuations.
    Formatted_tweet = re.sub(r"[^a-zA-Z$-ê]+", ' ', tweet)   #[^a-zA-Z0-9:$-,%.?!]
    # In the above regex expression,I am providing necessary set of punctuations that are frequent in this particular dataset.
    return Formatted_tweet



In [36]:
removing_special_characters(tweet)

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/M2436o8y6E'

## Remove stopwords

In [37]:
# He creado esta funcion para provar el uso del 'stopwords.words('portuguese')'

In [38]:
# from nltk.corpus import stopwords.
# stops = set(stopwords.words('english'))
# print(stops)

In [39]:
# The code for removing stopwords
stoplist = stopwords.words('portuguese') 
stoplist = set(stoplist)
def removing_stopwords(tweet):
    """This function will remove stopwords which doesn't add much meaning to a sentence 
       & they can be remove safely without comprimising meaning of the sentence.
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Text after omitted all stopwords.
        
    Example: 
    Input : This is Kajal from delhi who came here to study.
    Output : ["'This", 'Kajal', 'delhi', 'came', 'study', '.', "'"] 
    
   """
    # repr() function actually gives the precise information about the string
    tweet = repr(tweet)
    # Text without stopwords
    No_StopWords = [word for word in word_tokenize(tweet) if word.lower() not in stoplist ]
    # Convert list of tokens_without_stopwords to String type.
    words_string = ' '.join(No_StopWords)    
    return words_string

In [40]:
removing_stopwords(tweet)

"' q lindo & lt ; 3 fizeram zine/art 30 artistas desenhando kirishima todo dinheiro arrecadado vai pra instituição cuida crianças sofrem bullying , abuso violência . pegaram historia fizeram projeto pra ajudar crianças : ) https : //t.co/M2436o8y6E '"

In [41]:
repr(tweet)

"'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/M2436o8y6E'"

In [42]:
# lets see the stop word list present in the NLTK library, without adding our custom list
portuguese_stopwords = stopwords.words('portuguese')
portuguese_stopwords

['de',
 'a',
 'o',
 'que',
 'e',
 'é',
 'do',
 'da',
 'em',
 'um',
 'para',
 'com',
 'não',
 'uma',
 'os',
 'no',
 'se',
 'na',
 'por',
 'mais',
 'as',
 'dos',
 'como',
 'mas',
 'ao',
 'ele',
 'das',
 'à',
 'seu',
 'sua',
 'ou',
 'quando',
 'muito',
 'nos',
 'já',
 'eu',
 'também',
 'só',
 'pelo',
 'pela',
 'até',
 'isso',
 'ela',
 'entre',
 'depois',
 'sem',
 'mesmo',
 'aos',
 'seus',
 'quem',
 'nas',
 'me',
 'esse',
 'eles',
 'você',
 'essa',
 'num',
 'nem',
 'suas',
 'meu',
 'às',
 'minha',
 'numa',
 'pelos',
 'elas',
 'qual',
 'nós',
 'lhe',
 'deles',
 'essas',
 'esses',
 'pelas',
 'este',
 'dele',
 'tu',
 'te',
 'vocês',
 'vos',
 'lhes',
 'meus',
 'minhas',
 'teu',
 'tua',
 'teus',
 'tuas',
 'nosso',
 'nossa',
 'nossos',
 'nossas',
 'dela',
 'delas',
 'esta',
 'estes',
 'estas',
 'aquele',
 'aquela',
 'aqueles',
 'aquelas',
 'isto',
 'aquilo',
 'estou',
 'está',
 'estamos',
 'estão',
 'estive',
 'esteve',
 'estivemos',
 'estiveram',
 'estava',
 'estávamos',
 'estavam',
 'estivera'

In [43]:
# Remove a single word
portuguese_stopwords.remove('não')

In [44]:
portuguese_stopwords

['de',
 'a',
 'o',
 'que',
 'e',
 'é',
 'do',
 'da',
 'em',
 'um',
 'para',
 'com',
 'uma',
 'os',
 'no',
 'se',
 'na',
 'por',
 'mais',
 'as',
 'dos',
 'como',
 'mas',
 'ao',
 'ele',
 'das',
 'à',
 'seu',
 'sua',
 'ou',
 'quando',
 'muito',
 'nos',
 'já',
 'eu',
 'também',
 'só',
 'pelo',
 'pela',
 'até',
 'isso',
 'ela',
 'entre',
 'depois',
 'sem',
 'mesmo',
 'aos',
 'seus',
 'quem',
 'nas',
 'me',
 'esse',
 'eles',
 'você',
 'essa',
 'num',
 'nem',
 'suas',
 'meu',
 'às',
 'minha',
 'numa',
 'pelos',
 'elas',
 'qual',
 'nós',
 'lhe',
 'deles',
 'essas',
 'esses',
 'pelas',
 'este',
 'dele',
 'tu',
 'te',
 'vocês',
 'vos',
 'lhes',
 'meus',
 'minhas',
 'teu',
 'tua',
 'teus',
 'tuas',
 'nosso',
 'nossa',
 'nossos',
 'nossas',
 'dela',
 'delas',
 'esta',
 'estes',
 'estas',
 'aquele',
 'aquela',
 'aqueles',
 'aquelas',
 'isto',
 'aquilo',
 'estou',
 'está',
 'estamos',
 'estão',
 'estive',
 'esteve',
 'estivemos',
 'estiveram',
 'estava',
 'estávamos',
 'estavam',
 'estivera',
 'esti

In [45]:
# # It returns a regular Python list
# english_stopwords = stopwords.words('english')

# # Add a single word
# english_stopwords.append('plate')
# Print the list of available languages
# print(stopwords.fileids())


In [46]:
# # Create our custom stopword list to add
# new_stopwords = ["all", "due", "to", "on", "daily"]

# # add custom list to stopword list of nltk
# stopwords = nltk.corpus.stopwords.words('english')
# stopwords.extend(new_stopwords)

# Add a list of words
# english_stopwords.extend(['food', 'meal', 'eat'])

In [47]:
# # Custom StopWords portuguese
# # stopwords: remove articles, prepositions, conjunctions etc
#     stopwords=['a','te','tu','tua','tuas','tém','um','uma','você','vocês','vos','à','às','ao','aos',
#           'aquela','aquelas','aquele','aqueles','aquilo','as','até','com','como','da','das','de',
#           'dela','delas','dele','deles','depois','do','dos','e','ela','elas','ele','eles','em',
#           'entre','essa','essas','esse','esses','esta','eu','foi','fomos','for','fora','foram',
#           'forem','formos','fosse','fossem','fui','fôramos','fôssemos', 'isso','isto','já','lhe',
#           'lhes','me','mesmo','meu','meus','minha','minhas','muito','na','nas','no','nos','nossa',
#           'nossas','nosso','nossos','num','numa','nós','o','os','para','pela','pelas','pelo','pelos',
#           'por','qual','quando','que','quem','se','seja','sejam','sejamos','sem','serei','seremos',
#           'seria','seriam','será','serão','seríamos','seu','seus','somos','sou','sua','suas','são',
#           'só','também']


## Correct mis-spelled words in text

In [48]:
# FUNCIONA MUY BIEN EN INGLES ... COMBINANDO CON EL nltk.stem.WordNetLemmatizer()
# The code for spelling corrections 
def spelling_correction(tweet):
    ''' 
    This function will correct spellings.
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Text after corrected spellings.
        
    Example: 
    Input : This is Oberois from Dlhi who came heree to studdy.
    Output : This is Oberoi from Delhi who came here to study.
      
    
    '''
    # Check for spellings in English language
    spell = Speller(lang='pt')
    Corrected_tweet = spell(tweet)
    return Corrected_tweet


In [49]:
spelling_correction(tweet)

'q lindo &la;3 fizeram um zune/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/M2436o8y6E'

## Lemmatization

In [50]:
# La lematización analiza el texto circundante para 
# determinar la parte del discurso de una palabra dada, no clasifica las frases.
# no funciona tan bien para nuestro caso!! solamente si optamos anadir las buscas en ingles :) 

In [51]:
tweet_en = 'textting reduced ... functionality updated'

In [52]:
# The code for lemmatization
w_tokenizer = nltk.tokenize.WhitespaceTokenizer() # Tokenize a string on whitespace (space, tab, newline)
lemmatizer = nltk.stem.WordNetLemmatizer()  # Returns the input word unchanged if it cannot be found in WordNet.
def lemmatization(tweet_en):
    """This function converts word to their root words 
       without explicitely cut down as done in stemming.
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Text having root words only, no tense form, no plural forms
        
    Example: 
    Input : text reduced 
    Output :  text reduce
    
   """
    # Converting words to their root forms
    lemma = [lemmatizer.lemmatize(w,'v') for w in w_tokenizer.tokenize(tweet_en)]  # 'v' = verb
    return lemma

In [53]:
lemmatization(tweet_en)

['textting', 'reduce', '...', 'functionality', 'update']

## Stemming

In [54]:
tweet = "amor amante amando amar amado amei amore amamos"

In [55]:
# The code for lemmatization
stemmer = RSLPStemmer()  # A stemmer for Portuguese
def stemmezation(tweet):
    """This function essentially chops 
       off letters from the end until the stem is reached
       it helps if the search returns variations of the word
       
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Text having root words only, no tense form, no plural forms
        
    Example: 
    Input : text reduced 
    Output :  text reduce
    
   """
    # Converting words to their root forms
    for token in tweet.split():
        print(stemmer.stem(token))
    return stemmer

In [56]:
stemmezation(tweet)

am
am
am
am
am
ame
amor
am


<nltk.stem.rslp.RSLPStemmer at 0x147ee1280>

## Putting all in single function

In [57]:
# Writing main function to merge all the preprocessing steps.
def text_preprocessing(tweet, accented_chars=True, contractions=True, lemmatization = True,
                        extra_whitespace=True, newlines_tabs=True, repeatition=True, 
                       lowercase=True, punctuations=True, mis_spell=True,
                       remove_html=True, links=True,  special_chars=True,
                       stop_words=True):
    """
    This function will preprocess input text and return
    the clean text.
    """
        
    if newlines_tabs == True: # remove newlines & tabs.
        Data = remove_newlines_tabs(text)
        
    if remove_html == True: #remove html tags
        Data = strip_html_tags(Data)
        
    if links == True: #remove links
        Data = remove_links(Data)
        
    if extra_whitespace == True: #remove extra whitespaces
        Data = remove_whitespace(Data)
        
    if accented_chars == True: #remove accented characters
        Data = accented_characters_removal(Data)
        
    if lowercase == True: #convert all characters to lowercase
        Data = lower_casing_text(Data)
        
    if repeatition == True: #Reduce repeatitions   
        Data = reducing_incorrect_character_repeatation(Data)
        
    if contractions == True: #expand contractions
        Data = expand_contractions(Data)
    
    if punctuations == True: #remove punctuations
        Data = removing_special_characters(Data)
    
    stoplist = stopwords.words('english') 
    stoplist = set(stoplist)
    
    if stop_words == True: #Remove stopwords
        Data = removing_stopwords(Data)
        
    spell = Speller(lang='en')
    
    if mis_spell == True: #Check for mis-spelled words & correct them.
        Data = spelling_correction(Data)
        
    w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
     
    if lemmatization == True: #Converts words to lemma form.
        Data = lemmatization(Data)
    
           
    return Data