# Text Preprocessing

## Import Libraries

In [1]:
# for auto-reloading external modules
%load_ext autoreload
%autoreload 2

In [2]:
# Importing Libraries
import warnings  # for life!
warnings.filterwarnings('ignore')
import unidecode  # 1.7
import pandas as pd  # for life!
import re  # 1.5
import string  # Common string operations
import nltk  # 1.14
nltk.download('stopwords')
from nltk.tokenize import word_tokenize  # 1.12 - 1.19
from nltk.corpus import stopwords  # 1.12
from nltk.stem import WordNetLemmatizer # 1.14
from nltk.stem import RSLPStemmer # 1.14
nltk.download('rslp')
from autocorrect import Speller # 1.13
from bs4 import BeautifulSoup  # 1.4
import stanza  # 1.15
stanza.download('pt')  # 1.15
# import time
#from nltk.stem.api import StemmerI


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/haroldinho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /Users/haroldinho/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-03-24 04:07:16 INFO: Downloading default packages for language: pt (Portuguese)...
2022-03-24 04:07:17 INFO: File exists: /Users/haroldinho/stanza_resources/pt/default.zip.
2022-03-24 04:07:18 INFO: Finished downloading models and saved to /Users/haroldinho/stanza_resources.


## Read Dataset

In [3]:
# Reading Dataset
df = pd.read_csv('../thermofeeler/data/encoded_df.csv')
print('Number of Data points: ', df.shape[0])
print('Number of Features: ', df.shape[1])
print('features: ', df.columns.values)

# Show Dataset
df.head()

Number of Data points:  159849
Number of Features:  3
features:  ['Unnamed: 0' 'tweet_text' 'encoded_sentiment']


Unnamed: 0.1,Unnamed: 0,tweet_text,encoded_sentiment
0,0,@dilsonramoslima #Fato Acho que o Roger é um b...,0
1,1,#NOVIDADE! @LATAM_BRA acaba de anunciar novo v...,0
2,2,Quando tem #novidade😆 tem @novafm103 na área! ...,0
3,3,@RiodeNojeira #Novidade Taí o sucesso dos filh...,0
4,4,"[Livro/Novidades] Segredos, uma história de Lu...",0


In [4]:
# Drop unnamed column
df = df.drop(columns='Unnamed: 0')
df.head(3)

Unnamed: 0,tweet_text,encoded_sentiment
0,@dilsonramoslima #Fato Acho que o Roger é um b...,0
1,#NOVIDADE! @LATAM_BRA acaba de anunciar novo v...,0
2,Quando tem #novidade😆 tem @novafm103 na área! ...,0


In [5]:
# This command tells information about the attributes of Dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159849 entries, 0 to 159848
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   tweet_text         159849 non-null  object
 1   encoded_sentiment  159849 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ MB


In [6]:
# Shows statistics for every numerical column in our dataset.
df.describe()

Unnamed: 0,encoded_sentiment
count,159849.0
mean,0.0
std,0.816499
min,-1.0
25%,-1.0
50%,0.0
75%,1.0
max,1.0


## Check type of Dataframe attribute that has to processed


In [7]:
# Type of attribute "tweet_text"
type(df['tweet_text'][0])

str

In [8]:
# Type of attribute "encoded_sentiment"
type(df['encoded_sentiment'][0])

numpy.int64

In [9]:
tweet = df['tweet_text'][85442]
tweet

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/M2436o8y6E'

## Remove newlines & tabs

In [10]:
def remove_newlines_tabs(tweet):
    """  
    This function will remove all the occurrences of newlines, tabs, and combinations like: \\n, \\.
    
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" after removal of newlines, tabs, \\n, \\ characters.
        
    Example:
    Input : This is her \\ first day at this place.\n Please,\t Be nice to her.\\n
    Output : This is her first day at this place. Please, Be nice to her. 
    
    """
    # Replacing all the occurrences of \n,\\n,\t,\\ with a space.
    Formatted_tweet = tweet.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')
    return Formatted_tweet

    

In [11]:
print(remove_newlines_tabs('This is her \\ first day at this place.\n Please,\t Be nice to her.\\n'))


This is her   first day at this place.  Please,  Be nice to her. 


## Strip Html Tags 

In [12]:
# DESACTIVATE - We are using def Remove special characters
# remove_html=False
def strip_html_tags(tweet):  # '< >' for scraping
    """ 
    This function will remove all the occurrences of html tags from the text.
    
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" after removal of html tags.
        
    Example:
    Input : This is a nice place to live. <IMG>
    Output : This is a nice place to live. 
    
    """
    # Initiating BeautifulSoup object soup.
    soup = BeautifulSoup(tweet, "html.parser")
    # Get all the text other than html tags.
    stripped_tweet = soup.get_text(separator=" ")
    return stripped_tweet

In [13]:
strip_html_tags('This is a nice place to live. <IMG>') 

'This is a nice place to live. '

## Remove Links

In [14]:
def remove_links(tweet):
    """
    This function will remove all the occurrences of links.
    
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" after removal of all types of links.
        
    Example:
    Input : To know more about cats and food & website: catster.com  visit: https://catster.com//how-to-feed-cats
    Output : To know more about cats and food & website: visit:     
    
    """
    # Removing all the occurrences of links that starts with https
    remove_https = re.sub(r'http\S+', '', tweet)  # r'\1' --> Limits all the repeatation to only one character.                       
    
    # Remove all the occurrences of text that ends with .com
    remove_com = re.sub(r"\ [A-Za-z]*\.com", ' ', remove_https)
    return remove_com

In [15]:
remove_links('To know more about cats and food & website: catster.com  visit: https://catster.com//how-to-feed-cats')

'To know more about cats and food & website:   visit: '

## Remove WhiteSpaces

In [16]:
def remove_whitespaces(tweet):
    """ 
    This function will remove 
    extra white spaces from the text
        
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" after extra whitespaces removed .
        
    Example:
    Input : How   are   you   doing     ?
    Output : How are you doing ?     
        
    """
    pattern = re.compile(r'\s+')  # r'\1' --> Limits all the repeatation to only one character.
    Without_whitespace = re.sub(pattern, ' ', tweet)
    # There are some instances where there is no space after '?' & ')', 
    # So I am replacing these with one space so that It will not consider two words as one token.
    tweet = Without_whitespace.replace('?', ' ? ').replace(')', ') ')
    return tweet

In [17]:
remove_whitespaces('How   are   you   doing     ?')

'How are you doing  ? '

## Remove numbers

In [18]:
def remove_number(tweet):
    """ 
    This function will remove 
    all numbers from the text
        
    arguments:
        input_tweet: "text" of type "String" with fell numbers. 
                    
    return:
        value: "text" after removal of all numbers.
        
    Example:
    Input : Hello my id. is 76483927 and my phone number is 67384902
    Output : Hello my id. is   and my phone number is  
    
    """
    tweet = re.sub(r'[0-9]+',' ',tweet)
    return tweet

In [19]:
remove_number('Hello my id. is 76483927 and my phone number is 67384902')

'Hello my id. is   and my phone number is  '

## Remove Accented Characters

In [20]:
# Code for accented characters removal
def accented_characters_removal(tweet):
    """
    The function will remove accented characters from the 
    text contained within the Dataset.
       
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" with removed accented characters.
        
    Example:
    Input : Málaga, àéêöhello
    Output : Malaga, aeeohello    
        
    """
    # Remove accented characters from text using unidecode.
    # Unidecode() - It takes unicode data & tries to represent it to ASCII characters. 
    tweet = unidecode.unidecode(tweet)
    return tweet

In [21]:
accented_characters_removal('Málaga, àéêöhello')

'Malaga, aeeohello'

## Case Conversion

In [22]:
# Code for text lowercasing
def lower_casing_text(tweet):
    """
    The function will convert text into lower case.
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
         value: text in lowercase
         
    Example:
    Input : The World Is Full Of Surprises!
    Output : the world is full of surprises!
    
    """
    # Convert text to lower case
    # lower() - It converts all upperase letter of given string to lowercase.
    tweet = tweet.lower()
    return tweet

In [23]:
lower_casing_text('The World Is Full Of Surprises!')

'the world is full of surprises!'

## Reduce repeated characters 

In [24]:
# Code for removing repeated characters and punctuations
def reducing_error_char_repeatation(tweet):
    """
    This Function will reduce repeatition to two characters 
    for alphabets.
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Finally formatted text with alphabets repeating to 
        two characters & punctuations limited to one repeatition 
        
    Example:
    Input : Realllllllllyyyyy,        Greeeeaaaatttt   !!!!?....;;;;:)
    Output : Reallyy, Greeaatt !?.;:)
    
    """
    # Pattern matching for all case alphabets# Pattern matching for all case alphabets
    Pattern_alpha = re.compile(r"([A-Za-z])\1{1,}", re.DOTALL)  
    # Limiting all the  repeatation to two characters.
    Formatted_tweet = Pattern_alpha.sub(r"\1\1", tweet)   # r'\1\1' --> It limits all the repeatation to two characters.
    
    # This Function will reduce repeatition to one character for punctuations.
    # Pattern matching for all the punctuations that can occur
    # Pattern_Punct = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}')  # [.,/#!$%^&*?;:{}=_`~()+-]
    
    # Limiting punctuations in previously formatted string to only one.
    # Combined_Formatted = Pattern_Punct.sub(r'\1', Formatted_tweet)
    
    # The below statement is replacing repeatation of spaces that occur more than two times with that of one occurrence.
    # Final_Formatted = re.sub(' {2,}',' ', Combined_Formatted)  # {2,} --> It means to match for repeatation that 
    #                                                                       occurs more than two times
    return Formatted_tweet


In [25]:
reducing_error_char_repeatation('Realllllllllyyyyy,        Greeeeaaaatttt   !!!!?....;;;;:)')

'Reallyy,        Greeaatt   !!!!?....;;;;:)'

## Remove punctuations

In [26]:
def remove_punctuation(tweet):
    """
    The function will remove all punctuaction from the 
    text contained within the Dataset.
       
    arguments:
        input_tweet: "text" of type "String". 
                    
    return:
        value: "text" with removed accented characters.
        
    Example:
    Input : good morning! buen dia,\.:|;!?`~+
    Output : good morning buen dia   
        
    """
    tweet = re.sub(r'[^\w\s]',' ',tweet)  # remove punctuation
    return tweet

In [27]:
remove_punctuation('good morning!buen dia,\.:|;!?`~+')

'good morning buen dia           '

## Expand contraction words

In [28]:
# English Word Mapping

# CONTRACTION_MAP_EN = {
# "ain't": "is not",
# "aren't": "are not",
# "can't": "cannot",
# "can't've": "cannot have",
# "'cause": "because",
# "could've": "could have",
# "couldn't": "could not",
# "couldn't've": "could not have",
# "didn't": "did not",
# "doesn't": "does not",
# "don't": "do not",
# "hadn't": "had not",
# "hadn't've": "had not have",
# "hasn't": "has not",
# "haven't": "have not",
# "he'd": "he would",
# "he'd've": "he would have",
# "he'll": "he will",
# "he'll've": "he he will have",
# "he's": "he is",
# "how'd": "how did",
# "how'd'y": "how do you",
# "how'll": "how will",
# "how's": "how is",
# "i'd": "i would",
# "i'd've": "i would have",
# "i'll": "i will",
# "i'll've": "i will have",
# "i'm": "i am",
# "i've": "i have",
# "isn't": "is not",
# "it'd": "it would",
# "it'd've": "it would have",
# "it'll": "it will",
# "it'll've": "it will have",
# "it's": "it is",
# "let's": "let us",
# "ma'am": "madam",
# "mayn't": "may not",
# "might've": "might have",
# "mightn't": "might not",
# "mightn't've": "might not have",
# "must've": "must have",
# "mustn't": "must not",
# "mustn't've": "must not have",
# "needn't": "need not",
# "needn't've": "need not have",
# "o'clock": "of the clock",
# "oughtn't": "ought not",
# "oughtn't've": "ought not have",
# "shan't": "shall not",
# "sha'n't": "shall not",
# "shan't've": "shall not have",
# "she'd": "she would",
# "she'd've": "she would have",
# "she'll": "she will",
# "she'll've": "she will have",
# "she's": "she is",
# "should've": "should have",
# "shouldn't": "should not",
# "shouldn't've": "should not have",
# "so've": "so have",
# "so's": "so as",
# "that'd": "that would",
# "that'd've": "that would have",
# "that's": "that is",
# "there'd": "there would",
# "there'd've": "there would have",
# "there's": "there is",
# "they'd": "they would",
# "they'd've": "they would have",
# "they'll": "they will",
# "they'll've": "they will have",
# "they're": "they are",
# "they've": "they have",
# "to've": "to have",
# "wasn't": "was not",
# "we'd": "we would",
# "we'd've": "we would have",
# "we'll": "we will",
# "we'll've": "we will have",
# "we're": "we are",
# "we've": "we have",
# "weren't": "were not",
# "what'll": "what will",
# "what'll've": "what will have",
# "what're": "what are",
# "what's": "what is",
# "what've": "what have",
# "when's": "when is",
# "when've": "when have",
# "where'd": "where did",
# "where's": "where is",
# "where've": "where have",
# "who'll": "who will",
# "who'll've": "who will have",
# "who's": "who is",
# "who've": "who have",
# "why's": "why is",
# "why've": "why have",
# "will've": "will have",
# "won't": "will not",
# "won't've": "will not have",
# "would've": "would have",
# "wouldn't": "would not",
# "wouldn't've": "would not have",
# "y'all": "you all",
# "y'all'd": "you all would",
# "y'all'd've": "you all would have",
# "y'all're": "you all are",
# "y'all've": "you all have",
# "you'd": "you would",
# "you'd've": "you would have",
# "you'll": "you will",
# "you'll've": "you will have",
# "you're": "you are",
# "you've": "you have",
# }

# Portuguese Word Mapping
CONTRACTION_MAP_PT = {'é':'ser','eh':'ser','vc':'voce','vcs':'voces','tb': 'tambem','tbm': 'tambem', 
            'obg': 'obrigado','obrigada':'obrigado','gnt': 'gente', 'q': 'que', 'n': 'nao', 
            'cmg': 'comigo', 'p':'para','pra' :'para','ta': 'está','tá':'está','to': 'estou', 
            'vdd':'verdade','bjos':'beijo','bjo':'beijo','kd': 'cade', 'pq':'porque',
            'cmg':'comigo','cm':'com','pc':'ca','aq':'aqui','qdo':'quando','p':'para','':'que','agr':'agora'}


# The code for expanding contraction words   
def expand_contractions(tweet):  #, contraction_mapping =  CONTRACTION_MAP_PT):
    """
    expand shortened words to the actual form.
    e.g. don't  to  do not
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
         value: Text with expanded form of shorthened words.
        
    Example: 
    Input : Vamos pra praia cmg qdo vc sair do trabalho, bjs e obg
    Output : Vamos para praia comigo quando voce sair do trabalho, bjs e obrigado 
  
    """
# Tokenizing text into tokens.
    list_Of_tokens = tweet.split(' ')
    for word in list_Of_tokens:
        if word in CONTRACTION_MAP_PT.keys():
            word_value = CONTRACTION_MAP_PT[word]
            list_Of_tokens[list_Of_tokens.index(word)] = word_value
            
#   Converting list of tokens to String.
    String_Of_tokens = ' '.join(i for i in list_Of_tokens) 
    return String_Of_tokens        

In [29]:
expand_contractions('Vamos pra praia cmg qdo vc sair do trabalho, bjs e obg')

'Vamos para praia comigo quando voce sair do trabalho, bjs e obrigado'

## Remove special characters  

In [30]:
def removing_special_characters(tweet):
    """
    Removing all the special characters except the one that is passed within 
    the regex to match, as they have important meaning in the text provided.
    Also remove all html tags, hashtags but keepimg the word after (#covid)
   
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Text with removed special characters that don't require.
        
    Example: 
    Input : <IMG> K-a-j-a-l. #COVID #NBA Thi*s is $100.05 : @BRASIL @Barcelona recieve! (Is this okay?) 
    Output :  Hello, Kajal. This is $100.05 : the payment that you will recieve! Is this okay?
    
    """
    # The formatted text after removing not necessary punctuations.
    Formatted_tweet = re.sub(r'@[A-Za-z0-9_]+','',tweet)  # remove @mentions  #[^a-zA-Z0-9:$-,%.?!]
    Formatted_tweet = re.sub(r'#',' ',Formatted_tweet)  # remove hashtags   
    Formatted_tweet = re.sub(r"<[A-Za-z0-9_]+>",'',tweet)  # remove tag's
    # In the above regex expression,I am providing necessary set of punctuations that are frequent in this particular dataset.
    return Formatted_tweet



In [31]:
removing_special_characters('<IMG> K-a-j-a-l. #COVID #NBA Thi*s is $100.05 : @BRASIL @Barcelona recie<IMG>ve! (Is this okay?)')

' K-a-j-a-l. #COVID #NBA Thi*s is $100.05 : @BRASIL @Barcelona recieve! (Is this okay?)'

## Remove stopwords

In [32]:
# How to print Stopwords
# from nltk.corpus import stopwords.
# stops = set(stopwords.words('portuguese'))
# print(stops)

In [33]:
# The code for removing stopwords
def removing_stopwords(tweet):
    """
    This function will remove stopwords which doesn't add much meaning to a sentence 
    & they can be remove safely without comprimising meaning of the sentence.
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Text after omitted all stopwords.
        
    Example: 
    Input : hoje estou Barcelona está todos estamos voces estão agente ontem estive
    Output : hoje Barcelona todos voces agente ontem 
    
    """
    tweet = ' '.join([word for word in tweet.split() if word not in (stopwords.words('portuguese'))])
    return tweet

In [34]:
removing_stopwords('hoje estou Barcelona está todos estamos voces estão agente ontem estive')
   

'hoje Barcelona todos voces agente ontem'

### Dealing with stopwords... 

In [35]:
# lets see the stop word list present in the NLTK library, without adding our custom list
# Print the list of available languages
# print(stopwords.fileids())

# add custom list to stopword list of nltk
# stopwords = nltk.corpus.stopwords.words('english')
# stopwords.extend(new_stopwords)

# Add a list of words
# english_stopwords.extend(['food', 'meal', 'eat'])

# english_stopwords = stopwords.words('english')
portuguese_stopwords = stopwords.words('portuguese')
portuguese_stopwords

['de',
 'a',
 'o',
 'que',
 'e',
 'é',
 'do',
 'da',
 'em',
 'um',
 'para',
 'com',
 'não',
 'uma',
 'os',
 'no',
 'se',
 'na',
 'por',
 'mais',
 'as',
 'dos',
 'como',
 'mas',
 'ao',
 'ele',
 'das',
 'à',
 'seu',
 'sua',
 'ou',
 'quando',
 'muito',
 'nos',
 'já',
 'eu',
 'também',
 'só',
 'pelo',
 'pela',
 'até',
 'isso',
 'ela',
 'entre',
 'depois',
 'sem',
 'mesmo',
 'aos',
 'seus',
 'quem',
 'nas',
 'me',
 'esse',
 'eles',
 'você',
 'essa',
 'num',
 'nem',
 'suas',
 'meu',
 'às',
 'minha',
 'numa',
 'pelos',
 'elas',
 'qual',
 'nós',
 'lhe',
 'deles',
 'essas',
 'esses',
 'pelas',
 'este',
 'dele',
 'tu',
 'te',
 'vocês',
 'vos',
 'lhes',
 'meus',
 'minhas',
 'teu',
 'tua',
 'teus',
 'tuas',
 'nosso',
 'nossa',
 'nossos',
 'nossas',
 'dela',
 'delas',
 'esta',
 'estes',
 'estas',
 'aquele',
 'aquela',
 'aqueles',
 'aquelas',
 'isto',
 'aquilo',
 'estou',
 'está',
 'estamos',
 'estão',
 'estive',
 'esteve',
 'estivemos',
 'estiveram',
 'estava',
 'estávamos',
 'estavam',
 'estivera'

In [36]:
# Remove a single word
portuguese_stopwords.remove('não')

In [37]:
# Check if it works...
'não'in portuguese_stopwords

False

In [38]:
# Add a single word
portuguese_stopwords.append('tua')

In [39]:
# Check if it works...
'tua'in portuguese_stopwords

True

In [40]:
# Create our custom stopword list to add
# Custom StopWords portuguese

our_stopwords=['a','ah','g','h', 'd','ca','te','tu','tua','tuas','um','uma','voce','voces','vos', 'la','lo','lá',
               'as','ao','aos','aquela','aquelas','aquele','aqueles','aquilo','as','ate','com','como','da','das',
               'de','dela','delas','dele','deles','depois','do','dos','e','ela','elas','ele','eles','em','entre',
               'essa','essas','esse','esses','eu','for','isso','isto','já','lhe','lhes','me','mesmo','meu','meus',
               'minha','minhas','muito','na','nas','no','nos','nossa','nossas','nosso','nossos','num','numa',
               'nós','oh','o','os','para','pela','pelas','pelo','pelos','por','qual','quando','que','quem',
               'se','sem','seu','seus','somos','sou','sua','suas','so','tambem', 'mas','ou', 'nem',
               'este','teu','teus','estes','estas','agora','ai','alem','algo','alguém','algum','ainda',
               'alguma','algumas','alguns', 'ali','ampla','amplas', 'amplo', 'amplos','ante', 'antes','apenas',
               'apoio','após','aqui','aquilo','assim','atrás','através','bastante','breve','cada', 'cedo', 'cento',
               'certamente','certeza','cima','coisa','coisas','da','dao','daquela', 'daquelas','daquele',
               'daqueles','dentro','contudo','debaixo','demais','depois','desde','dessa','dessas','desse','desses',
               'desta','destas','deste','destes','embora','enquanto','entre','etc','feita','feitas','feito',
               'feitos','for','fora','geral','grande','grandes','hoje', 'hora', 'horas', 'longe',
               'lugar', 'maior','maioria','mais','meio', 'menor', 'menos', 'mes', 'meses','mesma', 'mesmas',
               'mesmo', 'mesmos','muita', 'muitas','muito','muitos','naquela', 'naquelas', 'naquele', 'naqueles',
               'nessa', 'nessas', 'nesse', 'nesses', 'nesta', 'nestas', 'neste', 'nestes','num', 'numa','onde',
               'ontem','perto','parte','outra', 'outras', 'outro', 'outros', 'pois', 'porém', 'porque',
               'possivel', 'possivelmente','pouca', 'poucas', 'pouco', 'poucos', 'primeira', 'primeiras',
               'primeiro', 'primeiros','propria','proprias','proprio', 'proprios', 'proxima', 'proximas',
               'proximo', 'proximos','quais', 'quanto', 'quantos','quem','sempre','si', 'sido','sob', 'sobre',
               'tal', 'talvez','tampouco', 'tanta', 'tantas','tanto', 'tao', 'tarde', 'te', 'todo', 'todos',
               'toda', 'todas','tudo', 'ultima', 'ultimas', 'ultimo', 'ultimos','vários','vez', 'vezes',]



## Correct mis-spelled words in text

In [41]:
# FUNCIONA MUY BIEN EN INGLES ... COMBINANDO CON EL nltk.stem.WordNetLemmatizer()
# The code for spelling corrections 
def spelling_correction(tweet):
    ''' 
    This function will correct spellings.
    
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Text after corrected spellings.
        
    Example: 
    Input : voc e eu naum gostanos de brencar na rua sin salda
    Output : você e eu num gostamos de brincar na rua se saia
    
    '''
    # Check for spellings in Portuguese language
    spell = Speller(lang='pt')  # English = 'en'
    Corrected_tweet = spell(tweet)
    return Corrected_tweet


In [42]:
spelling_correction('voc e eu naum gostanos de brencar na rrua sm saia')

'você e eu num gostamos de brincar na rua se saia'

## Lemmatization

In [43]:
# La lematización analiza el texto circundante para 
# determinar la parte del discurso de una palabra dada, no clasifica las frases.

In [44]:
nlp = stanza.Pipeline('pt')  # set the language for Portuguese (stanza)
# Stanza it is built with highly accurate neural network components that enable efficient 
# training and evaluation with your own annotated data,

# The code for lemmatization
def lemmatization(tweet):
    """
    This function converts word to their root words 
    without explicitely cut down as done in stemming.
   
    arguments:
        input_tweet: "text" of type "String".
         
    return:
        value: Text having root words only, no tense form, no plural forms
        
    Example: 
    Input : brincando treino cantei jogarei subindo agredido
    Output : 'brincar treinar cantar jogar subir agredir '
    
    """
    lemma = ""
    for sent in nlp(tweet).sentences:
        for word in sent.words:
            lemma += word.lemma + " "
    return lemma

2022-03-24 04:07:19 INFO: Loading these models for language: pt (Portuguese):
| Processor | Package |
-----------------------
| tokenize  | bosque  |
| mwt       | bosque  |
| pos       | bosque  |
| lemma     | bosque  |
| depparse  | bosque  |

2022-03-24 04:07:19 INFO: Use device: cpu
2022-03-24 04:07:19 INFO: Loading: tokenize
2022-03-24 04:07:19 INFO: Loading: mwt
2022-03-24 04:07:19 INFO: Loading: pos
2022-03-24 04:07:19 INFO: Loading: lemma
2022-03-24 04:07:19 INFO: Loading: depparse
2022-03-24 04:07:20 INFO: Done loading processors!


In [45]:
lemmatization('brincando treinou cantei jogarei subindo agredido')

'brincar treinar cantar jogar subir agredir '

## Stemming

In [46]:
# DESACTIVATE - We are using just Lemmatization for the moment...
# stemmezation = False
# The code for stemmezation
stemmer = RSLPStemmer()  # A stemmer for Portuguese
def stemmezation(tweet):
    """
    This function essentially chops 
    off letters from the end until the stem is reached
    it helps if the search returns variations of the word
       
    arguments:
         input_tweet: "text" of type "String".
         
    return:
        value: Text having root words only, no tense form, no plural forms
        
    Example: 
    Input : amor amante amando amar amado amei amore amamos amarei amo 
    Output : am  am     am     am   am    ame  amor  am     am     amo
    
   """
    # Converting words to their root forms
    for token in tweet.split():
        print(stemmer.stem(token))
    return stemmer

In [47]:
stemmezation('amor amante amando amar amado amei amore amamos amarei amo')

am
am
am
am
am
ame
amor
am
am
amo


<nltk.stem.rslp.RSLPStemmer at 0x14e91b1f0>

In [48]:
def demo():
    from nltk import stem
    stemmer = stem.RSLPStemmer() 

    # white-space tokenizer friendly text

    tweet_token = tweet.split()
  
    for token in tweet_token:
        word = token
        print(stemmer.stem(token))
    
        return stemmer

In [49]:
demo()

q


<nltk.stem.rslp.RSLPStemmer at 0x14e91b0d0>

## Putting all in single function

In [50]:
# Writing main function to merge all the preprocessing steps.

def text_preprocessing(tweet, lowercase=True, links=True, remove_html=False, 
                       numbers=True, special_chars=True, repeatition=True, 
                       newlines_tabs=True, punctuation=True, extra_whitespace=True, 
                       contractions=True, mis_spell=True, stop_words=True, 
                       lemmatization_word=True, stemmezation = False, 
                       accented_chars=True):
    """
    This function will preprocess input text and return
    the clean text.
    """
        
    if lowercase == True:  # convert all characters to lowercase.
         tweet = lower_casing_text(tweet)
            
    if links == True:  # remove links.
        tweet = remove_links(tweet) 
        
    if remove_html == True: # remove html tags   # **DESACTIVATE
        Data = strip_html_tags(Data)

    if numbers == True:  # remove all numbers.
        tweet = remove_number(tweet)
    
    if special_chars == True:  # remove all special characters.
        tweet = removing_special_characters(tweet)
        
    if repeatition == True:  # reduce repeatitions.   
        tweet = reducing_error_char_repeatation(tweet)
        
    if newlines_tabs == True:  # remove newlines & tabs.
        tweet = remove_newlines_tabs(tweet)
        
    if punctuation == True:  # remove punctuation.   
        tweet = remove_punctuation(tweet)
        
    if extra_whitespace == True:  # remove extra whitespaces.
        tweet = remove_whitespaces(tweet)
        
    if contractions == True: # expand contractions.
        tweet = expand_contractions(tweet)
        
    spell = Speller(lang='pt')
    
    if mis_spell == True: # check for mis-spelled words & correct them.
        tweet = spelling_correction(tweet)
        
    stoplist = stopwords.words('portuguese') 
    stoplist = set(stoplist)
    
    if stop_words == True:  # remove stopwords.
        tweet = removing_stopwords(tweet)
        
    if lemmatization_word == True:  # converts words to lemma form.
        tweet = lemmatization(tweet)  
                
    if accented_chars == True:  # remove accented characters.
        tweet = accented_characters_removal(tweet)
        
    word_tokens = word_tokenize(tweet) # tokenize tweet.

#     stemmer = RSLPStemmer()      # **DESACTIVATE
#   if stemmezation_word == True:  # converts words to stemmer form.  
#         df = stemmezation(df)
    
           
    return word_tokens

In [51]:
tweet

'q lindo &lt;3 fizeram um zine/art com 30 artistas desenhando o kirishima e todo o dinheiro arrecadado vai pra uma instituição que cuida de crianças que sofrem bullying, abuso e violência. pegaram a historia dele e fizeram um projeto pra ajudar crianças :) https://t.co/M2436o8y6E'

In [52]:
text_preprocessing(tweet)

['lir',
 'ela',
 'fazer',
 'zune',
 'art',
 'artista',
 'desenhar',
 'kirishima',
 'todo',
 'dinheiro',
 'arrecadar',
 'ir',
 'instituicao',
 'cuidar',
 'crianca',
 'sofrer',
 'bullying',
 'abuso',
 'violencia',
 'pegar',
 'historia',
 'fazer',
 'projeto',
 'ajudar',
 'crianca']

## To be continue ..

In [53]:
# Pre-processing for Content

# List_Content = df['tweet_text'].to_list()

# Final_Article = []
# Complete_Content = []

# for article in List_Content:
#     Processed_Content = text_preprocessing(article) # Cleaned text of Content attribute after pre-processing
#     Final_Article.append(Processed_Content)

# Complete_Content.extend(Final_Article)
# df['Processed_Content'] = Complete_Content

# # Pre-processing for Title

# List_Title = df['Title'].to_list()

# Final_Title = []
# Complete_Title = []

# for title in List_Title:
#     Processed_Title = text_preprocessing(title) # Cleaned text of Title attribute after pre-processing
#     Final_Title.append(Processed_Title)

# Complete_Title.extend(Final_Title)
# df['Processed_Title'] = Complete_Title 


In [54]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?']

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()


print(X.shape)


(4, 9)
