Importações

In [2]:
import pandas as pd
import re
from pt_lemmatizer import Lemmatizer
import spacy
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 42
nlp = spacy.load("pt_core_news_sm")

Função de Detetcção de Linguagens utilizando langdetect

In [3]:

def detect_language(text):
    try:
        return detect(text)
    except:
        return "unknown"

Funções de Tratamento de Textos para Tweets

In [4]:
punctuation ="‘!”$%&\’()*+,-./:;<=>?[\\]^_`{|}~•@’"


def remove_links(tweet):
    """Takes a string and removes web links from it"""
    tweet = re.sub(r'http\S+', '', tweet)   # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet)  # remove bitly links
    tweet = tweet.strip('[link]')   # remove [links]
    tweet = re.sub(r'pic.twitter\S+','', tweet)
    return tweet

def remove_users(tweet):
    """Takes a string and removes retweet and @user information"""
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove re-tweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove tweeted at
    return tweet

def remove_hashtags(tweet):
    """Takes a string and removes any hash tags"""
    tweet = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove hash tags
    return tweet

def remove_av(tweet):
    """Takes a string and removes AUDIO/VIDEO tags or labels"""
    tweet = re.sub('VIDEO:', '', tweet)  # remove 'VIDEO:' from start of tweet
    tweet = re.sub('AUDIO:', '', tweet)  # remove 'AUDIO:' from start of tweet
    return tweet

  tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove re-tweet


Usamos os Spacy para dividir em Tokens

In [5]:
def tokenize(texto):
    """Tokeniza, remove stopwords e lematiza um texto em português"""
    doc = nlp(texto)
    return [
        token.lemma_.lower()
        for token in doc
        if not token.is_stop and token.is_alpha and len(token.text) > 2
    ]

Exemplo de Uso

In [6]:
print(tokenize("Os ratos roeram a roupa do rei de Roma."))
# Saída exemplo: ['rato', 'roer', 'roupa', 'rei', 'Roma']

['rato', 'roer', 'roupa', 'rei', 'roma']


Funções de PreProcessamento de Tweets

In [7]:
def preprocess_tweet(tweet):
    """Main master function to clean tweets, stripping noisy characters, and tokenizing use lemmatization"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub('\s+', ' ', tweet)  # remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet_token_list = tokenize(tweet)  # apply lemmatization and tokenization
    tweet = ' '.join(tweet_token_list)
    return tweet

def basic_clean(tweet):
    """Main master function to clean tweets only without tokenization or removal of stopwords"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub('\s+', ' ', tweet)  # remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet = re.sub('📝 …', '', tweet)
    return tweet

  tweet = re.sub('\s+', ' ', tweet)  # remove double spacing
  tweet = re.sub('\s+', ' ', tweet)  # remove double spacing


Aplicação no Dataframe de Tweets

In [8]:
def clean_tweets(df):
    """Main function to read in and return cleaned and preprocessed dataframe.
    """

    df['clean_tweet'] = df.Content.apply(basic_clean)
    num_tweets = len(df)
    print('Complete. Number of Tweets that have been cleaned and tokenized : {}'.format(num_tweets))
    return df



def tokenize_tweets(df):
    """Main function to read in and return cleaned and preprocessed dataframe.
    This can be used in Jupyter notebooks by importing this module and calling the tokenize_tweets() function
    Args:
        df = data frame object to apply cleaning to
    Returns:
        pandas data frame with cleaned tokens
    """

    df['tokens'] = df.Content.apply(preprocess_tweet)
    num_tweets = len(df)
    print('Complete. Number of Tweets that have been cleaned and tokenized : {}'.format(num_tweets))
    return df

Execução do Código

In [9]:
caminho_arquivo = r'C:\Users\Pessoal\Documents\Unifesp\Aries\aries_topic_selector\dados\dadosTwitter\CSV\tweets amoxilina - Página1.csv'
tweets_df = pd.read_csv(caminho_arquivo)
tweets_df.dropna(axis='columns', inplace=True)
print(tweets_df.columns)

#Escolhendo as colunas relevantes
tweets_df = tweets_df[['Tweet ID', 'Content', 'Date']]


#Só por precaução, removendo duplicatas
tweets_df.drop_duplicates(inplace=True, subset="Tweet ID")
tweets_df.drop_duplicates(inplace=True, subset="Content")

#print(tweets_df)


Index(['Tweet ID', 'URL', 'Content', 'Likes', 'Retweets', 'Replies', 'Quotes',
       'Views', 'Date'],
      dtype='object')


Agora, separamos os tweets por Idiomas utilizando a Biblioteca LangDetect

In [10]:
tweets_df = clean_tweets(tweets_df)
tweets_df["language"] = tweets_df["Content"].apply(detect_language)

#Criando dataframes separados por idioma

df_espanhol = tweets_df[tweets_df["language"] == "es"]
df_ingles = tweets_df[tweets_df["language"] == "en"]
df_portugues = tweets_df[~tweets_df["language"].isin(["es", "en"])]

df_espanhol.to_excel("tweets_processados_espanhol.xlsx", index=False)
df_ingles.to_excel("tweets_processados_ingles.xlsx", index=False)
df_portugues.to_excel("tweets_processados_portugues.xlsx", index=False)


Complete. Number of Tweets that have been cleaned and tokenized : 202


Agora, aplicas o Tokenizer no datframe em Portugues, que é o nosso foco

In [11]:
df_portugues = tokenize_tweets(df_portugues)

#Removendo tweets com menos de 6 palavras
df_portugues["word_count"] = df_portugues["tokens"].apply(lambda x: len(x.split()))
df_portugues = df_portugues[df_portugues["word_count"] >= 6]


df_portugues.to_excel("tweets_processados.xlsx", index=False)
print("Arquivo 'tweets_processados.xlsx' salvo com sucesso!")

Complete. Number of Tweets that have been cleaned and tokenized : 187
Arquivo 'tweets_processados.xlsx' salvo com sucesso!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokens'] = df.Content.apply(preprocess_tweet)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_portugues["word_count"] = df_portugues["tokens"].apply(lambda x: len(x.split()))
