# Trabalho de conclusão de curso
## Comparativo de análise de sentimentos em posts do Twitter/Reddit relacionados a ações (stocks)

# Parte 2 - Pré-Processamento

## Preparando o ambiente

In [None]:
# Helpers: install the third-party packages imported further down
# (`timely` provides Stopwatch, `contractions` provides contractions_dict).

!pip install timely --quiet
!pip install contractions --quiet

  Building wheel for timely (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 327kB 5.8MB/s 
[K     |████████████████████████████████| 266kB 29.0MB/s 
[?25h  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone


In [None]:
import math
import pandas as pd

from timely import Stopwatch
from datetime import datetime, timedelta

<Figure size 1728x1152 with 0 Axes>

In [None]:
# from google.colab import drive

# drive.mount('/content/drive')

Mounted at /content/drive


## Pré-processamento dos dados

In [None]:
# Load the raw posts; the first CSV column is used as the frame's index.
dfDados = pd.read_csv(filepath_or_buffer='tsla_2019_data.csv', index_col=0)

### Básico

In [None]:
import re
import unicodedata
from bs4 import BeautifulSoup

In [None]:
def remove_special_twitter(text):
  '''Strip Twitter-specific markup from a post.

  Deletes every @mention, drops the "#" sign (the hashtag word itself is
  kept) and removes every "RT : " sequence, which is what a retweet header
  looks like once its mention has been deleted.

  Args:
    text: the post as a string.

  Returns:
    The post with the three kinds of markup removed.
  '''
  twitter_patterns = (
    r'@[A-Za-z0-9_]+',  # mentions
    r'#',               # hashtag marker only, the word stays
    r'RT : ',           # retweet header left after the mention is gone
  )
  for twitter_pattern in twitter_patterns:
    text = re.sub(twitter_pattern, '', text)
  return text

def remove_special_characters(text, remove_digits=False):
  '''Remove every character that is not an ASCII letter, digit or whitespace.

  The characters { } . ( ) ! are first surrounded by spaces so that the
  words on either side of them stay separated once the punctuation is
  deleted. Other punctuation (including "-") is deleted without padding,
  so "well-known" becomes "wellknown".

  Args:
    text: the string to clean.
    remove_digits: when True, digits are removed as well. Defaults to
      False (digits are kept).

  Returns:
    The cleaned string. Whitespace is not collapsed here.
  '''
  # Same character set as the previous "[{.(-)!}]" class, where "(-)" was a
  # range covering only "(" and ")"; spelled out to avoid the confusion.
  text = re.sub(r'([{}.()!])', r' \1 ', text)

  # "A-Z", not "A-z": the latter also spans [ \ ] ^ _ ` and let them through.
  allowed = r'a-zA-Z\s' if remove_digits else r'a-zA-Z0-9\s'
  return re.sub('[^' + allowed + ']', '', text)

def remove_accent(text):
  '''Reduce a text to plain ASCII.

  The text is decomposed with Unicode NFKD normalization, then every code
  point that cannot be encoded as ASCII is discarded. Accented letters
  therefore keep their base letter, while symbols with no ASCII
  decomposition disappear.

  Args:
    text: the string to convert.

  Returns:
    The ASCII-only string.
  '''
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

def strip_html_tags(text):
  '''Remove links and HTML markup from a text.

  Steps, in this order: delete every http:// or https:// URL (up to the
  next whitespace), collapse each whitespace run into a single space, then
  parse the remainder with BeautifulSoup's "html.parser" and return only
  its text content.

  Args:
    text: the string to clean.

  Returns:
    The text content as returned by BeautifulSoup's get_text().
  '''
  without_links = re.sub(r"http[s]?://\S+", "", text)
  single_spaced = re.sub(r"\s+", " ", without_links)
  return BeautifulSoup(single_spaced, "html.parser").get_text()


### Stop Words

In [None]:
import nltk
import spacy

from contractions import contractions_dict
from nltk.tokenize.toktok import ToktokTokenizer

# Tokenizer used by remove_stopwords.
tokenizer = ToktokTokenizer()

# Load the English pipeline by its package name. The bare 'en' shortcut only
# resolves through a spaCy 2.x shortcut link and fails on spaCy 3+, whereas
# the package name is accepted by both.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (the default target
# of `spacy download en`) - confirm if a larger model was intended.
nlp = spacy.load('en_core_web_sm')

# Fetch the NLTK stop word corpus and keep the English list for remove_stopwords.
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def remove_stopwords(text):
  '''Drop stop words from a text.

  The text is split with the module-level `tokenizer`; each token is
  stripped of surrounding whitespace and discarded when its lowercase form
  appears in the module-level `stopword_list`.

  Args:
    text: the string to filter.

  Returns:
    The surviving tokens (original casing) joined by single spaces.
  '''
  # Build the lookup set once per call: membership in a set is a hash lookup,
  # whereas testing every token against the list scans the whole list.
  stopwords = set(stopword_list)

  kept_tokens = []
  for token in tokenizer.tokenize(text):
    token = token.strip()
    if token.lower() not in stopwords:
      kept_tokens.append(token)
  return ' '.join(kept_tokens)

def lemmatize(text):
  '''Replace every token of a text by its lemma.

  The text is processed with the module-level spaCy pipeline `nlp`. A token
  whose lemma is the literal '-PRON-' keeps its original text instead
  (presumably the pronoun placeholder emitted by spaCy 2.x - confirm against
  the installed version).

  Args:
    text: the string to lemmatize.

  Returns:
    The lemmas joined by single spaces.
  '''
  lemmas = []
  for token in nlp(text):
    if token.lemma_ == '-PRON-':
      lemmas.append(token.text)
    else:
      lemmas.append(token.lemma_)
  return ' '.join(lemmas)

def stemmer(text):
  '''Reduce every whitespace-separated word of a text to its Porter stem.

  Args:
    text: the string to stem.

  Returns:
    The stems joined by single spaces.
  '''
  porter = nltk.porter.PorterStemmer()
  stems = map(porter.stem, text.split())
  return ' '.join(stems)

def expand_contractions(text, contraction_mapping=contractions_dict):
  '''Replace contractions by their expanded form, then drop apostrophes.

  Every key of `contraction_mapping` found in `text` as a standalone token
  (not glued to a letter, digit or underscore on either side) is replaced
  by its mapped value. Matching is case-insensitive. When the contraction
  and its expansion start with the same letter, the casing typed in the
  text is kept ("Can't" gives "Cannot"); otherwise the expansion is
  inserted as stored in the mapping. Finally every remaining straight
  apostrophe is removed.

  Args:
    text: the string to process. A non-string value is returned unchanged.
    contraction_mapping: dict mapping contraction -> expansion. Defaults to
      `contractions_dict` from the `contractions` package.

  Returns:
    The expanded text without straight apostrophes.
  '''
  if not isinstance(text, str):
    # Best effort, as before: anything that is not text is handed back as is.
    return text

  # Longest keys first, so that a long contraction is not shadowed by a
  # shorter one that happens to be its prefix.
  keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)

  if keys:
    # Case-insensitive fallback for lookups: the regex ignores case, so the
    # matched text may differ in casing from the key stored in the mapping.
    lowered_mapping = {key.lower(): value for key, value in contraction_mapping.items()}

    # re.escape: keys are literal text. Left unescaped, a key containing a
    # regex metacharacter such as "." would match unrelated text and the
    # subsequent lookup would fail.
    contractions_pattern = re.compile(
      r'(?<!\w)(?:{})(?!\w)'.format('|'.join(re.escape(key) for key in keys)),
      flags=re.IGNORECASE)

    def expand_match(contraction):
      match = contraction.group(0)
      expanded = contraction_mapping.get(match) or lowered_mapping.get(match.lower())
      if not expanded:
        # No usable expansion: leave the matched text alone.
        return match
      if match[0].lower() == expanded[0].lower():
        # Preserve the capitalization used in the text.
        expanded = match[0] + expanded[1:]
      return expanded

    text = contractions_pattern.sub(expand_match, text)

  return re.sub("'", "", text)


### Limpeza completa

In [None]:
def clean(textlist):
  '''Run the full normalization pipeline over a collection of texts.

  For each text, in order: line breaks (and "|") become spaces and runs of
  spaces are collapsed; Twitter markup, links and HTML are removed;
  contractions are expanded; accents and special characters are removed;
  stop words are dropped; tokens are lemmatized; the result is lowercased,
  trimmed and has its whitespace collapsed.

  Args:
    textlist: iterable of texts. An entry that is not a string (for example
      a missing value) is treated as an empty text.

  Returns:
    A list with one cleaned string per input entry, in the same order.
  '''
  cleanedTextList = []
  for doc in textlist:
    if not isinstance(doc, str):
      # Keep output aligned with the input; blank texts can be dropped later.
      doc = ''

    # Same character set as the previous "[\r|\n|\r\n]" class, which (being a
    # character class) also matched a literal "|".
    doc = re.sub(r'[\r\n|]+', ' ', doc) # line breaks and pipes -> space
    doc = re.sub(' +', ' ', doc) # collapse runs of spaces

    doc = remove_special_twitter(doc) # Twitter mentions / hashtag signs / RT header
    doc = strip_html_tags(doc) # links and HTML markup

    # Contractions have to be expanded while the apostrophes are still in the
    # text and before stop word removal; done later, the apostrophes are
    # already gone and nothing is ever expanded.
    doc = doc.replace('\u2019', "'") # typographic apostrophe -> straight one
    doc = expand_contractions(doc) # expand contractions

    doc = remove_accent(doc) # accents
    doc = remove_special_characters(doc) # special characters

    doc = remove_stopwords(doc) # stop words
    # doc = stemmer(doc) # stemming (alternative to lemmatization)
    doc = lemmatize(doc) # lemmatization

    doc = doc.lower() # lowercase
    doc = doc.strip() # trim leading / trailing whitespace
    doc = re.sub(r'\s+', ' ', doc) # collapse remaining whitespace

    cleanedTextList.append(doc)
  return cleanedTextList

In [None]:
def dropRows(df):
  '''Return the rows of a frame that carry a distinct, non-blank CleanText.

  Among rows sharing the same 'CleanText' only the last one is kept; rows
  whose 'CleanText' is empty or whitespace-only are then discarded. The
  input frame is not modified.

  Args:
    df: DataFrame with a string column named 'CleanText'.

  Returns:
    The filtered DataFrame (original index labels are preserved).
  '''
  deduplicated = df.drop_duplicates(subset='CleanText', keep='last')

  # An empty string is falsy, so the boolean cast flags the blank texts.
  has_text = deduplicated['CleanText'].str.strip().astype(bool)
  return deduplicated.loc[has_text]

In [None]:
def cleanAll(df):
  with Stopwatch() as s:
    df['CleanText'] = clean(df['Text'])
    df = dropRows(df)
  print(f'Took {s.duration()}\n')

  print(f'{df.shape}\n')

In [None]:
cleanAll(dfDados)
dfDados.head()

Took 6747.939933

(590063, 5)



Unnamed: 0,Created At,Name,Text,Source,CleanText
0,2019-01-01 23:59:21,alexandrosM,Inspired by thoughts and info I read via @Valu...,Twitter,inspired thought info read via many tsla bull ...
1,2019-01-01 23:58:56,Trumpery45,@ShortingIsFun The road stress simulator at 36...,Twitter,road stress simulator 3625 thing beauty tesla ...
2,2019-01-01 23:58:20,certifiedlink_,@JTSEO9 tsla should have reported a going conc...,Twitter,tsla report go concern get jippe haha
3,2019-01-01 23:57:43,smartertrader,Tsla. Very simple trade. If they 352 gets rippy.,Twitter,tsla simple trade 352 get rippy
4,2019-01-01 23:57:20,ShortingIsFun,@kzdorman @Tesla @elonmusk What are people doi...,Twitter,people kill time total like 1 5 hour combine w...


In [None]:
# Persist the processed posts for the next stage of the study.
dfDados.to_csv(path_or_buf='tsla_2019_clean.csv')