In [None]:
from pyspark.sql import SparkSession

In [None]:
# Reuse the active Spark session if there is one; otherwise start a local one.
# The 'local[*]' master runs Spark on this machine with one worker thread per core.
spark: SparkSession = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

# Data import and processing

In [None]:
# Load the reviews CSV. The first line holds the column names, a double quote is
# the escape character inside quoted fields, and column types are inferred.
data = spark.read.csv(
    './data/imdb-reviews-pt-br.csv',
    header=True,
    escape='\"',
    inferSchema=True,
)

In [None]:
# Report the dataset dimensions; counting the rows runs a Spark job.
n_rows, n_cols = data.count(), len(data.columns)
print(f'Número de linhas: {n_rows} | Número de colunas: {n_cols}')

In [None]:
# Print the column names and the types inferred while reading the CSV.
data.printSchema()

In [None]:
# Preview the first rows (show() defaults to 20 rows with long values truncated).
data.show()

In [None]:
# Display the full, untruncated Portuguese text of the review whose id is 190.
review_190 = data.where(data['id'] == 190).select('text_pt')
review_190.show(truncate=False)

In [None]:
# Display the full, untruncated Portuguese text of the review whose id is 12427.
review_12427 = data.where(data['id'] == 12427).select('text_pt')
review_12427.show(truncate=False)

In [None]:
# Count the reviews per value of the `sentiment` column.
data.groupBy('sentiment').count().show()

# WordCloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Keep only the Portuguese text and draw a random sample of roughly 10% of the
# rows; the fixed seed makes the sample reproducible.
sample = (
    data
    .select('text_pt')
    .sample(fraction=0.1, seed=101)
)

In [None]:
all = [text['text_pt'] for text in sample.collect()]

In [None]:
wordcloud = WordCloud(
  background_color='white',
  width=1920,
  height=1080,
  collocations=False,
  prefer_horizontal=1
).generate(str(all))

In [None]:
# Draw the word cloud on a wide figure with the axes hidden.
fig, ax = plt.subplots(figsize=(20, 8))
ax.imshow(wordcloud)
ax.axis('off')
plt.show()

# Limpeza

In [None]:
import string

In [None]:
# Display the ASCII punctuation characters, as a reference for the cleaning step.
string.punctuation

In [None]:
# Small hand-made DataFrame of strings mixing text with many punctuation
# characters, used to check the cleaning expressions before applying them to
# the real data.
# NOTE: this rebinds `sample`, which until here held the sampled reviews.
sample = spark.createDataFrame(
  [
    ("Oi, JP! Blz?",),
    ("$$$\\ |~ Parabéns ~| \\$$$",),
    ("(#amovc #paz&amor ^.^)",),
    ("\"bora *_* \"",),
    ("=>->'...``` vc foi selecionad@ ´´´...'<=<-",),
    ("{comprar: arroz; feijão e pepino} //",),
    # The backslash is written as `\\` so the literal contains no invalid `\]`
    # escape sequence; the resulting text is unchanged.
    ("!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~",),
    ("ana@gmail.com",)
  ],
  ["texts"]
)

In [None]:
import pyspark.sql.functions as f

In [None]:
# Remove punctuation from the sample texts.
# The pattern is a raw string, so Python passes the backslashes to the (Java)
# regex engine untouched. Inside the character class the hyphen is escaped so it
# is a literal rather than a range operator, and `[` / `]` are included so
# square brackets are stripped like every other punctuation character.
sample = sample.withColumn(
    'text_regex',
    f.regexp_replace('texts', r"""[$#,"!%&'()*+\-./;:<=>?@\[\]^_`´{|}~\\]""", ''),
)

In [None]:
# Strip the leading and trailing spaces left behind by the punctuation removal.
sample = sample.withColumn(
    'clean_text',
    f.trim(f.col('text_regex')),
)

In [None]:
# Show original, regex-cleaned and trimmed texts side by side, untruncated.
sample.show(truncate=False)

In [None]:
# Remove punctuation from the English and Portuguese review texts.
# The pattern is a raw string, so Python passes the backslashes to the (Java)
# regex engine untouched. Inside the character class the hyphen is escaped so it
# is a literal rather than a range operator, and `[` / `]` are included so
# square brackets are stripped like every other punctuation character.
punctuation_pattern = r"""[$#,"!%&'()*+\-./;:<=>?@\[\]^_`´{|}~\\]"""
data = data.withColumn('text_en_regex', f.regexp_replace('text_en', punctuation_pattern, ''))
data = data.withColumn('text_pt_regex', f.regexp_replace('text_pt', punctuation_pattern, ''))

In [None]:
# Trim leading and trailing spaces from both punctuation-free text columns.
data = (
    data
    .withColumn('clean_text_en', f.trim(f.col('text_en_regex')))
    .withColumn('clean_text_pt', f.trim(f.col('text_pt_regex')))
)

In [None]:
# Inspect the first two rows, untruncated, to check the new cleaned columns.
data.limit(2).show(truncate=False)

# Tokenização

In [None]:
from pyspark.ml.feature import Tokenizer

In [None]:
# Tokenizer that reads the cleaned Portuguese text and writes the resulting
# word list to a new `tokens` column.
tokenizer = Tokenizer(
    inputCol='clean_text_pt',
    outputCol='tokens',
)

In [None]:
# Apply the tokenizer; the result is `data` plus the `tokens` column.
tokenized = tokenizer.transform(data)

In [None]:
# Compare each cleaned text with the tokens produced from it.
tokenized.select('clean_text_pt', 'tokens').show()

In [None]:
from pyspark.sql.types import IntegerType

def countTokens(tokens):
    """Return an integer column holding the number of elements in an array column.

    Uses Spark's built-in `size` function instead of a Python UDF, so the count
    is computed inside the JVM without serialising every row to a Python
    worker. A null array does not raise here; what `size` returns for null
    depends on the Spark configuration.

    Args:
        tokens: array column (or column name) whose elements are counted.

    Returns:
        A Column with the element count of `tokens`.
    """
    return f.size(tokens)

# Show each cleaned text with its tokens and the number of tokens.
(
    tokenized
    .select('clean_text_pt', 'tokens')
    .withColumn('freq_tokens', countTokens(f.col('tokens')))
    .show()
)

# StopWords

In [None]:
import nltk

# Fetch the NLTK stop-word corpus; it must be available before the corpus
# reader below is used.
nltk.download('stopwords')

from nltk.corpus import stopwords
# Portuguese stop words as a list of strings, used by the remover further down.
stop_A = stopwords.words('portuguese')

In [None]:
from pyspark.ml.feature import StopWordsRemover

In [None]:
# Transformer that drops the Portuguese stop words from `tokens` and writes the
# remaining words to `final_text`.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='final_text',
    stopWords=stop_A,
)

In [None]:
# Apply the stop-word remover; the result is `tokenized` plus `final_text`.
df = remover.transform(tokenized)

In [None]:
# Preview the DataFrame with all the columns derived so far.
df.show()

In [None]:
# Put the token lists next to their lengths before and after stop-word removal,
# to see how many words the remover dropped per review.
token_counts = df.select(
    'tokens',
    'final_text',
    countTokens(f.col('tokens')).alias('freq_tokens'),
    countTokens(f.col('final_text')).alias('freq_clean_tokens'),
)
token_counts.show()