### CARREGAR DADOS

In [None]:
import pandas as pd
import numpy as np

: 

In [None]:
# Read the raw Reddit exports; both paths are relative to the working directory.
comments, submissions = (
    pd.read_csv(csv_path)
    for csv_path in ('raw/comments_all.csv', 'raw/submissions_all.csv')
)

# Report the (rows, columns) shape of each table.
print('Comments shape:', comments.shape)
print('Submissions shape:', submissions.shape)

# List the available column names of each table.
print('Comments columns:', list(comments.columns))
print('Submissions columns:', list(submissions.columns))

In [None]:
# Preview the comment text next to its id.
comments.loc[:, ['body', 'id']]


In [None]:
# Preview the submission text next to its id.
submissions.loc[:, ['selftext', 'id']]

### FILTRAGEM INICIAL


In [None]:
import re
import unicodedata
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus (quiet=True suppresses the download log).
nltk.download('stopwords', quiet=True)
# Portuguese stopwords held in a set for O(1) membership tests.
# NOTE(review): entries are stored exactly as NLTK ships them (presumably
# including accented forms); anything compared against accent-stripped text
# needs the entries normalized the same way - confirm.
stopwords_pt = set(stopwords.words('portuguese'))

In [None]:
# Accent-stripped copy of the stopword list. tokenize() transliterates the text
# to ASCII *before* filtering, so the stopwords must be in the same form;
# otherwise accented entries (e.g. "não", tokenized as "nao") never match.
_STOPWORDS_PT_ASCII = {unidecode(word) for word in stopwords_pt}


def tokenize(text):
    """Split *text* into normalized, filtered word tokens.

    Steps: lowercase, transliterate to ASCII (drops accents), delete every
    character that is not a letter, digit or whitespace, split on whitespace,
    then discard Portuguese stopwords and tokens of two characters or fewer.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of tokens in their original order.
    """
    text = text.lower()  # lowercase
    text = unidecode(text)  # remove accents

    # Punctuation is deleted, not replaced by a space, so hyphenated words
    # fuse into one token ("cessar-fogo" -> "cessarfogo").
    text = re.sub(r'[^0-9A-Za-z\s]', '', text)

    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace.
    return [
        word
        for word in text.split()
        if len(word) > 2 and word not in _STOPWORDS_PT_ASCII
    ]


# Quick sanity check of the tokenizer on accented, punctuated, multi-line input.
print(tokenize("OLÀ, tudo bem? \n\nEu sou um texto com caracteres unicode: ñ, é, ç, ü, jão, etc."))

# Make sure both text columns hold strings before tokenizing. Missing values
# are replaced by '' first: astype(str) alone would turn NaN into the literal
# string 'nan', which then survives tokenization as a bogus token.
comments['body'] = comments['body'].fillna('').astype(str)
submissions['selftext'] = submissions['selftext'].fillna('').astype(str)

# Tokenize the 'body' column in comments and the 'selftext' column in submissions.
comments['tokenized_body'] = comments['body'].apply(tokenize)
submissions['tokenized_selftext'] = submissions['selftext'].apply(tokenize)
  

In [None]:
def contains_seed_words(tokenized_text, seed_words, min_count=2):
    """Tell whether a token list mentions seed words often enough.

    Every occurrence counts, so the same seed word appearing twice satisfies
    ``min_count=2``.

    Args:
        tokenized_text: Iterable of tokens to inspect.
        seed_words: Collection of seed words to look for.
        min_count: Minimum number of seed-word occurrences required.

    Returns:
        True if at least ``min_count`` tokens are seed words, else False.
    """
    # A list/tuple makes every membership test a linear scan; hash it once.
    if isinstance(seed_words, (list, tuple)):
        try:
            seed_words = frozenset(seed_words)
        except TypeError:
            pass  # unhashable entries: keep the original container

    hits = 0
    for word in tokenized_text:
        if word in seed_words:
            hits += 1
            # Stop scanning as soon as the threshold is met.
            if hits >= min_count:
                return True
    # Also covers min_count <= 0 and inputs with too few matches.
    return hits >= min_count

In [None]:

# Seed words for the first-pass topical filter. Entries are matched against
# the output of tokenize(), so each one must be a single lowercase,
# accent-free, punctuation-free token longer than two characters.
seed_words = [
    'guerra', 'guerras',
    'ucrania', 'ucraniano', 'ucraniana', 'ucranianos', 'ucranianas',
    'russia', 'russa', 'russo', 'russos', 'russas',
    'conflito', 'conflitos',
    'putin', 'zelensky',
    'invasao', 'invasoes',
    'tropa', 'tropas',
    'nato', 'otan',
    'sancao', 'sancoes',
    'embargo', 'embargos',
    'bombardeio', 'bombardeios',
    'ocupacao', 'ocupacoes',
    'resistencia',
    'ofensiva', 'ofensivas',
    'defesa', 'defesas',
    'tanque', 'tanques',
    'blindado', 'blindados',
    # tokenize() deletes '-' and '_', so "cessar-fogo" reaches the filter as
    # the fused token below; the hyphen/underscore spellings could never match.
    'cessarfogo',
    'diplomacia',
    'negociacao', 'negociacoes',
    'dialogo', 'paz',
    'refugiado', 'refugiada', 'refugiados', 'refugiadas',
    'crimeia', 'crimea',
    'donetsk', 'luhansk', 'mariupol',
    'kiev', 'kyiv',
    # NOTE(review): these two entries can never match - tokenize() drops
    # tokens of two characters or fewer and splits on whitespace. Kept only
    # to record intent; 'europeia' below already covers "uniao europeia".
    'ue', 'uniao europeia',
    'europeu', 'europeia', 'europeus', 'europeias',
]

# Hash the seed words once so each per-token membership test is O(1).
seed_lookup = set(seed_words)

comments['contains_seed_words'] = comments['tokenized_body'].apply(
    lambda tokens: contains_seed_words(tokens, seed_lookup)
)
submissions['contains_seed_words'] = submissions['tokenized_selftext'].apply(
    lambda tokens: contains_seed_words(tokens, seed_lookup)
)

# Keep only the rows flagged as containing enough seed words.
filtered_submissions = submissions[submissions['contains_seed_words']]
filtered_comments = comments[comments['contains_seed_words']]

print('Filtered Submissions shape:', filtered_submissions.shape)
print('Filtered Comments shape:', filtered_comments.shape)


In [None]:
# Show full cell contents instead of truncating long texts.
pd.set_option('display.max_colwidth', None)
# Display the first 2 filtered submissions.
filtered_submissions[['selftext']].head(2)

In [None]:
# Display the first 2 filtered comments.
filtered_comments[['body']].head(2)

### TOP PALAVRAS MAIS FREQUENTES NA FILTRAGEM INICIAL

In [None]:
from collections import Counter

# Term frequency of every token across the filtered comments and submissions.
# Iterating the token columns directly avoids the per-row overhead of iterrows().
token_tf = Counter()
for token_lists in (
    filtered_comments['tokenized_body'],
    filtered_submissions['tokenized_selftext'],
):
    for tokens in token_lists:
        token_tf.update(tokens)

# (token, count) pairs from most to least frequent; ties keep first-seen order.
sorted_tokens = token_tf.most_common()

# top K tokens
K = 100
top__tokens = sorted_tokens[:K]

print(top__tokens)

In [None]:
# Additional context words (presumably picked by inspecting the most frequent
# tokens of the first-pass filter).
new_words = [
    'pais', 'eua', 'mundo', 'contra', 'paises', 'governo', 'area', 'poder',
    'estado', 'pro', 'povo', 'historia', 'historica', 'historico', 'populacao',
]

# Merge seed words and new words. A set union (rather than list + list) keeps
# this cell re-runnable: on a second run seed_words is already a set, and
# `set + list` would raise TypeError.
seed_words = set(seed_words) | set(new_words)

# Filter comments and submissions again based on the updated seed words.
comments['contains_seed_words'] = comments['tokenized_body'].apply(
    lambda tokens: contains_seed_words(tokens, seed_words)
)
submissions['contains_seed_words'] = submissions['tokenized_selftext'].apply(
    lambda tokens: contains_seed_words(tokens, seed_words)
)

final_filtered_comments = comments[comments['contains_seed_words']]
final_filtered_submissions = submissions[submissions['contains_seed_words']]

print('Filtered Comments shape after merging new words:', final_filtered_comments.shape)
print('Filtered Submissions shape after merging new words:', final_filtered_submissions.shape)

### VALIDAÇÃO

In [None]:
# Display the first 20 submissions of the first-pass filter for manual inspection.
# NOTE(review): this shows `filtered_submissions` (original seed words only),
# not `final_filtered_submissions` (after merging new_words) - confirm which
# result is meant to be validated here.
filtered_submissions[['selftext']].iloc[0:20]

### SAVE A CSV