In [1]:
import os
import re
import time
from urllib.parse import quote

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from dotenv import load_dotenv
from nltk.corpus import stopwords
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from wordcloud import WordCloud

In [None]:
# Log in to X with headless Chrome, run a live search for `query`, scroll the
# results a fixed number of times and save the matching tweets to
# ../data/tweets.csv (columns: Name, At, Tweet).

# override=True: by default python-dotenv leaves variables that already exist
# in the process environment untouched, and Unix shells predefine USER as the
# OS login name -- without the override the wrong username would be typed in.
load_dotenv(override=True)

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
driver = webdriver.Chrome(options=options)

try:
    # Fail early with a readable message instead of an obscure error raised
    # by send_keys(None) when a credential is missing.
    login_user = os.getenv('USER')
    login_password = os.getenv('PASSWORD')
    if not login_user or not login_password:
        raise ValueError("Defina USER e PASSWORD no arquivo .env")

    driver.get("https://x.com/login")
    time.sleep(5)

    username = driver.find_element(By.NAME, "text")
    username.send_keys(login_user)
    username.send_keys(Keys.RETURN)
    time.sleep(3)
    password = driver.find_element(By.NAME, "password")
    password.send_keys(login_password)
    password.send_keys(Keys.RETURN)
    time.sleep(5)
    # NOTE(review): this only means both forms were submitted; the page is
    # never checked to confirm that the login was actually accepted.
    print("Login realizado com sucesso")

    query = "SUA QUERY"
    # Percent-encode the query so spaces, '#', '&' etc. stay inside the `q`
    # parameter instead of being parsed as URL syntax.
    driver.get(f"https://x.com/search?q={quote(query, safe='')}&src=typed_query&f=live")
    wait = WebDriverWait(driver, 10)
    tweets = []
    users = []
    seen_tweets = set()
    scroll_attempts = 0
    max_scroll_attempts = 10

    while scroll_attempts < max_scroll_attempts:
        try:
            elements = wait.until(EC.visibility_of_all_elements_located(
                (By.XPATH, "//*[@data-testid='tweetText'] | //*[@data-testid='User-Name']")
            ))
        except TimeoutException:
            if not tweets:
                # Nothing was collected at all: surface the timeout as an error.
                raise
            # Keep what was already collected instead of discarding it.
            break

        # Author of the most recent User-Name node. Pairing relies on each
        # User-Name element preceding its tweetText in document order
        # (assumption about the page markup -- TODO confirm).
        current_user = None

        for el in elements:
            test_id = el.get_attribute("data-testid")

            if test_id == "User-Name":
                text = el.text.strip()
                # Expected text shape: "<display name>\n@<handle>..."
                match = re.search(r"(.+)\n(@\w+)", text)

                if match:
                    name = match.group(1).strip()
                    at = match.group(2).strip()
                    current_user = (name, at)

            elif test_id == "tweetText":
                tweet_text = el.text.strip()

                # Keep only tweets that literally contain the query, have a known
                # author and were not already captured on a previous scroll.
                if query.lower() in tweet_text.lower() and current_user and tweet_text not in seen_tweets:
                    users.append(current_user)
                    tweets.append(tweet_text)
                    seen_tweets.add(tweet_text)
                    current_user = None

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        scroll_attempts += 1
        print(f'{scroll_attempts} / {max_scroll_attempts}')

    df = pd.DataFrame(users, columns=['Name', 'At'])
    df['Tweet'] = tweets
    # Make sure the output directory exists before writing.
    os.makedirs('../data', exist_ok=True)
    df.to_csv('../data/tweets.csv', index=False)
    print(df.head())

except Exception as e:
    print("Erro ao buscar tweets:", e)
finally:
    # Always release the browser, including on KeyboardInterrupt.
    driver.quit()

In [None]:
# Make sure the NLTK stopword corpus is available, then load the Portuguese list.
nltk.download('stopwords')
stopwords_pt = set(stopwords.words('portuguese'))

# Informal / internet-slang Portuguese tokens filtered out in addition to the
# NLTK list. Stored as a set because it is only used for membership tests;
# entries that were repeated in the original list appear once here.
internet_stopwords = {
    "q", "pq", "p", "pra", "pro", "ta", "tá", "né", "blz", "vlw", "obg", "obrigado",
    "obrigada", "td", "tudo", "c", "vc", "vcs", "tb", "tbm", "tbem", "aki", "aqui",
    "kd", "cadê", "d", "de", "hj", "hoje", "amanha", "dps", "depois", "s", "sim", "n", "nao",
    "não", "mt", "muito", "mta", "muita", "mto", "muitos", "muitas", "eh", "é", "da", "das",
    "do", "dos", "aq", "aí", "beleza", "flw", "falou", "tipo", "msm", "mesmo",
    "mesma", "qm", "quem", "qlq", "qualquer", "aff", "pff", "pfv", "porfavor", "favor",
    "cmg", "comigo", "ctg", "contigo", "ngm", "ninguém", "qnd", "quando", "também",
    "bora", "partiu", "sei", "sabe", "sqn", "so", "só", "vamo", "vamos",
    "bjs", "beijos", "bj", "beijo", "bjo", "blza", "ow", "ou",
    "axei", "achei", "porque", "qdo", "nem", "po", "pô", "mano", "véi",
    "véio", "nn", "vdd", "vo", "ia", "mds", "vi", "vem", "qq", "ai", "cada", "ver", "onde",
    "amg", "tô", "bom", "dá", "ja", "fiz", "uns", "um", "lá", "ir", "vai", "ter", "fazer", "gente",
    "oq", "bem", "agr", "faço", "logo", "tao", "tão", "vejo", "nessa", "nesta", "indo", "tanto", "fica",
    "voce",
}

# Reload the scraped tweets from disk so this cell can run without re-scraping.
df = pd.read_csv('../data/tweets.csv')

def clean_tweet(text, stop_words=None):
    """Normalize a tweet for word-frequency analysis.

    The text is lower-cased, URLs (``http...`` / ``www....``) are removed,
    every character that is neither a word character nor whitespace is
    stripped, and stopwords are dropped.

    Args:
        text: Raw tweet text. Non-string values (e.g. the NaN that
            ``pd.read_csv`` yields for an empty cell, or ``None``) are
            treated as an empty tweet.
        stop_words: Optional collection of lower-case tokens to discard.
            Defaults to the module-level ``stopwords_pt`` combined with
            ``internet_stopwords``.

    Returns:
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    # Missing values arrive as float NaN / None and have no .lower().
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = stopwords_pt.union(internet_stopwords)

    text = text.lower()
    # URLs must go before punctuation stripping, otherwise "http://t.co/x"
    # would collapse into the pseudo-word "httptcox".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

# Store the cleaned version of every tweet next to the raw text.
df['Clean_Tweet'] = df['Tweet'].map(clean_tweet)

# The word cloud is built from one corpus string containing all cleaned tweets.
text = ' '.join(df['Clean_Tweet'].tolist())
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(text)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()