# Import libraries

In [1]:
import os
import sys
import tweepy
import requests
from dotenv import load_dotenv
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

sys.path.append('../../')
import config

load_dotenv(os.path.join(config.PROJ_PATH, '.env')) 

True

# Set up Twitter API access

In [2]:
def set_twitter_access():
  """Create a tweepy API client from credentials stored in the environment.

  The variables API_KEY, API_KEY_SECRET, ACCESS_TOKEN and ACCESS_TOKEN_SECRET
  are read, in that order, from ``os.environ``; a missing variable raises
  ``KeyError``.

  Returns:
    The ``tweepy.API`` object built on a ``tweepy.OAuth1UserHandler``.
  """
  # Order matters: it mirrors the positional parameters of OAuth1UserHandler
  # (consumer key, consumer secret, access token, access token secret).
  credential_names = (
    "API_KEY",
    "API_KEY_SECRET",
    "ACCESS_TOKEN",
    "ACCESS_TOKEN_SECRET",
  )
  credentials = [os.environ[name] for name in credential_names]

  return tweepy.API(tweepy.OAuth1UserHandler(*credentials))

# Module-level API client; the search cells further down read this global.
api = set_twitter_access()


# Define helper functions to save tweet texts and images

In [3]:
def save_tweet_text(tweet:object, text_type='') -> None:
    """Write a tweet's text to a UTF-8 ``.txt`` file in the raw-texts folder.

    The file is created inside ``config.DATA_PATH_RAW_TEXTS`` and named
    ``<created_at>_<id>.txt``, with the timestamp formatted as
    ``%Y-%m-%d %H-%M-%S``.

    Args:
        tweet: Object exposing ``created_at``, ``id`` and ``full_text``
            (plus ``retweeted_status.full_text`` when ``text_type`` is
            ``'retweet'``).
        text_type: ``'retweet'`` saves ``tweet.retweeted_status.full_text``;
            any other value saves ``tweet.full_text``.

    Returns:
        None. When the requested text attribute is missing, a message is
        printed and no file is created.
    """
    # Resolve the text BEFORE opening the file: opening in 'w' mode first
    # would leave an empty file (or truncate an existing one) for every tweet
    # whose text cannot be read.
    try:
        if text_type == 'retweet':
            text = tweet.retweeted_status.full_text
        else:
            text = tweet.full_text
    except AttributeError:
        print('Unable to read text from tweet {}'.format(tweet.id))
        print("=====")
        return None

    tweet_timestamp = str(tweet.created_at.strftime("%Y-%m-%d %H-%M-%S"))
    tweet_id = str(tweet.id)
    txt_file_name = os.path.join(config.DATA_PATH_RAW_TEXTS, '{}_{}.txt'.format(tweet_timestamp, tweet_id))

    with open(txt_file_name, 'w', encoding="utf-8") as txt_file:
        txt_file.write(text)

    return None

def DownloadFile(url:str, path_to_save:str, timeout:float=30) -> None:
    """Download ``url`` with an HTTP GET and store the body at ``path_to_save``.

    Args:
        url: Address of the resource to fetch.
        path_to_save: Destination file path; opened in binary write mode.
        timeout: Seconds forwarded to ``requests.get`` as ``timeout`` so a
            stalled server cannot block the notebook forever.

    Returns:
        None. The file is written only when the response status is 200;
        for any other status a message is printed and nothing is saved.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection failures, timeouts, ...).
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        # Report the skipped download instead of failing silently.
        print('Unable to download {} (HTTP status {})'.format(url, response.status_code))
        print("=====")
        return None

    with open(path_to_save, 'wb') as f:
        f.write(response.content)
    return None


def save_tweet_image(tweet:object) -> None:
    """Download every photo attached to a tweet into the raw-images folder.

    Each photo is saved in ``config.DATA_PATH_RAW_IMAGES`` as
    ``<created_at>_<tweet id>_<media id>.png`` (the ``.png`` suffix is applied
    regardless of the source image format) using ``DownloadFile``.

    Args:
        tweet: Object exposing ``created_at``, ``id`` and an ``entities``
            dict; an ``extended_entities`` dict is used when present.

    Returns:
        None. If the media information cannot be read (``AttributeError``),
        a message is printed and nothing is downloaded.
    """
    tweet_timestamp = str(tweet.created_at.strftime("%Y-%m-%d %H-%M-%S"))
    tweet_id = str(tweet.id)

    try:
        # `entities["media"]` only carries the first attached photo; when the
        # tweet has `extended_entities`, that dict lists all of them.
        media_source = getattr(tweet, "extended_entities", None) or tweet.entities

        for media in media_source.get("media", []):
            # Skip non-photo media (videos, animated GIFs, ...).
            if media.get("type", None) != "photo":
                continue
            tweet_media_id = str(media['id'])
            filename = os.path.join(config.DATA_PATH_RAW_IMAGES, '{}_{}_{}.png'.format(tweet_timestamp, tweet_id, tweet_media_id))
            DownloadFile(media["media_url"], filename)
    except AttributeError:
        print('Unable to read medias from tweet {}'.format(tweet_id))
        print("=====")

    return None

# Pull tweets

In [4]:
# Portuguese phrases and words kept as candidate search terms
# (presumably "vaw" stands for violence against women -- confirm).
# The accented characters were stored mis-encoded (UTF-8 bytes decoded with
# the wrong codec); they are restored here so the terms can match real text.
phrases_related_to_vaw = [
    'que ficar em casa',
    'mulher obediente',
    'é divorciada',
    'Ninguém vai acreditar em você',
    'Não presta nem pra cozinha',
    'Mal sabe lavar uma roupa direito',
    'burra',
    'interesseira',
]

In [5]:
def _quote_if_phrase(keyword:str) -> str:
    """Wrap a plain multi-word keyword in double quotes for exact-phrase search."""
    words = keyword.split()
    # Only touch plain phrases: anything that already carries quotes,
    # operators or punctuation is passed through exactly as given.
    is_plain_phrase = (
        len(words) > 1
        and 'OR' not in words
        and all(word.isalnum() for word in words)
    )
    return '"{}"'.format(keyword) if is_plain_phrase else keyword


def _build_search_query(to_extract, place:str='') -> str:
    """Assemble the search query string from keywords and an optional place."""
    query = ""
    if place:
        # NOTE(review): kept exactly as before; confirm the search endpoint
        # treats an explicit "AND" as an operator rather than a search term.
        query += f'place:{place} AND '

    if isinstance(to_extract, (list, tuple)):
        # Without quotes, "a b OR c d" makes OR bind only to `b` and `c`,
        # so multi-word phrases have to be quoted before joining.
        keywords_query = ' OR '.join(_quote_if_phrase(keyword) for keyword in to_extract)
        query += f'({keywords_query})'
    else:
        query += to_extract

    return query


def extract_tweets(to_extract, place:str='', count:int=3, pages:int=3):
    """Search tweets in Portuguese and save the text and photos of each hit.

    Args:
        to_extract: A list (or tuple) of keywords, OR-ed together, or a
            ready-made query string that is used as-is.
        place: Optional value for the ``place:`` operator, prepended to the
            query.
        count: Forwarded to the search call as ``count`` (default 3, the
            previously hard-coded value).
        pages: Number of result pages requested from the cursor (default 3,
            the previously hard-coded value).

    Returns:
        None. The query and each tweet's ``full_text`` are printed; every
        tweet is passed to ``save_tweet_text`` and ``save_tweet_image``.
        For tweets with coordinates, ``coordinates``, ``geo`` and
        ``contributors`` are printed as well.

    Relies on the module-level ``api`` client.
    """
    query = _build_search_query(to_extract, place)
    print(query)

    cursor = tweepy.Cursor(api.search_tweets,
                           query,
                           tweet_mode='extended',
                           lang='pt',
                           count=count)
    tweets_pages = list(cursor.pages(pages))

    # Read tweets
    for page in tweets_pages:
        for tweet in page:
            # NOTE(review): retweets show up truncated (text ending in an
            # ellipsis) in the printed output; consider passing
            # text_type='retweet' for tweets that have `retweeted_status`.
            save_tweet_text(tweet)
            print(tweet.full_text)

            save_tweet_image(tweet)

            if tweet.coordinates is not None:
                # Geotagged tweet: show its location details. Processing
                # continues with the remaining tweets of the page (a stray
                # `break` here used to drop them).
                print(tweet.coordinates)
                print(tweet.geo)
                print(tweet.contributors)

# Trial run: search for tweets containing either of two single-word keywords.
extract_tweets(['Interesseira', 'vaca'])

(Interesseira OR vaca)
@acervoglobo Sabe que que √© engra√ßado? √â que a Nicole bahls colocou o nome da vaca dela de Camila Queiroz, pq ela gosta muito da Camila
RT @luscas: passando mal com os atores da globo recriando o meme da nicole bahls batizando a vaca de camila queiroz 

https://t.co/ktP4ZFMH‚Ä¶
@celiamatta T√° cert√≠ssima!
Eu acho que tem que doar ao m√°ximo... 
Mas vc t√° com cara que vai mandar merreca pra ele. M√£o de vaca!
@bsjeweIed @ferrwzz @kurozakiiiii apertei sem querer vaca
@TheAndyMelo nhaaa, se mudar de ideia apare√ßa 
tamo com saudade, vaca
n√£o vou dar amei na sua foto mais n√£o sua puta intergal√°tica mini pizza interesseira cachorra puta cachorra
@SamuelDias762 tu para, so pega sanguessuga interesseira kkkkkkkkkk
Cansada de quem acha que td mundo tem que comer que nem uma vaca, caso contr√°rio t√° passando fome
ah vai toma no cu vai sua vagabunda vadia fdp desgra√ßada ot√°ria sem amor invejosa puta vaca fudida rid√≠cula sua opini√£o n√£o me importa n√£o


In [6]:
# Search by location: request 3 pages (count=3 per call) of tweets matching
# any of three `place:` filters (two by name, one by place id).
tweets_pages = []
for status in tweepy.Cursor(api.search_tweets,
                            'place:"new york city" OR place:seattle OR place:fd70c22040963ac7', 
                            tweet_mode='extended',
                            count=3).pages(3):
    tweets_pages.append(status)

# Print the text of every tweet found.
# NOTE: after these loops finish, `tweet` stays bound to the last tweet
# printed; the inspection cells that follow read that leftover variable.
for page in tweets_pages:
    for tweet in page:

        print(tweet.full_text)

@59Corvette1 @MakeTexasBlue22 Trump kid did some collaboration w Russian spy to get dirt on Clinton‚Ä¶
@MagNorris @Duderichy Watch the bear you might change your mind üòÇ
Stacking clips all weekend ‚ú®üõπüíÉüèª https://t.co/52IgUjOLkc
Pine Creek locked in a tight one with Ranch at the break. https://t.co/wgTr0fdvzU
The most painful thing that ever happened to me was watching Lost for the 5th time, but the first time as an adult, and realizing it‚Äôs not nearly as good as I thought it was. I‚Äôll be in my room
@its_m_reilly I wasn‚Äôt sure if it was Murphy or you. I hope that he‚Äôs much better!!
Summer Boulder https://t.co/yNH0GV0rzk


In [7]:
# Useful tweet attributes: author(id, name, screen_name, location), user(id, name, screen_name, location),
#                          geo, place, coordinates
# List the attribute names available on a tweet, on its user and on its place.
# `tweet` is not defined in this cell: it is whatever the most recently
# executed loop over tweets left bound, and `tweet.place` must not be None.
print(vars(tweet).keys())
print(vars(tweet.user).keys())
print(vars(tweet.place).keys())

dict_keys(['_api', '_json', 'created_at', 'id', 'id_str', 'full_text', 'truncated', 'display_text_range', 'entities', 'extended_entities', 'metadata', 'source', 'source_url', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'author', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'possibly_sensitive', 'lang'])
dict_keys(['_api', '_json', 'id', 'id_str', 'name', 'screen_name', 'location', 'description', 'url', 'entities', 'protected', 'followers_count', 'friends_count', 'listed_count', 'created_at', 'favourites_count', 'utc_offset', 'time_zone', 'geo_enabled', 'verified', 'statuses_count', 'lang', 'contributors_enabled', 'is_translator', 'is_translation_enabled', 'profile_background_color', 'profile_background_image_url', 'profile_background_image_url_https', 'profile_background_tile', 'profile_image_url', 'profile_imag

In [8]:
# Print the individual fields of interest of the leftover `tweet` variable
# (tweet, user and place details).
print(tweet.id) # Tweet id (original note said string; `id_str` also exists -- TODO confirm type)
print(tweet.full_text) # Text (string)
['Interesseira', 'vaca'] # Keywords used for the extraction (list); bare expression, not printed
print(tweet.created_at) # Tweet date (timestamp)
print(tweet.user.id_str) # User id (string)
print(tweet.user.name) # Full name (string)
print(tweet.user.screen_name) # Screen name
print(tweet.user.profile_image_url) # Profile image URL
print(tweet.user.created_at)# Account creation date (timestamp)
print(tweet.place.id) # Place id (string)
print(tweet.place.place_type) # Place type (string)
print(tweet.place.name) # Name (string)
print(tweet.place.country) # Country (string)
print(tweet.place.country_code) # Country code (string)
print(tweet.place.bounding_box.coordinates) # Coordinates (list)


1672707501186510848
Summer Boulder https://t.co/yNH0GV0rzk
2023-06-24 20:45:20+00:00
326843207
Yuta Notsu
astronomy_stars
http://pbs.twimg.com/profile_images/2240297038/274766_100003558210016_784855573_n_normal.jpg
2011-06-30 15:44:26+00:00
fd70c22040963ac7
city
Boulder
United States
US
[[[-105.3017759, 39.953552], [-105.183597, 39.953552], [-105.183597, 40.094411], [-105.3017759, 40.094411]]]


In [9]:
# NOTE: this import rebinds `extract_tweets` to the implementation in the
# project's `data_scrap` module, replacing the function of the same name
# defined earlier in this notebook. It is intentionally kept in this cell:
# moving it to the top would let the notebook's own definition override it.
from data_scrap import extract_tweets

keywords_to_extract = ['Vagabunda']
extract_tweets(keywords_to_extract)

# Load the extracted-tweets parquet file (presumably written by the call
# above -- confirm in data_scrap) and show it as the cell output.
extracted_tweets_path = os.path.join(config.DATA_PATH_WRANGLE_TWEETS, 'extracted_tweets.parquet')
pd.read_parquet(extracted_tweets_path)

Object successfully saved to "C:\Users\DCandelero\Documents\MBA - USP(Data Analytics)\MBA_TCC\data\raw\tweets\1672728421217058816.pkl"


Unnamed: 0,id,text,lang,keywords_extraction,created_at,user_id,user_name,user_screen_name,user_profile_image_url,user_created_at,place_id,place_type,place_name,country,country_code,coordinates
0,1672728421217058816,RT @jjwanstar: Foi essa vagabunda aqui que com...,pt,"b'[""Vagabunda""]'",2023-06-24 22:08:28+00:00,1246271282141896704,lu‚Å∑ üíú Take Two üé∂ Haegeum ü•¢D-DAY üíô,lu_winterflower,http://pbs.twimg.com/profile_images/1670982460...,2020-04-04 03:00:51+00:00,,,,,,
