# Pacotes necessários:

In [1]:
import glob
import gzip
import bz2
import collections
from bson import json_util
from datetime import datetime, timedelta
import json
from geopy.geocoders import Nominatim

# Pasta com os arquivos: /tweets/*bz2

In [2]:
# Os arquivos devem estar compactados no formato bzip2 (.bz2) ou gzip (.gz)

In [5]:
# folder = '/tweets/*bz2'
# Glob pattern that selects the compressed tweet archives to process.
folder = '/scratch/pedrobrum/tweets/dados/tweets/*bz2'
# Matching paths, in lexicographic order so the processing order is stable.
files = sorted(glob.glob(folder))

In [6]:
# Display the sorted list of archive paths matched by the glob pattern.
files

['/scratch/pedrobrum/tweets/dados/tweets/tweets_1.bz2',
 '/scratch/pedrobrum/tweets/dados/tweets/tweets_2.bz2',
 '/scratch/pedrobrum/tweets/dados/tweets/tweets_3.bz2']

In [7]:
def open_zipfile(file):
    """Open a compressed file (gzip or bzip2) for binary reading.

    The format is chosen from the file name: a name ending in ``gz`` is
    opened with ``gzip.GzipFile``; anything else is opened with
    ``bz2.BZ2File``.

    Args:
        file: Path of the compressed file (``str``, ``bytes`` or any
            ``os.PathLike`` object such as ``pathlib.Path``).

    Returns:
        A ``gzip.GzipFile`` or ``bz2.BZ2File`` opened in read mode. The
        caller is responsible for closing it (both support ``with``).
    """
    import os  # local import: only needed to normalise the path argument

    # Normalise to ``str`` so the suffix check also works for Path/bytes.
    path = os.fsdecode(file)

    if path.endswith('gz'):
        return gzip.GzipFile(path, 'r')

    # Every other name is treated as bzip2.
    return bz2.BZ2File(path, 'r')

In [8]:
# Display the first archive path; it is the one read by the processing cell below.
files[0]

'/scratch/pedrobrum/tweets/dados/tweets/tweets_1.bz2'

In [9]:
def week_difference(d1, d2):
    """Return how many calendar weeks separate ``d1`` from ``d2``.

    Weeks start on Monday. Both values are first snapped to the Monday of
    their own week, so two dates in the same Monday-Sunday span give 0, and
    a Sunday followed by the next Monday gives 1.

    The time of day is ignored: only the calendar date of each argument is
    used (for timezone-aware datetimes, the date in the value's own
    timezone). Without this, Sunday 23:59 to Monday 00:00 spans "6 days and
    one minute" between the two Mondays and would yield 6/7 instead of 1.

    Args:
        d1: Start ``datetime`` (or ``date``).
        d2: End ``datetime`` (or ``date``).

    Returns:
        int: Whole number of weeks from ``d1`` to ``d2``; negative when
        ``d2`` falls in an earlier week than ``d1``.
    """
    def _monday(value):
        # Strip the time component so differences are whole days.
        day = value.date() if isinstance(value, datetime) else value
        return day - timedelta(days=day.weekday())

    # Two Mondays are always a multiple of 7 days apart, so // is exact.
    return (_monday(d2) - _monday(d1)).days // 7

In [11]:
i = 0           # number of tweets processed so far
size = 1000     # maximum number of tweets to read

file = files[0]                # archive to read

week = 0                       # index of the current week (0 = week of the first tweet)
last_date = None               # date of the first tweet seen in the current week

tweets_week = collections.defaultdict(int) # week index -> number of tweets
users = set()                              # distinct user ids
texts = set()                              # distinct tweet texts
locations = collections.Counter()          # location -> number of tweets
geolocator = Nominatim(user_agent="covid-tweets")

tweets = []    # list of tweets. Note: do not try to keep every tweet in a list; the volume of tweets is very large.


# Defined once, outside the loop, instead of being re-created for every tweet.
def city_state_country(coord):
    """Return the location of a tweet from its geographic coordinates.

    Args:
        coord (tuple): geographic coordinates as (latitude, longitude).

    Returns:
        tuple: ``(city, state, country)``. Each element is ``''`` when the
        address has no such field or when the reverse-geocoding lookup
        fails for any reason (best effort: errors are not propagated).
    """
    try:
        location = geolocator.reverse(coord, exactly_one=True, timeout=36000000)
        address = location.raw['address']
        city = address.get('city', '')
        state = address.get('state', '')
        country = address.get('country', '')
    except Exception:
        # Deliberate best effort (lookup error, or no result so `location`
        # is None). Unlike a bare `except:`, Ctrl-C still interrupts.
        city = ''
        state = ''
        country = ''

    return city, state, country


# The context manager guarantees the archive is closed, even on early exit.
with open_zipfile(file) as zip_file:
    for line in zip_file:
        try:
            tweet = json_util.loads(line)
        except Exception:
            # Show the malformed line and skip it; it is not counted in `i`.
            print(line)
            continue

        # NOTE(review): assumes `created_at` is decoded as a datetime by
        # json_util (the printed values look like datetimes) - confirm.
        date = tweet['created_at']
        print(date)

        user = tweet['user']['id']
        name = tweet['user']['name']
        user_screenname = tweet['user']['screen_name']

        # use the extended version of the tweet text when it is available
        if "extended_tweet" in tweet:
            text = tweet['extended_tweet']['full_text']
        else:
            text = tweet['text']

        tweets.append(tweet)
        users.add(user)
        texts.add(text)

        # pick the tweet location. Note: this part takes longer to run, which is why it is commented out.
#         if tweet['coordinates'] != None:
#             coord = tweet['coordinates']['coordinates']
#             city, state, country = city_state_country((coord[1], coord[0]))
#             if city == '':
#                 locations.update([tweet['user']['location']])
#             elif city != None:
#                 locations.update([city])
#         else:
#             locations.update([tweet['user']['location']])

        # the first tweet defines the reference date of week 0
        if last_date is None:
            last_date = date

        # weeks elapsed between the current week's reference date and this tweet
        week_dif = week_difference(last_date, date)
        if week_dif >= 1:
            # Advance by the whole gap. Testing `== 1` would get stuck for
            # good after any gap of two or more weeks, because the reference
            # date would never move forward again.
            week += int(week_dif)
            last_date = date

        tweets_week[week] += 1

        i += 1
        if i == size:
            break
    

2020-04-23 17:49:58+00:00
2020-04-23 17:49:58+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:01+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:02+00:00
2020-04-23 17:50:03+00:00
2020-04-23 17:50:03+00:00
2020-04-23 17:50:03+00:00
2020-04-23 1

2020-04-23 17:50:40+00:00
2020-04-23 17:50:40+00:00
2020-04-23 17:50:40+00:00
2020-04-23 17:50:40+00:00
2020-04-23 17:50:40+00:00
2020-04-23 17:50:41+00:00
2020-04-23 17:50:41+00:00
2020-04-23 17:50:41+00:00
2020-04-23 17:50:41+00:00
2020-04-23 17:50:41+00:00
2020-04-23 17:50:41+00:00
2020-04-23 17:50:41+00:00
2020-04-23 17:50:41+00:00
2020-04-23 17:50:41+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:42+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 17:50:43+00:00
2020-04-23 1

In [12]:
# Display the set of distinct user ids collected by the processing cell.
users

{4418531,
 12807472,
 16355572,
 16462733,
 17163446,
 19235020,
 20510997,
 20781852,
 21615585,
 23051540,
 26983989,
 27040606,
 27109780,
 29956706,
 30921093,
 33867711,
 34005209,
 34594381,
 35508360,
 36393506,
 36631550,
 39581496,
 40296833,
 40362157,
 41158501,
 41843485,
 44258331,
 45381039,
 46683729,
 48089541,
 49112650,
 49149706,
 50446640,
 50641597,
 51412407,
 51761051,
 52898004,
 53394005,
 53395145,
 54158987,
 54378588,
 54588631,
 55608096,
 55623525,
 55706512,
 56294171,
 56582821,
 56708166,
 57061027,
 57068165,
 57308537,
 58479076,
 59167440,
 59673737,
 59762228,
 59773459,
 59814067,
 60109679,
 60236753,
 61519771,
 61918996,
 62161246,
 62361580,
 62660383,
 62867361,
 63726187,
 65232188,
 66694940,
 67591877,
 67699487,
 67778641,
 68247637,
 68511763,
 69063653,
 69120516,
 69708117,
 71737592,
 71899047,
 72125325,
 74258993,
 76025963,
 76112306,
 77474316,
 77919944,
 78093615,
 80124392,
 80452931,
 82349516,
 82676647,
 84686232,
 86059602,


In [13]:
# Display the set of distinct tweet texts collected by the processing cell.
texts

{'"Acabou a quarentena, cidadãos podem circular livremente nas ruas"',
 '"Dia 1.0000 de quarentena e eu não aguento mais meu pai me usando no tik tok dele" @phclaro  #tiktokbr #quarenteners https://t.co/KwFMa6ingr',
 '"EaD" e "Educação emergencial com tecnologias digitais em tempos de quarentena" são a mesma coisa???',
 '"Tudo que você precisa é de um pouco de espaço e um pouco de tempo - alguma solidão autoimposta e confinamento temporário"\n\nPode não ser o caso pois estou só agora, não por querer, mas entendendo a importância de estar.\n\nOutra fala aí é: pratique a procrastinação produtiva.\n\nVamo https://t.co/qCGvHdrgzj',
 '#Coronavirus\nConvido (e duvido) o Sr. Governador Dória a andar pela avenida Paulista sem escolta militar para ver como ele é querido e amado pelo povo paulistano.',
 '#DORIALULDRÃO casaram! Felicidades!!!',
 '#LulaNaCadeia  ,,. #MaiaTraidorNacional  .., Fora !!!!!!!!!!!!!!!!!!!!     ..,',
 '#LulaNaCadeia corrupção é genocídio parcelado mata à  longo prazo',
 

In [14]:
# Number of tweets kept in memory (the processing cell stops after `size` tweets).
len(tweets)

1000

In [18]:
# Display the first stored tweet as a raw Python dict.
tweets[0]

{'_id': 1253380606924374017,
 'favorited': False,
 'quote_count': 0,
 'contributors': None,
 'truncated': False,
 'text': 'RT @BolsonaroSP: Boa notícia no combate à pandemia: a partir de hoje, foi permitido o uso de cloroquina em pacientes com sintomas leves de…',
 'is_quote_status': False,
 'in_reply_to_status_id': None,
 'reply_count': 0,
 'user': {'follow_request_sent': None,
  'profile_use_background_image': True,
  'id': 55623525,
  'verified': False,
  'translator_type': 'none',
  'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1185725305291509760/8aTNc1B-_normal.jpg',
  'profile_sidebar_fill_color': '95E8EC',
  'is_translator': False,
  'geo_enabled': True,
  'profile_text_color': '3C3940',
  'followers_count': 843,
  'protected': False,
  'location': None,
  'default_profile_image': False,
  'id_str': '55623525',
  'utc_offset': None,
  'statuses_count': 26135,
  'description': 'Busque sempre a vdd, pois ela te alimentará e te fortalecerá. Não tenha medo de mud

In [24]:
# Pretty-print the first tweet as JSON: keys sorted, 4-space indent, non-ASCII
# characters kept as-is (ensure_ascii=False). `default=str` converts values
# that json cannot serialise natively to their string form.
print(json.dumps(tweets[0], sort_keys=True, indent=4, default=str, ensure_ascii=False))

{
    "_id": 1253380606924374017,
    "contributors": null,
    "control": {
        "coleta": [
            439
        ]
    },
    "coordinates": null,
    "created_at": "2020-04-23 17:49:58+00:00",
    "entities": {
        "hashtags": [],
        "symbols": [],
        "urls": [],
        "user_mentions": [
            {
                "id": 74756085,
                "id_str": "74756085",
                "indices": [
                    3,
                    15
                ],
                "name": "Eduardo Bolsonaro🇧🇷",
                "screen_name": "BolsonaroSP"
            }
        ]
    },
    "favorite_count": 0,
    "favorited": false,
    "filter_level": "low",
    "geo": null,
    "id": 1253380606924374017,
    "id_str": "1253380606924374017",
    "in_reply_to_screen_name": null,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "is_quote_status": false,
    "lang": "