In [62]:
import pandas as pd
pd.set_option("display.max_columns",999)
import numpy as np
import json

# Extracting tweet information from the JSON Lines files

## Dataset preparation: run every cell below to rebuild the data set from scratch

In [63]:
# ## Data set preparation, run everything to get from scratch

def extractLoc(tweet,root=''):
    """Collect the location-related fields of a tweet into a flat dict.

    Parameters
    ----------
    tweet : dict
        A tweet object, or a partial/empty one (e.g. ``{}`` for a tweet
        without parent).
    root : str, optional
        Key prefix. When non-empty, ``root + '_'`` is prepended to every key.

    Returns
    -------
    dict
        Keys ``location``, ``time_zone``, ``lat``, ``lon``, ``country`` and
        ``place`` (each with the prefix applied). A value is ``None`` whenever
        the corresponding piece of data is missing from ``tweet``.
    """
    prefix = root + '_' if root != '' else ''  # append '_' only when a prefix was given
    tweetLoc = {}

    # User-declared location. The user entry may be absent or None.
    user = tweet.get('user') or {}
    if 'location' in user:
        tweetLoc[prefix+'location'] = user['location']
        # .get: a user dict without 'time_zone' must not raise KeyError
        tweetLoc[prefix+'time_zone'] = user.get('time_zone')
    else:
        tweetLoc[prefix+'location'] = None
        tweetLoc[prefix+'time_zone'] = None

    # Try to geolocate the tweet. The tweet's `coordinates` object is GeoJSON,
    # whose inner `coordinates` pair is ordered [longitude, latitude].
    point = (tweet.get('coordinates') or {}).get('coordinates')
    if point is not None and len(point) >= 2:
        tweetLoc[prefix+'lat'] = point[1]
        tweetLoc[prefix+'lon'] = point[0]
    else:
        tweetLoc[prefix+'lat'] = None
        tweetLoc[prefix+'lon'] = None

    # Place attached to the tweet (absent, None or partially filled).
    place = tweet.get('place') or {}
    tweetLoc[prefix+'country'] = place.get('country')
    tweetLoc[prefix+'place'] = place.get('full_name')
    return tweetLoc

def extractUser(tweet,root=''):
    """Pick a fixed set of author attributes out of a tweet.

    Parameters
    ----------
    tweet : dict
        Tweet object; its ``'user'`` entry (when present) is the source.
    root : str, optional
        Key prefix. When non-empty, ``root + '_'`` is prepended to every key.

    Returns
    -------
    dict
        One ``<prefix>user_<attr>`` entry per requested attribute; the value is
        ``None`` when the tweet has no ``'user'`` entry or the attribute is
        missing from it.
    """
    key_prefix = 'user_' if root == '' else root + '_user_'
    wanted = ('id', 'name', 'screen_name', 'followers_count', 'statuses_count', 'created_at')
    has_user = 'user' in tweet
    return {
        key_prefix + field: (tweet['user'][field] if has_user and field in tweet['user'] else None)
        for field in wanted
    }

def extractGeneralInfo(tweet,root=''):
    """Copy the basic top-level tweet attributes into a flat dict.

    Parameters
    ----------
    tweet : dict
        Tweet object (may be partial or empty).
    root : str, optional
        Key prefix. When non-empty, ``root + '_'`` is prepended to every key.

    Returns
    -------
    dict
        Entries for ``id``, ``retweet_count``, ``favorite_count``,
        ``full_text``, ``quote_count`` and ``created_at`` (prefixed); an
        attribute missing from ``tweet`` maps to ``None``.
    """
    key_prefix = '' if root == '' else root + '_'
    wanted = ('id', 'retweet_count', 'favorite_count', 'full_text', 'quote_count', 'created_at')
    return {
        key_prefix + field: (tweet[field] if field in tweet else None)
        for field in wanted
    }
        
# info from https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
def extractInfo(tweet):
    """Flatten one tweet object into a single-level dict.

    The result holds the tweet's own general, location and user fields, a
    ``tweet_type`` label, and the same three groups of fields for the tweet it
    refers to, under the ``parent_`` prefix.

    ``tweet_type`` is decided in this order:

    * ``'reply'``    - ``in_reply_to_status_id`` is set; only the parent tweet
      id and parent user id are known.
    * ``'quote'``    - a ``quoted_status`` object is embedded.
    * ``'retweet'``  - a ``retweeted_status`` object is embedded.
    * ``'original'`` - none of the above; every ``parent_*`` value is ``None``.

    NOTE(review): because of that order, a tweet carrying both
    ``quoted_status`` and ``retweeted_status`` is labelled ``'quote'`` -
    confirm this is the intended classification.

    Parameters
    ----------
    tweet : dict
        Parsed tweet object. It is not modified.

    Returns
    -------
    dict
        Flat mapping of column name to value.
    """
    tweetInfo={}

    # basic tweet info
    tweetInfo.update(extractGeneralInfo(tweet))
    # get tweet location
    tweetInfo.update(extractLoc(tweet))
    # This tweet user data
    tweetInfo.update(extractUser(tweet))

    # type of tweet and get parent tweet included
    # (.get(...) so that a missing key behaves like an explicit None)
    if tweet.get('in_reply_to_status_id') is not None: # reply
        tweetInfo['tweet_type']='reply'
        subtweet={'id': tweet['in_reply_to_status_id'],
                  'user': {'id': tweet.get('in_reply_to_user_id')}}
    elif tweet.get('quoted_status') is not None: # quote
        tweetInfo['tweet_type']='quote'
        # Shallow copy: overriding 'id' below must not alter the caller's tweet.
        subtweet = dict(tweet['quoted_status'])
        if 'quoted_status_id' in tweet:
            subtweet['id']=tweet['quoted_status_id']
    elif tweet.get('retweeted_status') is not None: # retweet
        tweetInfo['tweet_type']='retweet'
        subtweet = tweet['retweeted_status']
    else: # original
        tweetInfo['tweet_type']='original'
        subtweet={}

    # get subtweet data
    tweetInfo.update(extractGeneralInfo(subtweet,'parent'))
    # get subtweet user data
    tweetInfo.update(extractUser(subtweet,'parent'))
    # get subtweet location
    tweetInfo.update(extractLoc(subtweet,'parent'))

    return tweetInfo

In [42]:
# Parse the JSON Lines dump (one tweet object per line) into a dict keyed by
# tweet id, keeping only the flattened fields returned by extractInfo.
tweets={}
# `with` closes the file even if a line fails to parse; the encoding is given
# explicitly so the result does not depend on the platform default.
with open("../data/Aniol-Maria-cuentalo-search-20180427_20180513.jsonl", encoding="utf-8") as tweetfile:
    for idx,line in enumerate(tweetfile):
        if idx>100000000: # use for testing: lower this bound to load only a sample
            break
        if not line.strip(): # tolerate blank lines (e.g. a trailing newline)
            continue
        tweet=json.loads(line)
        tweets[tweet['id']]=extractInfo(tweet)


AttributeError: 'NoneType' object has no attribute 'keys'

In [8]:
# Debug cell: show the 'quoted' entry of `tweet` (presumably the last object
# parsed by the loading loop - verify before relying on it).
tweet['quoted']

{'contributors': None,
 'truncated': False,
 'is_quote_status': False,
 'in_reply_to_status_id': None,
 'id': 995091244043862017,
 'favorite_count': 0,
 'full_text': 'RT @brisavasquez: Es largo pero ojalá alguien quiera leerlo. Me canse de tenerle miedo y callarme. #Cuéntalo https://t.co/rMGzW8iN5M',
 'entities': {'symbols': [],
  'user_mentions': [{'id': 865118598,
    'indices': [3, 16],
    'id_str': '865118598',
    'screen_name': 'brisavasquez',
    'name': 'brisa'}],
  'hashtags': [{'indices': [99, 108], 'text': 'Cuéntalo'}],
  'urls': [],
  'media': [{'source_user_id': 865118598,
    'source_status_id_str': '993982598757146627',
    'expanded_url': 'https://twitter.com/brisavasquez/status/993982598757146627/photo/1',
    'display_url': 'pic.twitter.com/rMGzW8iN5M',
    'url': 'https://t.co/rMGzW8iN5M',
    'media_url_https': 'https://pbs.twimg.com/media/DctV5DpW0AAX7vK.jpg',
    'source_user_id_str': '865118598',
    'source_status_id': 993982598757146627,
    'id_str': '9939825

In [5]:
# `tweets` maps tweet id -> flat field dict, so the frame is built with one
# column per tweet and then flipped to get one row per tweet.
df = pd.DataFrame(tweets).T
len(df)

2111998

In [6]:
# Inspect the column dtypes before the explicit casts in the following cells.
df.dtypes

country                  object
created_at               object
favorite_count           object
full_text                object
id                       object
lat                      object
location                 object
lon                      object
parent_country           object
parent_created_at        object
parent_id                object
parent_lat               object
parent_location          object
parent_lon               object
parent_place             object
parent_text              object
parent_time_zone         object
parent_user_followers    object
parent_user_id           object
place                    object
quote_count              object
retweet_count            object
statuses_count           object
time_zone                object
tweet_type               object
user_followers_count     object
user_id                  object
user_screen_name         object
dtype: object

In [7]:
# Turn the two timestamp columns (the tweet's and its parent's) into datetimes.
df['created_at'] = pd.to_datetime(df['created_at'], infer_datetime_format=True)
df['parent_created_at'] = pd.to_datetime(df['parent_created_at'], infer_datetime_format=True)

In [8]:
# Integer count columns: a missing value becomes the sentinel -1 so the column
# can be stored as int32.
# The extractors in this notebook emit `user_statuses_count` and
# `parent_user_followers_count`; `statuses_count` and `parent_user_followers`
# are kept in the list so a frame built with the older column names still
# works. Only the names actually present in `df` are cast.
count_candidates = (
    'retweet_count', 'quote_count',
    'statuses_count', 'user_statuses_count',
    'user_followers_count',
    'parent_user_followers', 'parent_user_followers_count',
)
for count_col in [name for name in count_candidates if name in df.columns]:
    # Fill only this column; filling the whole frame first is wasted work.
    df[count_col] = df[count_col].fillna(-1).astype(np.int32)

# Coordinates: -999 marks "no coordinates available".
for coord_col in ('lat', 'lon'):
    df[coord_col] = df[coord_col].fillna(-999).astype(np.float32)

In [9]:
# Identifier columns: a missing id becomes the sentinel 0 so the column can be
# stored as int64 (the cell that writes the missing-tweet list skips id 0).
for id_col in ('id', 'parent_id', 'parent_user_id', 'user_id'):
    # Fill only this column; filling the whole frame first is wasted work.
    df[id_col] = df[id_col].fillna(0).astype(np.int64)


In [10]:
# Preview the first rows after the dtype conversions.
df.head()

Unnamed: 0,country,created_at,favorite_count,full_text,id,lat,location,lon,parent_country,parent_created_at,parent_id,parent_lat,parent_location,parent_lon,parent_place,parent_text,parent_time_zone,parent_user_followers,parent_user_id,place,quote_count,retweet_count,statuses_count,time_zone,tweet_type,user_followers_count,user_id,user_screen_name
990380137937035265,,2018-04-28 23:59:59,0,RT @Odi_sea: Con 17 años desde una cena de gra...,990380137937035265,-999.0,Venezuela,-999.0,,2018-04-28 00:41:56,990028305704333312,,,,,Con 17 años desde una cena de graduación un ho...,Amsterdam,301,112541236,,-1,3176,48132,Caracas,retweet,1839,636716479,OrianaDiAmeliee
990380135638593538,,2018-04-28 23:59:59,0,RT @__Kylene: Demos voz y apoyo a esta compañe...,990380135638593538,-999.0,"Baker Street, London.",-999.0,,2018-04-28 14:57:54,990243714584010752,,Lleida,,,Demos voz y apoyo a esta compañera que me ha p...,Athens,2483,814900356,,-1,1208,9376,,retweet,249,546250792,alannih
990380135500181505,,2018-04-28 23:59:59,0,RT @AlejandraTuk: Mi marido me maltrata.\nPido...,990380135500181505,-999.0,,-999.0,,2018-04-28 20:21:05,990325049893801984,,Latinoamérica Libre ☭,,,Mi marido me maltrata.\nPido el divorcio.\nEl ...,Eastern Time (US & Canada),5824,529588816,,-1,17406,10839,,retweet,132,796195177400532993,DiKa2Pa21
990380134715871233,,2018-04-28 23:59:59,0,RT @Antiintermedio: ¿Sabéis dónde estaban hace...,990380134715871233,-999.0,"Cáceres, España",-999.0,,2018-04-27 21:50:49,989985242713919488,,En el combate metapolítico,,,¿Sabéis dónde estaban hace 4 años y medio los ...,,18184,2345903802,,-1,1322,41093,,retweet,213,936556720670863360,FranciscoJMArau
990380130362187776,,2018-04-28 23:59:58,0,RT @Soy_Arbitra: Estar en un campo de fútbol a...,990380130362187776,-999.0,"Vallecas, Madrid",-999.0,,2018-04-28 16:07:55,990261338810257408,,España,,,"Estar en un campo de fútbol arbitrando, escuch...",Athens,1676,1538698825,,-1,573,209,,retweet,41,2734383970,geeeelaaaaa


In [11]:
# Write the prepared frame to disk as a pickle file.
df.to_pickle("../pickles/cuentalo_json_to.pkl")

In [12]:
# List the column names of the saved frame.
df.columns

Index(['country', 'created_at', 'favorite_count', 'full_text', 'id', 'lat',
       'location', 'lon', 'parent_country', 'parent_created_at', 'parent_id',
       'parent_lat', 'parent_location', 'parent_lon', 'parent_place',
       'parent_text', 'parent_time_zone', 'parent_user_followers',
       'parent_user_id', 'place', 'quote_count', 'retweet_count',
       'statuses_count', 'time_zone', 'tweet_type', 'user_followers_count',
       'user_id', 'user_screen_name'],
      dtype='object')

In [13]:
# Number of rows (tweets) in the frame.
len(df)

2111998

# Tweets originales faltantes (missing original tweets)

In [44]:
# Reload `df` from a pickle. Note this is NOT the file written by the
# to_pickle cell above ("cuentalo_json_to.pkl"); presumably it is produced by
# a separate cleaning step - TODO confirm where "cuentalo_clean_1.pkl" comes from.
df=pd.read_pickle("../pickles/cuentalo_clean_1.pkl")

In [45]:
# Distinct parent tweet ids referenced by any row of the frame.
parents = set(df['parent_id'].values)
len(parents)

81998

In [46]:
# Ids of every tweet in the frame that is not a retweet.
is_not_retweet = df['tweet_type'] != 'retweet'
originals = set(df.loc[is_not_retweet, 'id'].unique())
len(originals)

151245

In [47]:
# Ids of the rows labelled as retweets.
is_retweet = df['tweet_type'] == 'retweet'
retweets = df.loc[is_retweet, 'id'].unique()
len(retweets)

1973203

In [48]:
# Parent tweets that are already present in the frame (the ones I have).
len(parents & originals)

64463

In [49]:
# Parent tweets that are referenced but not present in the frame.
len(parents - originals)

17535

In [50]:
# Write the ids of the referenced-but-missing parent tweets, one per line and
# with no header row. Id 0 is skipped (it is the fill value used for rows
# without a parent id).
# `with` guarantees the file is flushed and closed even if a write fails.
with open('../intermediate_data/tweets_faltantes_2.csv','w') as faltantes:
    for tid in parents.difference(originals):
        if tid!=0:
            faltantes.write(str(tid)+'\n')

In [52]:
# Load the two lists of missing tweet ids for comparison.
# NOTE(review): tweets_faltantes_2.csv is written without a header row, so
# read_csv takes its first id as the column name (the next cell indexes by
# that id); the first file appears to behave the same way - confirm.
t1=pd.read_csv('../data/tweets_faltantes.csv')
t2=pd.read_csv('../intermediate_data/tweets_faltantes_2.csv')

In [58]:
def _all_ids(frame):
    """Return every tweet id held in a one-column frame read without a header row.

    The id files carry no header line, so read_csv consumed their first id as
    the column name. Selecting the column by position (instead of hard-coding
    that id) keeps the cell working when the files are regenerated, and the
    numeric header is added back so that id is not lost.
    """
    ids = set(frame.iloc[:, 0].values)
    header = str(frame.columns[0])
    if header.isdigit():
        ids.add(int(header))
    return ids

s1=_all_ids(t1)
s2=_all_ids(t2)

In [61]:
# Is every id of the first list also contained in the second one?
s1 <= s2

False

In [69]:

# Add the separately downloaded missing tweets to the existing `tweets` dict
# (it must already be defined by the first loading cell) and print the parent
# id of every retweet found.
# NOTE(review): the ids printed by the last recorded run all end in "00",
# which looks like precision lost to a float conversion upstream - confirm,
# and consider reading `id_str` if the file provides it.
# `with` closes the file even if a line fails to parse; the encoding is given
# explicitly so the result does not depend on the platform default.
with open("../data/cuentalo_faltantes_2.jsonl", encoding="utf-8") as tweetfile:
    for idx,line in enumerate(tweetfile):
        if idx>100000000: # use for testing: lower this bound to load only a sample
            break
        if not line.strip(): # tolerate blank lines (e.g. a trailing newline)
            continue
        tweet=json.loads(line)
        tweets[tweet['id']]=extractInfo(tweet)
        if tweets[tweet['id']]['tweet_type']=='retweet':
            print (tweets[tweet['id']]['parent_id'])

990730270876799000
990652353949978600
990724574722646000
990015958168326100
990698257016918000
990810943658410000
990754141021487100
989613217390325800
