In [7]:
import re
import os
import json
import time
import string
import pandas as pd
import numpy as np
import tweepy
import spacy
import es_core_news_md
import es_core_news_sm
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from dotenv import load_dotenv

load_dotenv()


#Environment Variables
consumer_key = os.environ.get('CONSUMER_KEY')
consumer_secret = os.environ.get('CONSUMER_SECRET_KEY')
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [2]:
def limit_handled(cursor):
    while True:
        try:
            yield cursor.next()
        except tweepy.RateLimitError as e:
            print(e)
            print('Waiting for 15 minutes to continue making requests')
            time.sleep(15 * 60)
            

In [3]:
#User Timeline to a DataFrame
tweets = tweepy.Cursor(api.user_timeline, id="helsinkiespana", tweet_mode='extended', include_rts=False, exclude_replies=True).items()
tweets_lst = [tweet._json for tweet in tweets]
df_tweets = pd.DataFrame(tweets_lst)

df_tweets.to_csv(f'./output/timeline_helsinkiespana.csv', index=False)
df_tweets.head()

NameError: name 'api' is not defined

In [60]:
#User Hashtags DataFrame
hashtags = [[hashtag['text'] 
                    for hashtag in hashtags['hashtags']] 
                    for hashtags in df['entities']]

df_hashtags = pd.DataFrame({
    'id':np.repeat(df['id'].values, df['hashtags'].str.len()),
    'hashtags':np.concatenate(df['hashtags'].values)
})
hashtags_count = df_hashtags['hashtags'].value_counts().to_dict()
df_hashtags['n_hashtag'] = df_hashtags['hashtags'].map(hashtags_count)

#Adding Lemmatisation
lemm = spacy.blank('es')
df_hashtags['lemmas'] = [lemm(lemma).text for lemma in df_hashtags['hashtags']]
lemmas_count = df_hashtags['lemmas'].value_counts().to_dict()
df_hashtags['lemmas_count'] = df_hashtags['lemmas'].map(lemmas_count)

df_hashtags.to_csv('./output/hashtags_helsinkiespana.csv', index=False)
df_hashtags.head()

Unnamed: 0,id,hashtags,n_hashtag,lemmas,lemmas_count
0,1243156489956732930,HelsinkiEspaña,7,HelsinkiEspaña,7
1,1243156489956732930,ddhh,2,ddhh,2
2,1243156489956732930,JovenesparaJovenes,2,JovenesparaJovenes,2
3,1243156489956732930,malaga,3,malaga,3
4,1243156489956732930,oviedo,2,oviedo,2


In [12]:
#User Followers DataFrame
def requestFollowers(username, )
followers = tweepy.Cursor(api.followers, id="helsinkiespana", count=200).items()
followers_lst = [follower._json for follower in followers]
df_followers = pd.DataFrame(followers_lst)
df_followers.to_csv(f'./output/followers_helsinkiespana.csv', index=False)
df_followers.head()

In [189]:
#Search
tweets = tweepy.Cursor(api.search,  
                       tweet_mode='extended', 
                       q="""
                     derechoshumanos OR madrid OR gratis OR empleo OR humanrightsday OR reactcourse OR jovenesparajovenes OR humanrights OR humandimensionday OR react OR webinarsrio14 OR penademuerte OR ddhh OR caminodesantiago OR soles4globalgoals OR ciudadaniaue OR peacekeeping OR laeuropadelosjovenes OR crimenorganizado OR seguridadhumana OR #derechoshumanos OR #madrid OR #gratis OR #empleo OR #humanrightsday OR #reactcourse OR #jovenesparajovenes OR #humanrights OR #humandimensionday OR #react OR #webinarsrio14 OR #penademuerte OR #ddhh OR #caminodesantiago OR #soles4globalgoals OR #ciudadaniaue OR #peacekeeping OR #laeuropadelosjovenes OR #crimenorganizado OR #seguridadhumana
                     """,
                       result_type='recent',
                      ).items()


In [175]:
len(search)

50

In [176]:
set_words = [set(re.findall(r'#\w*', unicodedata.normalize(
            'NFKD', e.full_text).encode('ASCII', 'ignore').decode('utf-8').lower())) for e in search]

In [177]:
set_words

[set(),
 {'#__', '#humanrights', '#notoexecution', '#taheri_moveme'},
 set(),
 {'#',
  '#canadian',
  '#citizens',
  '#humanrights',
  '#illegals',
  '#legal',
  '#porters',
  '#rcmp',
  '#roxhamroad',
  '#stripped'},
 set(),
 {'#covid19', '#madrid'},
 {'#empleo'},
 set(),
 {'#20minutos', '#madrid'},
 {'#ddhh'},
 set(),
 set(),
 {'#react', '#youtube', '#youtubersreact'},
 {'#biodiversidad'},
 {'#__', '#humanrights', '#notoexecution', '#taheri_moveme'},
 set(),
 {'#carceldeguanare', '#derechoshumanos', '#ong'},
 {'#chihuahua', '#elalamo', '#madrid', '#perdida'},
 {'#empleo', '#felizlunes', '#reclutamiento', '#rrhh'},
 {'#humanrights'},
 {'#empleo', '#it', '#java', '#madrid', '#tic'},
 set(),
 set(),
 {'#farmacias', '#madrid', '#policia', '#sanse'},
 {'#react', '#youtube', '#youtubersreact'},
 {'#100daysofcode',
  '#angular',
  '#developers',
  '#frontend',
  '#javascript',
  '#react'},
 {'#react', '#youtube', '#youtubersreact'},
 {'#adopci', '#madrid'},
 {'#gratis', '#pagina', '#web'},


In [186]:
top_hashtags = ['#derechoshumanos',
 '#madrid',
 '#gratis',
 '#empleo',
 '#humanrightsday',
 '#reactcourse',
 '#jovenesparajovenes',
 '#humanrights',
 '#humandimensionday',
 '#react',
 '#webinarsrio14',
 '#penademuerte',
 '#ddhh',
 '#caminodesantiago',
 '#soles4globalgoals',
 '#ciudadaniaue',
 '#peacekeeping',
 '#laeuropadelosjovenes',
 '#crimenorganizado',
 '#seguridadhumana']

In [187]:
top_hashtags = [e[1:] for e in top_hashtags] + top_hashtags

In [188]:
" OR ".join(top_hashtags)

'derechoshumanos OR madrid OR gratis OR empleo OR humanrightsday OR reactcourse OR jovenesparajovenes OR humanrights OR humandimensionday OR react OR webinarsrio14 OR penademuerte OR ddhh OR caminodesantiago OR soles4globalgoals OR ciudadaniaue OR peacekeeping OR laeuropadelosjovenes OR crimenorganizado OR seguridadhumana OR #derechoshumanos OR #madrid OR #gratis OR #empleo OR #humanrightsday OR #reactcourse OR #jovenesparajovenes OR #humanrights OR #humandimensionday OR #react OR #webinarsrio14 OR #penademuerte OR #ddhh OR #caminodesantiago OR #soles4globalgoals OR #ciudadaniaue OR #peacekeeping OR #laeuropadelosjovenes OR #crimenorganizado OR #seguridadhumana'

In [179]:
for e in set_words:
    intersect = e.intersection(top_hashtags)
    print(intersect)

set()
{'#humanrights'}
set()
{'#humanrights'}
set()
{'#madrid'}
{'#empleo'}
set()
{'#madrid'}
{'#ddhh'}
set()
set()
{'#react'}
set()
{'#humanrights'}
set()
{'#derechoshumanos'}
{'#madrid'}
{'#empleo'}
{'#humanrights'}
{'#madrid', '#empleo'}
set()
set()
{'#madrid'}
{'#react'}
{'#react'}
{'#react'}
{'#madrid'}
{'#gratis'}
set()
{'#madrid'}
{'#madrid'}
{'#madrid'}
{'#madrid'}
set()
set()
set()
{'#gratis'}
{'#humanrights'}
{'#madrid'}
set()
{'#ddhh'}
set()
{'#humanrights'}
set()
{'#humanrights'}
set()
{'#madrid'}
set()
{'#empleo'}


In [156]:
inters

{'#derechoshumanos'}