# Data understanding

We will structure this section as follows:
1. [Collect data](#1-collect-data)
2. [Describe data](#2-describe-data)
3. [Explore data](#3-explore-data)

## 1. Collect data

In [None]:
import os
import tweepy as tw
import pandas as pd
import json
import credentials as cr

# Twitter API credentials, read from the local `credentials` module (imported as `cr`).
twitter_keys = dict(
  consumer_key=cr.api_key,
  consumer_secret=cr.api_secret_key,
  access_token_key=cr.acces_token,
  access_token_secret=cr.acces_token_secret,
)

# Build the auth handler from the consumer key pair, then attach the access token pair.
auth = tw.OAuthHandler(
  twitter_keys['consumer_key'],
  twitter_keys['consumer_secret'],
)
auth.set_access_token(
  twitter_keys['access_token_key'],
  twitter_keys['access_token_secret'],
)

# API client used by the query cells below; wait_on_rate_limit=True is passed
# so that tweepy waits when the rate limit is reached instead of failing.
api = tw.API(auth, wait_on_rate_limit=True)

In [None]:
def making_the_query(search_query, max_tweets=2, date_since="2018-11-16"):
  """Search tweets matching a keyword and return them as a list of rows.

  The keyword is combined with the fixed suffix ' medellin -filter:retweets'
  and the search is issued through the module-level ``api`` client with
  ``lang="es"`` and ``tweet_mode="extended"``.

  Args:
    search_query: Keyword (string) used to filter the request.
    max_tweets: Maximum number of tweets to pull from the cursor.
      Defaults to 2, the value that was previously hard-coded.
    date_since: Value passed as the ``since`` search argument.
      Defaults to "2018-11-16", the value that was previously hard-coded.

  Returns:
    A list with one inner list per fetched tweet, laid out as
    [full_text, user.screen_name, user.location, created_at, id,
    retweet_count, favorite_count].
  """
  query = search_query + ' medellin -filter:retweets'

  tweets = tw.Cursor(api.search,
                     q=query,
                     lang="es",
                     since=date_since,
                     tweet_mode="extended").items(max_tweets)

  # One row per tweet; the field order here is the positional layout that
  # create_the_df reads (indices 0..6).
  return [
      [
          tweet.full_text,
          tweet.user.screen_name,
          tweet.user.location,
          tweet.created_at,
          tweet.id,
          tweet.retweet_count,
          tweet.favorite_count,
      ]
      for tweet in tweets
  ]

In [None]:
def create_the_df(list_of_tweets):
  """Build a DataFrame from a list of tweet rows.

  Args:
    list_of_tweets: List of indexable rows. Positions 0..6 of each row are
      read as text, user, location, date, tweet id, retweet count and like
      count, in that order; any further positions are ignored.

  Returns:
    A pandas DataFrame with the columns 'texto', 'usuario', 'ubicacion',
    'fecha', 'tweet_id', 'numero_rt' and 'numero_likes', one row per tweet.
    An empty input yields an empty DataFrame that still has these columns.

  Raises:
    IndexError: If a row has fewer than seven elements.
  """
  # Column names in the same order as the positional fields of each row.
  columnas = (
      'texto',
      'usuario',
      'ubicacion',
      'fecha',
      'tweet_id',
      'numero_rt',
      'numero_likes',
  )

  # Transpose the rows into one list per column.
  dict_to_fill = {
      columna: [row[posicion] for row in list_of_tweets]
      for posicion, columna in enumerate(columnas)
  }

  return pd.DataFrame.from_dict(dict_to_fill)

In [None]:
# Keywords selected for the search. Each one is wrapped in double quotes
# (presumably so the search treats it as an exact phrase); the quotes are
# stripped again before the keyword is stored in the DataFrame.
palabras_claves = [
                   '"movilidad sostenible"', '"movilidad inteligente"', '"servicios publicos"',
                   '"energias alternativas"', '"reciclaje"', '"energias renovables"',
                   '"urbanismo ecologico"', '"urbanismo"', '"desarrollo rural"',
                   '"bienestar animal"', '"biodiversidad"', '"energias limpias"',
                   '"movilidad"','"reciclar"', '"energias"', '"rural"',
]

# Run one query per keyword and collect the per-keyword DataFrames.
frames = []
for search_query in palabras_claves:

  list_of_tweets = making_the_query(search_query)
  df = create_the_df(list_of_tweets)
  # New column holding the keyword (without quotes) that produced these rows.
  df['palabra_clave'] = search_query.replace('"','')
  frames.append(df)

# Join everything once at the end. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so pd.concat is used instead.
df_final = pd.concat(frames, ignore_index=True)

In [None]:
# Display the combined DataFrame with the tweets of all keywords.
df_final

## 2. Describe data

## 3. Explore data