# In the notebook _Learning Twitter API_, we downloaded 100 tweets from 100 people (approximately, because some users are private). Here we are going to preprocess that data.

In [122]:
import os
import pandas as pd
from nltk.tokenize import TweetTokenizer
import nltk
import random
import string

In [123]:
# Build the path of every entry found inside the tweets folder
folderpath = "./tweets"
filepaths = list(map(lambda entry: os.path.join(folderpath, entry), os.listdir(folderpath)))

In [124]:
# Load the tweets of every user into a dict: username -> list of column-2 values.
tweets = {}
for path in filepaths:
    # The username is the file name without its extension. Taking it from the
    # basename keeps this working for any folder depth, path separator or
    # extension length, unlike indexing into path.split('/').
    username = os.path.splitext(os.path.basename(path))[0]
    try:
        # names=[0, 1, 2] labels the three columns; only column 2 (the tweet) is kept
        tweets[username] = list(pd.read_csv(path, sep=',', engine='python', names=[0, 1, 2])[2])
    except Exception:
        # Best effort: a file that cannot be read is reported and skipped.
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt the loop.
        print(f'This user is empty {username}')

In [125]:
# Tokenize every tweet (lower-cased) with nltk's TweetTokenizer.
# Result: username -> list of token lists, one per tweet.
tokens = {}
tt = TweetTokenizer()
for username, user_tweets in tweets.items():
    # Keep exactly one entry per tweet, in the same order as tweets[username],
    # so both structures can be indexed with the same position.
    # A missing text cell is read by pandas as NaN (a float, with no .lower());
    # such entries get an empty token list instead of crashing the cell.
    tokens[username] = [
        tt.tokenize(tweet.lower()) if isinstance(tweet, str) else []
        for tweet in user_tweets
    ]

In [126]:
# Fetch nltk's stop-word corpus and merge the Spanish and English lists
nltk.download("stopwords", quiet=True)
sw = [
    *nltk.corpus.stopwords.words("spanish"),
    *nltk.corpus.stopwords.words("english"),
]

In [127]:
# Punctuation signs to strip as well: the ASCII set plus a few extra (Spanish) ones
punc = ''.join([string.punctuation, '‚Äì¬ø¬°‚Äù‚Äú'])
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~‚Äì¬ø¬°‚Äù‚Äú'

In [128]:
# Adding a function that tells me if a string is a number
def is_number(string):
    """Tell whether `string` represents a number.

    Args:
        string: the value to check, normally a token (str).

    Returns:
        True if `float(string)` succeeds and, for str input, the text
        contains at least one digit; False otherwise.
    """
    # float() also parses the words 'nan', 'inf' and 'infinity' (any case,
    # optionally signed). Those are ordinary words in a tweet, not numbers,
    # so a string without a single digit is never treated as numeric.
    if isinstance(string, str) and not any(char.isdigit() for char in string):
        return False
    try:
        float(string)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return False
    return True

In [129]:
# Remove stop words, numbers and punctuation from every tokenized tweet.
# Result: username -> list of cleaned token lists, aligned with tokens[username].
stop_words = set(sw)  # set membership instead of scanning the stop-word list per token
filtered_tweets = {}
for username in tweets:
    filtered_tweets[username] = [
        [
            word
            for word in tokenized_tweet
            if word
            and word not in stop_words
            and not is_number(word)
            # Drop tokens made up only of punctuation characters. A plain
            # `word not in punc` is a substring test against one long string:
            # it lets '...' or '!!' through and only removes accidental
            # substrings such as '()'. Note that emoticons built purely from
            # punctuation characters are removed by this check too.
            and not all(char in punc for char in word)
        ]
        for tokenized_tweet in tokens[username]
    ]

In [130]:
# Compare the raw, tokenized and filtered versions of one random tweet
random_username = random.choice(list(tweets.keys()))
# Pick a position among the user's first five tweets without running past the
# end of a shorter list. The index must be stored under the same name that the
# prints below read (`random_number`); otherwise a fresh kernel hits a NameError.
random_number = random.randrange(min(5, len(tweets[random_username])))
print(random_username, random_number)

print(tweets[random_username][random_number])
print()
print(tokens[random_username][random_number])
print()
print(filtered_tweets[random_username][random_number])

leon_orale 2
#sinmierdoal√©xito Te compartimos la historia de Mafer egresada de la generaci√≥n #17.
"Mi experiencia en √≥rale fue incre√≠ble y la sigue siendo aprend√≠ bastantes cosas, al conocer mis miedos, mis talentos ocultos y adem√°s a crecer tanto en mi persona como en mi campo laboral. https://t.co/50AzGpIbrZ

['#sinmierdoal√©xito', 'te', 'compartimos', 'la', 'historia', 'de', 'mafer', 'egresada', 'de', 'la', 'generaci√≥n', '#17', '.', '"', 'mi', 'experiencia', 'en', '√≥rale', 'fue', 'incre√≠ble', 'y', 'la', 'sigue', 'siendo', 'aprend√≠', 'bastantes', 'cosas', ',', 'al', 'conocer', 'mis', 'miedos', ',', 'mis', 'talentos', 'ocultos', 'y', 'adem√°s', 'a', 'crecer', 'tanto', 'en', 'mi', 'persona', 'como', 'en', 'mi', 'campo', 'laboral', '.', 'https://t.co/50azgpibrz']

['#sinmierdoal√©xito', 'compartimos', 'historia', 'mafer', 'egresada', 'generaci√≥n', '#17', 'experiencia', '√≥rale', 'incre√≠ble', 'sigue', 'siendo', 'aprend√≠', 'bastantes', 'cosas', 'conocer', 'miedos', 'talentos'

## La última lista que se ve en la celda de arriba es la lista final que usaremos para obtener de qué se trata el tweet; nótese que están organizados por usuario y por número de tweet c:. La lista se llama "filtered_tweets", y se supone que ese es todo el preprocesamiento que se necesita para aplicar lo que queremos C:

In [131]:
# A deliberately simple model: a hand-picked keyword list, grouped by topic,
# that is later used to classify the tweets.
important_words = [
    'zool√≥gico', 'zoologico', 'zoo',
    'museo',
    'programaci√≥n', 'programacion',
    'lectura', 'leer', 'leyendo', 'libro', 'le√≠', 'lei',
    'beca', 'becar',
    'caf√©', 'cafe',
    'canci√≥n', 'cantar', 'cancion',
    'pel√≠culas', 'pelicula', 'pel√≠cula', 'peliculas',
    'series', 'serie',
    'm√∫sica', 'danza', 'musica',
    'postres',
]

In [132]:
# Defining a simple way to detect a word in a filtered tweet.
def is_here(words, tweet):
    """Return the entries of `words` that are found in `tweet`.

    Membership is checked with the `in` operator on `tweet`, and the result
    keeps the order (and any repetitions) of `words`.
    """
    return [candidate for candidate in words if candidate in tweet]

In [133]:
# Group the raw tweets under every keyword found among their tokens.
# Each stored entry is the tweet followed by its author.
important_tweets = {keyword: [] for keyword in important_words}
for username, user_tweets in tweets.items():
    for position, tweet in enumerate(user_tweets):
        # tokens[username] is indexed with the same position as the raw tweet
        for hit in is_here(important_words, tokens[username][position]):
            important_tweets[hit].append(tweet + f'    by: {username}')

In [134]:
# Print every keyword that matched at least one tweet, followed by its tweets.
# The loop variables are deliberately not named `tweets`/`tweet`: at notebook
# top level that would rebind the global `tweets` dict (username -> tweets) to a
# plain list and break any re-run of the earlier cells.
for key, matched_tweets in important_tweets.items():
    if matched_tweets:
        print(key.capitalize())
        for matched_tweet in matched_tweets:
            print(f'*{matched_tweet}')
            print('------------------')
        print()
        print('____________________________________________')

Zool√≥gico
*Un entrego mas de Dise√±os Green Day para el zool√≥gico de Le√≥n, Gto https://t.co/VvAnKm1ppI    by: DayDisenos
------------------

____________________________________________
Museo
*Colecci√≥n del Museo de Arqueolog√≠a del AHML.
Olla antropomorfa encontrada en San Miguel; figura femenina en posici√≥n de hinojos, tiene restos de pintura roja. Se expone en el AHML y en https://t.co/N3c98k4CKZ podr√°s conocer m√°s de estos testimonios prehisp√°nicos de la regi√≥n. https://t.co/97RWlewl5R    by: ahmleon
------------------
*RT @IECGuanajuato: El experimentado artista mexicano Benjam√≠n Romero Duarte es el pr√≥ximo hu√©sped del Museo Casa Diego Rivera. No te pierda‚Ä¶    by: EzraVldk
------------------
*RT @CulturaColectiv: Si el #D√≠aDeMuertos es tu fiesta nacional favorita, entonces este museo te va a encantar üíÄ https://t.co/ySxFyFWzdC    by: Fercalejandri
------------------

____________________________________________
Programaci√≥n
*RT @FIACmx: ¬°La IV Muestra de Danza C