# In the notebook _Learning Twitter API_, we download 100 tweets from 100 people (Approx because some private users). Here we are going to preprocess the data

In [1]:
import os
import pandas as pd
from nltk.tokenize import TweetTokenizer
import nltk
import random
import string

In [2]:
# Collecting all paths
folderpath = "./tweets"
filepaths  = [os.path.join(folderpath, name) for name in os.listdir(folderpath)]

In [3]:
tweets = {}
# Looping through the files and extracting their tweets
for path in filepaths:
    username = path.split('/')[2][:-4]
    try:
        # Grabbing just the tweets
        tweets[username] = list(pd.read_csv(path, sep=',', engine='python', names=[0,1,2])[2])
    #print(username)
    except:
        print(f'This user is empty {username}')

In [4]:
tokens = {}
# Tokenizing with nltk
tt = TweetTokenizer()
for username in tweets:
    tokens[username] = []
    for tweet in tweets[username]:
        tokenized_tweet = tt.tokenize(tweet)
        tokens[username].append(tokenized_tweet) 

In [5]:
# Downloading the useless words in spanish
nltk.download("stopwords", quiet=True)
sw = nltk.corpus.stopwords.words("spanish")
sw += nltk.corpus.stopwords.words("english")

In [6]:
# We are going to clean punctuation sings too, i'm adding spanish ones
punc = string.punctuation + '–¿¡”“'
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~–¿¡”“'

In [7]:
# Adding a function that tells me if a string is a number
def is_number(string):
    try:
        float(string)
        return True
    except:
        return False

In [8]:
# Cleaning this useless words
filtered_tweets = {}
for username in tweets:
    filtered_tweets[username] = []
    for index, tweet in enumerate(tokens[username]):
        filtered_tweet = [word for word in tokens[username][index] if word not in sw and not is_number(word)]
        no_punc = [i for i in  filtered_tweet if i not in punc]
        filtered_tweets[username].append(list(filter(None, no_punc)))

In [13]:
# Just to compare all the tweets and the process with a random user
random_username = random.choice(list(tweets.keys()))
random_number = random.randint(0,4)
print(random_username, random_number)

print(tweets[random_username][random_number])
print()
print(tokens[random_username][random_number])
print()
print(filtered_tweets[random_username][random_number])

CynthiaRLandin 1
RT @Proteccion_leon: Se da atención por reporte de inundación en la colonia Brisas del Campestre en la zona poniente de nuestro municipio,…

['RT', '@Proteccion_leon', ':', 'Se', 'da', 'atención', 'por', 'reporte', 'de', 'inundación', 'en', 'la', 'colonia', 'Brisas', 'del', 'Campestre', 'en', 'la', 'zona', 'poniente', 'de', 'nuestro', 'municipio', ',', '…']

['RT', '@Proteccion_leon', 'Se', 'da', 'atención', 'reporte', 'inundación', 'colonia', 'Brisas', 'Campestre', 'zona', 'poniente', 'municipio', '…']


## La última lista que se ve en la celda de arriba, es la lista final que usaremos para obtener de qué se trata el tweet, notese que están oganizados por usuario y por número de tweet c:, la lista se llama "filtered_tweets", y se supone que ese es todo el preprocesamiento que se necesita para aplicar lo que queremos C: