<a href="https://colab.research.google.com/github/Ajmyquira/tweets-topic-modelling/blob/master/2-preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tweet text preprocessing

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset files stored there can
# be read and written by the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the raw tweets dataset (JSON in "split" orientation) from the mounted Drive
raw_tweets_path = "/content/drive/MyDrive/UCSP/Data-Science-Topics/Data/all_tweets.json"
tweets = pd.read_json(raw_tweets_path, orient="split")
tweets

Unnamed: 0,date,text
0,2021-07-28,@PedroCastilloTe Hablas igual a Diosdado cabel...
1,2021-07-28,@ebelinortiz @PedroCastilloTe @KeikoFujimori Y...
2,2021-07-28,#Perù #México Pedro Castillo desafía a Perú co...
3,2021-07-28,@Perulibreprensa NOS MENTISTE @PedroCastilloTe...
4,2021-07-28,"Parece que a alguien le gustó escuchar que ""ha..."
...,...,...
2315278,2022-04-30,@RichardArcePeru @PedroCastilloTe Es un gobier...
2315279,2022-04-30,@JorgeMunozPe @JNE_Peru Ahora sería bueno que ...
2315280,2022-04-30,@NoDignos @PedroCastilloTe @congresoperu Jsjsjsjs
2315281,2022-04-30,"Now playing Pedro Castillo aosto 13,2021 by Pe..."


In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize

# Fetch NLTK's 'popular' data collection.
# NOTE(review): presumably this is what supplies the tokenizer data used by
# word_tokenize below; a smaller package may be enough — TODO confirm.
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

Preprocessing part

In [None]:
# Patterns are compiled once and reused for every tweet.

# "RT" / "cc" markers, matched as whole words only so that the same letters
# inside ordinary words (e.g. "accion", "MUERTE") are left intact.
_RETWEET_MARK_RE = re.compile(r'\b(?:RT|cc)\b')

# Mentions and hashtags. \w is Unicode-aware for str patterns, so accented
# hashtags such as "#México" are removed whole instead of leaving "éxico".
_MENTION_HASHTAG_RE = re.compile(r'[@#]\w+')

# Links (raw string, so \S is not parsed as an invalid string escape).
_URL_RE = re.compile(r'http\S+')

# Punctuation signs: , " | ' . ! ¿ ?
_PUNCTUATION_RE = re.compile(r"""[,"|'.!¿?]""")


def clean_tweet_text(text):
  """Clean the raw text of one tweet.

  Steps, in order: drop the "RT"/"cc" markers, drop @mentions and #hashtags,
  drop links, drop punctuation signs, lowercase, and finally keep only the
  tokens that are purely alphabetic.

  Args:
    text: raw tweet text (str).

  Returns:
    The cleaned text as a single string of space-separated lowercase
    alphabetic tokens; the empty string when nothing survives.
  """
  text = _RETWEET_MARK_RE.sub('', text)
  text = _MENTION_HASHTAG_RE.sub('', text)
  text = _URL_RE.sub('', text)
  text = _PUNCTUATION_RE.sub('', text)
  text = text.lower()
  # Remove all non alphabetic tokens
  return ' '.join(word for word in word_tokenize(text) if word.isalpha())


# A single pass over the dataset instead of one pass per cleaning step
tweets['text_processed'] = tweets['text'].map(clean_tweet_text)

Remove the accent marks, keeping the tilde of the letter ñ.

In [None]:
import re
from unicodedata import normalize

# A base character followed by one or more combining diacritics
# (U+0300-U+036F); the base character is captured as group 1. An "n" that is
# directly followed by a lone combining tilde (U+0303) is excluded from the
# match, and IGNORECASE extends that exception to "N", so "ñ"/"Ñ" survive.
_DIACRITICS_RE = re.compile(
    r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
    flags=re.IGNORECASE,
)

def accent_mark_remover(text):
  """Strip accent marks from `text` while keeping the letter ñ/Ñ.

  The text is decomposed (NFD) so every accent becomes a separate combining
  character, those combining characters are removed, and the result is
  recomposed (NFC) so the preserved "n" + tilde pairs become "ñ" again.

  Args:
    text: the string to clean.

  Returns:
    The string without diacritics, in NFC form.
  """
  # -> NFD and drop the diacritics
  decomposed = normalize("NFD", text)
  stripped = _DIACRITICS_RE.sub(r"\1", decomposed)

  # -> NFC
  return normalize("NFC", stripped)

# Strip the accent marks from every processed tweet, then show the frame
tweets['text_processed'] = tweets['text_processed'].map(accent_mark_remover)

tweets


Unnamed: 0,date,text,text_processed
0,2021-07-28,@PedroCastilloTe Hablas igual a Diosdado cabel...,hablas igual a diosdado cabello chavez maduro ...
1,2021-07-28,@ebelinortiz @PedroCastilloTe @KeikoFujimori Y...,y gente como ud son los necios que a pesar de ...
2,2021-07-28,#Perù #México Pedro Castillo desafía a Perú co...,u exico pedro castillo desafia a peru con el n...
3,2021-07-28,@Perulibreprensa NOS MENTISTE @PedroCastilloTe...,nos mentiste el sr representa lo opuesto a tu ...
4,2021-07-28,"Parece que a alguien le gustó escuchar que ""ha...",parece que a alguien le gusto escuchar que hab...
...,...,...,...
2315278,2022-04-30,@RichardArcePeru @PedroCastilloTe Es un gobier...,es un gobierno incapaz corrupto y ademas indol...
2315279,2022-04-30,@JorgeMunozPe @JNE_Peru Ahora sería bueno que ...,ahora seria bueno que ud encabece una gran mar...
2315280,2022-04-30,@NoDignos @PedroCastilloTe @congresoperu Jsjsjsjs,jsjsjsjs
2315281,2022-04-30,"Now playing Pedro Castillo aosto 13,2021 by Pe...",now playing pedro castillo aosto by pedro cast...


In [None]:
# Keep only the date and the processed text in a separate frame
kept_columns = ['date', 'text_processed']
tweets_processed = pd.DataFrame(tweets[kept_columns])
tweets_processed.columns = kept_columns

In [None]:
# Inspect the Python type of the value stored under index label 0 of 'date'
type(tweets_processed.loc[0, 'date'])

pandas._libs.tslibs.timestamps.Timestamp

First, download the Spanish stop-word list from https://github.com/Alir3z4/stop-words/blob/master/spanish.txt and place it next to this notebook as `spanish.txt`. Then remove the stop words from the processed tweets.

In [None]:
# Load the downloaded stop-word list, one word per line. The encoding is given
# explicitly because the list holds accented words and the platform default
# encoding is not guaranteed to be UTF-8.
with open('spanish.txt', encoding='utf-8') as f:
  sw = f.read().splitlines()

new_stop_words = ('pai','ma','sr','ud','x','q','d',
                  'tambien','asi','sera','mas','ahi','ja','bla','aun',
                  'pedro','castillo','presidente')

# Adding the new stop words
sw.extend(new_stop_words)

# The tweets have already had their accent marks stripped, so any accented
# entry of the list could never match a token. Add the accent-free form of
# every stop word, and keep everything in a set so each membership test is
# constant-time instead of a scan over the whole list.
sw_lookup = set(sw) | {accent_mark_remover(word) for word in sw}

# Removing the stop words
tweets_processed['text_processed'] = tweets_processed['text_processed'].map(
  lambda x: ' '.join(word for word in word_tokenize(x) if word not in sw_lookup))

tweets_processed

Unnamed: 0,date,text_processed
0,2021-07-28,hablas diosdado cabello chavez maduro castros
1,2021-07-28,gente necios evidencias siguen creyendo cuento...
2,2021-07-28,u exico desafia peru nombramiento gabinete rad...
3,2021-07-28,mentiste representa opuesto discurso abiertame...
4,2021-07-28,alguien gusto escuchar habra trenes discurso
...,...,...
2315278,2022-04-30,gobierno incapaz corrupto ademas indolente pueblo
2315279,2022-04-30,seria encabece marcha
2315280,2022-04-30,jsjsjsjs
2315281,2022-04-30,now playing aosto by


Make a copy of the processed tweets and drop the rows whose text is empty.

In [None]:
# Keep an independent copy of the processed tweets without the empty rows
has_text = tweets_processed['text_processed'] != ""
final_tweets = tweets_processed.loc[has_text].copy()
final_tweets

Unnamed: 0,date,text_processed
0,2021-07-28,hablas diosdado cabello chavez maduro castros
1,2021-07-28,gente necios evidencias siguen creyendo cuento...
2,2021-07-28,u exico desafia peru nombramiento gabinete rad...
3,2021-07-28,mentiste representa opuesto discurso abiertame...
4,2021-07-28,alguien gusto escuchar habra trenes discurso
...,...,...
2315278,2022-04-30,gobierno incapaz corrupto ademas indolente pueblo
2315279,2022-04-30,seria encabece marcha
2315280,2022-04-30,jsjsjsjs
2315281,2022-04-30,now playing aosto by


In [None]:
# Write the processed tweets to Drive as JSON ("split" orientation, no index)
processed_tweets_path = "/content/drive/MyDrive/UCSP/Data-Science-Topics/Data/tweets_text_processed.json"
final_tweets.to_json(processed_tweets_path, orient="split", index=False)