In [None]:
# Mount Google Drive at /content/drive (Colab only); the data files read and
# written by this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the raw tweets (stored with the "split" JSON layout) from Drive.
tweets = pd.read_json(
    "/content/drive/MyDrive/UCSP/Data-Science-Topics/Data/all_tweets.json",
    orient="split",
)
tweets

Unnamed: 0,date,text
0,2021-07-28,@PedroCastilloTe Hablas igual a Diosdado cabel...
1,2021-07-28,@ebelinortiz @PedroCastilloTe @KeikoFujimori Y...
2,2021-07-28,#Perù #México Pedro Castillo desafía a Perú co...
3,2021-07-28,@Perulibreprensa NOS MENTISTE @PedroCastilloTe...
4,2021-07-28,"Parece que a alguien le gustó escuchar que ""ha..."
...,...,...
2315278,2022-04-30,@RichardArcePeru @PedroCastilloTe Es un gobier...
2315279,2022-04-30,@JorgeMunozPe @JNE_Peru Ahora sería bueno que ...
2315280,2022-04-30,@NoDignos @PedroCastilloTe @congresoperu Jsjsjsjs
2315281,2022-04-30,"Now playing Pedro Castillo aosto 13,2021 by Pe..."


Extract only the hashtags.

In [None]:
def extract_hashtags(text):
  """Return the hashtags found in ``text`` as one space-separated string.

  A hashtag is any whitespace-delimited word that starts with ``#``. The
  leading ``#`` is dropped and the rest of the word is kept verbatim (case
  and any trailing punctuation included).

  Args:
    text: Raw tweet text. Non-string values (e.g. NaN from a missing cell)
      are treated as containing no hashtags.

  Returns:
    The hashtag bodies joined by single spaces, or ``""`` when there are none.
  """
  if not isinstance(text, str):
    return ""

  # A bare "#" has no body; keeping it would inject stray spaces into the
  # result, so a tweet without real hashtags would no longer map to "".
  hashtags = [word[1:] for word in text.split() if word.startswith('#') and len(word) > 1]

  return " ".join(hashtags)

# Derive a hashtags-only column from the raw tweet text.
tweets['text_processed'] = tweets['text'].map(extract_hashtags)

tweets


Unnamed: 0,date,text,text_processed
0,2021-07-28,@PedroCastilloTe Hablas igual a Diosdado cabel...,
1,2021-07-28,@ebelinortiz @PedroCastilloTe @KeikoFujimori Y...,
2,2021-07-28,#Perù #México Pedro Castillo desafía a Perú co...,Perù México
3,2021-07-28,@Perulibreprensa NOS MENTISTE @PedroCastilloTe...,
4,2021-07-28,"Parece que a alguien le gustó escuchar que ""ha...",28Julio BicentenarioPeru PedroCastillo Piura G...
...,...,...,...
2315278,2022-04-30,@RichardArcePeru @PedroCastilloTe Es un gobier...,
2315279,2022-04-30,@JorgeMunozPe @JNE_Peru Ahora sería bueno que ...,
2315280,2022-04-30,@NoDignos @PedroCastilloTe @congresoperu Jsjsjsjs,
2315281,2022-04-30,"Now playing Pedro Castillo aosto 13,2021 by Pe...",


Make a copy of the processed tweets and drop the rows that have no hashtags.

In [None]:
# Keep only the tweets that contain at least one hashtag. Filtering first and
# copying the (much smaller) result avoids duplicating the full frame, and the
# explicit copy makes hashtag_tweets independent of `tweets`, so the column
# assignments made on it afterwards cannot trigger SettingWithCopyWarning.
hashtag_tweets = tweets.loc[tweets['text_processed'] != ""].copy()
hashtag_tweets

Unnamed: 0,date,text,text_processed
2,2021-07-28,#Perù #México Pedro Castillo desafía a Perú co...,perù méxico
4,2021-07-28,"Parece que a alguien le gustó escuchar que ""ha...",28julio bicentenarioperu pedrocastillo piura g...
7,2021-07-28,@PedroCastilloTe Nos reafirmaste que no eran T...,castillodesahuevate
11,2021-07-28,"Tercer día de gobierno, tercer día de fiesta. ...",asumetuvoto
25,2021-07-28,#Peru | President @PedroCastilloTe appointed 1...,peru
...,...,...,...
2315225,2022-04-30,#IStandWithCroacia Cc @PedroCastilloTe,istandwithcroacia
2315235,2022-04-30,"Los Croatas deben estar preocupados, gracias a...",castillorenunciaya fueracastillo castillorenun...
2315240,2022-04-30,@danielyovera @PedroCastilloTe @VLADIMIR_CERRO...,asambleaconstituyentenova
2315252,2022-04-30,@PazGuerraDelRio @LuzSalgado_R Panfletos y gen...,terruca asesino terrorita terrucos


Preprocessing: lowercase the hashtags, keep only alphabetic tokens, and strip accent marks.

In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize

# Fetch the NLTK "popular" data collection.
# NOTE(review): presumably needed for the tokenizer models used by
# word_tokenize -- confirm whether a smaller download would be enough.
nltk.download('popular')

In [None]:
# Lowercase every hashtag string, then keep only the purely alphabetic tokens.
hashtag_tweets['text_processed'] = (
    hashtag_tweets['text_processed']
    .map(str.lower)
    .map(lambda text: ' '.join(token for token in word_tokenize(text) if token.isalpha()))
)

In [None]:
import re
from unicodedata import normalize

def accent_mark_remover(text):
  """Strip diacritical marks from ``text`` while keeping the Spanish "ñ"/"Ñ".

  The text is decomposed (NFD) so every accent becomes a separate combining
  character (U+0300-U+036F). Those marks are then removed after any base
  character, except for a lone combining tilde that follows "n"/"N".
  Finally the result is recomposed (NFC).

  Args:
    text: String to clean.

  Returns:
    ``text`` without accent marks, in NFC form.
  """
  # -> NFD and strip diacritics. count/flags are passed by keyword because
  # handing them to re.sub positionally is deprecated since Python 3.13.
  text = re.sub(
      r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
      r"\1",
      normalize("NFD", text),
      count=0,
      flags=re.I,
  )

  # -> NFC
  text = normalize('NFC', text)

  return text

# Remove accent marks from every processed hashtag string.
hashtag_tweets['text_processed'] = hashtag_tweets['text_processed'].map(accent_mark_remover)

First, download the Spanish stop-word list from https://github.com/Alir3z4/stop-words/blob/master/spanish.txt and save it as `spanish.txt` in the working directory. Then remove the stop words.

In [None]:
# Load the base Spanish stop-word list (one word per line). The encoding is
# given explicitly so accented entries are decoded the same way regardless of
# the platform's default encoding.
with open('spanish.txt', encoding='utf-8') as f:
  sw = f.read().splitlines()

# Extra stop words added on top of the downloaded list.
# NOTE(review): these look like abbreviations, unaccented spellings (the text
# has had its accents stripped by this point) and the words for the subject of
# the tweets -- confirm the intent.
new_stop_words = ('pai','ma','sr','ud','x','q','d',
                  'tambien','asi','sera','mas','ahi','ja','bla','aun',
                  'pedro','castillo','presidente')

sw.extend(new_stop_words)

# Membership tests on a list are linear in its length; a set keeps the
# per-token lookup constant-time while `sw` itself stays a list.
sw_lookup = set(sw)

hashtag_tweets['text_processed'] = \
  hashtag_tweets['text_processed'].map(lambda x: ' '.join([word for word in word_tokenize(x) if word not in sw_lookup]))

hashtag_tweets

Unnamed: 0,date,text,text_processed
2,2021-07-28,#Perù #México Pedro Castillo desafía a Perú co...,peru mexico
4,2021-07-28,"Parece que a alguien le gustó escuchar que ""ha...",bicentenarioperu pedrocastillo piura gabineteb...
7,2021-07-28,@PedroCastilloTe Nos reafirmaste que no eran T...,castillodesahuevate
11,2021-07-28,"Tercer día de gobierno, tercer día de fiesta. ...",asumetuvoto
25,2021-07-28,#Peru | President @PedroCastilloTe appointed 1...,peru
...,...,...,...
2315225,2022-04-30,#IStandWithCroacia Cc @PedroCastilloTe,istandwithcroacia
2315235,2022-04-30,"Los Croatas deben estar preocupados, gracias a...",castillorenunciaya fueracastillo castillorenun...
2315240,2022-04-30,@danielyovera @PedroCastilloTe @VLADIMIR_CERRO...,asambleaconstituyentenova
2315252,2022-04-30,@PazGuerraDelRio @LuzSalgado_R Panfletos y gen...,terruca asesino terrorita terrucos


Make another copy of the processed tweets and drop the rows left empty by the preprocessing.

In [None]:
# Drop the rows whose processed text is empty. Filtering first and copying the
# result avoids duplicating the whole frame and yields a DataFrame that is
# independent of hashtag_tweets.
final_tweets = hashtag_tweets.loc[hashtag_tweets['text_processed'] != ""].copy()
final_tweets

Unnamed: 0,date,text,text_processed
2,2021-07-28,#Perù #México Pedro Castillo desafía a Perú co...,peru mexico
4,2021-07-28,"Parece que a alguien le gustó escuchar que ""ha...",bicentenarioperu pedrocastillo piura gabineteb...
7,2021-07-28,@PedroCastilloTe Nos reafirmaste que no eran T...,castillodesahuevate
11,2021-07-28,"Tercer día de gobierno, tercer día de fiesta. ...",asumetuvoto
25,2021-07-28,#Peru | President @PedroCastilloTe appointed 1...,peru
...,...,...,...
2315225,2022-04-30,#IStandWithCroacia Cc @PedroCastilloTe,istandwithcroacia
2315235,2022-04-30,"Los Croatas deben estar preocupados, gracias a...",castillorenunciaya fueracastillo castillorenun...
2315240,2022-04-30,@danielyovera @PedroCastilloTe @VLADIMIR_CERRO...,asambleaconstituyentenova
2315252,2022-04-30,@PazGuerraDelRio @LuzSalgado_R Panfletos y gen...,terruca asesino terrorita terrucos


Keep only the date and the processed text.

In [None]:
# Keep only the two columns that get saved. Selecting with a list of labels
# already returns a DataFrame carrying exactly these column names, so no
# pd.DataFrame(...) wrapper or column re-assignment is needed.
final_tweets = final_tweets[['date', 'text_processed']]
final_tweets

Unnamed: 0,date,text_processed
2,2021-07-28,peru mexico
4,2021-07-28,bicentenarioperu pedrocastillo piura gabineteb...
7,2021-07-28,castillodesahuevate
11,2021-07-28,asumetuvoto
25,2021-07-28,peru
...,...,...
2315225,2022-04-30,istandwithcroacia
2315235,2022-04-30,castillorenunciaya fueracastillo castillorenun...
2315240,2022-04-30,asambleaconstituyentenova
2315252,2022-04-30,terruca asesino terrorita terrucos


Save the processed tweets.

In [None]:
# Write the processed tweets to Drive in the "split" JSON layout, without the index.
final_tweets.to_json(
    "/content/drive/MyDrive/UCSP/Data-Science-Topics/Data/hashtag_processed.json",
    orient="split",
    index=False,
)

Reload the saved file to check its contents.

In [None]:
# Read the saved file back to confirm it round-trips.
test = pd.read_json(
    "/content/drive/MyDrive/UCSP/Data-Science-Topics/Data/hashtag_processed.json",
    orient="split",
)
test

Unnamed: 0,date,text_processed
0,2021-07-28,peru mexico
1,2021-07-28,bicentenarioperu pedrocastillo piura gabineteb...
2,2021-07-28,castillodesahuevate
3,2021-07-28,asumetuvoto
4,2021-07-28,peru
...,...,...
222328,2022-04-30,istandwithcroacia
222329,2022-04-30,castillorenunciaya fueracastillo castillorenun...
222330,2022-04-30,asambleaconstituyentenova
222331,2022-04-30,terruca asesino terrorita terrucos
