In [1]:
import pandas as pd
import re
import emoji
from collections import Counter

In [2]:
df = pd.read_csv('tweets.csv')

In [3]:
def hashtags(text):
    hashtags = re.findall(r'#\w+', text)
    return hashtags if hashtags else None


def usuarios(text):
    usuario = re.findall(r'@\w+', text)
    return usuario if usuario else None


def URL(text):
    urls = re.findall(r'https?://\S+', text)
    return urls if urls else None


def Emoji(text):
    return [char for char in text if char in emoji.EMOJI_DATA]


def Emoticones(text):
    emoticones = re.findall(r'(?::|;|=|X|x)(?:-)?(?:[3DPdpOoSs]|[()]{1,}|[d]{1,})', text)
    return emoticones if emoticones else None


def Horas(text):
    horas = re.findall(r'\b(?:[01]?\d|2[0-3]):[0-5]\d\b', text)
    return horas if horas else None

In [4]:
listaUsuarios = df['text'].apply(usuarios).dropna().sum()
listaHashtags = df['text'].apply(hashtags).dropna().sum()
listaURLS = df['text'].apply(URL).dropna().sum()
listaEmojis = df['text'].apply(Emoji).dropna().sum()
listaEmoticones = df['text'].apply(Emoticones).dropna().sum()
listaHoras = df['text'].apply(Horas).dropna().sum()

In [5]:
totalHashtags, totalUsuarios, totalURls, totalEmojis, totalEmoticones, totalHoras = Counter(listaHashtags), Counter(listaUsuarios), Counter(listaURLS), Counter(listaEmojis), Counter(listaEmoticones), Counter(listaHoras)
topHashtags, topUsuarios, topURLs, topEmojis, topEmoticones, topHoras = totalHashtags.most_common(10), totalUsuarios.most_common(10), totalURls.most_common(10), totalEmojis.most_common(10), totalEmoticones.most_common(10), totalHoras.most_common(10)

resultados = []
categorias = {
    'Hashtags' : (totalHashtags, topHashtags),
    'Usuarios' : (totalUsuarios, topUsuarios),
    'URLs' : (totalURls, topURLs),
    'Emojis' : (totalEmojis, topEmojis), 
    'Emoticones' : (totalEmoticones, topEmoticones), 
    'Horas' : (totalHoras,  topHoras)
}

for categoria, (totales, lostop) in categorias.items():
    top10 = ', '.join([f'{item} : {freq}' for item, freq in lostop])
    aux = pd.DataFrame({
        'String' : [categoria],
        'Frecuecia' : [sum(totales.values())],
        'Top 10' : [top10]
    })
    resultados.append(aux)

df_chido = pd.concat(resultados, ignore_index=True)
df_chido

Unnamed: 0,String,Frecuecia,Top 10
0,Hashtags,298,"#UnidosTodosX : 26, #DeZurdaTeam : 26, #GranHe..."
1,Usuarios,194,"@petrogustavo : 7, @DeZurdaTeam_ : 6, @JMilei ..."
2,URLs,8,https://www.eldiario.es/1_a4fa72?utm_campaign=...
3,Emojis,1161,"😭 : 67, 🥺 : 54, ❤ : 51, 🏻 : 38, 🙏 : 36, ✨ : 34..."
4,Emoticones,145,"xp : 76, xd : 19, xo : 15, :3 : 8, xs : 6, :) ..."
5,Horas,15,"23:58 : 4, 17:45 : 3, 11:11 : 2, 3:00 : 2, 2:3..."


In [7]:
topHoras, topEmojis, topEmoticones

([('23:58', 4),
  ('17:45', 3),
  ('11:11', 2),
  ('3:00', 2),
  ('2:38', 1),
  ('3:12', 1),
  ('4:20', 1),
  ('00:01', 1)],
 [('😭', 67),
  ('🥺', 54),
  ('❤', 51),
  ('🏻', 38),
  ('🙏', 36),
  ('✨', 34),
  ('🤣', 28),
  ('😂', 25),
  ('🏽', 22),
  ('🏼', 21)],
 [('xp', 76),
  ('xd', 19),
  ('xo', 15),
  (':3', 8),
  ('xs', 6),
  (':)', 4),
  ('Xo', 3),
  (':)))', 2),
  (':(', 2),
  ('xD', 2)])

In [9]:
topHashtags, topURLs

([('#UnidosTodosX', 26),
  ('#DeZurdaTeam', 26),
  ('#GranHermano', 21),
  ('#granhermano', 9),
  ('#gelp', 7),
  ('#OTDirecto5E', 4),
  ('#gh23', 4),
  ('#NadieComoTú', 3),
  ('#MicroCuento', 3),
  ('#Bailando2023', 3)],
 [('https://www.eldiario.es/1_a4fa72?utm_campaign=botonera-share&utm_medium=social&utm_source=twitter',
   2),
  ('https://elfaro.net/es/202312/columnas/27191/el-voto-de-la-frustracion-gano-el-tour-electoral-de-2023',
   1),
  ('https://www.youtube.com/watch?v=1QvBbg38UY8&ab_channel=ElMostrador', 1),
  ('https://nitter.perennialte.ch/MaxKaiser75/status/1743261455326007754/video/1',
   1),
  ('https://signal.group/#CjQKIAL1PfYMtji-3OMw24eFifKyZSI9bNbHpdvfWONAMrnvEhAgxrDIgXSX8-35VZTa6H_n',
   1),
  ('https://t.me/tierrasant', 1),
  ('https://twitter.com/MaxKaiser75/status/1743261455326007754/video/1', 1)])

In [10]:
topUsuarios

[('@petrogustavo', 7),
 ('@DeZurdaTeam_', 6),
 ('@JMilei', 4),
 ('@biobio', 3),
 ('@radiocarab', 3),
 ('@TTISantiago', 3),
 ('@mop_chile', 3),
 ('@mop_rm', 3),
 ('@MabelLaraNews', 2),
 ('@_somosmadrid', 2)]