# Análisis de Redes Sociales

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import json
import os
import re
import nltk
from nltk.corpus import stopwords
import string
from unidecode import unidecode

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\casti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 2. Carga de Archivos

In [3]:
def load_tweets(file_path):
    tweets = []
    with open(file_path, 'r', encoding='utf-16') as file:
        for line in file:
            line = line.strip()
            tweets.append(json.loads(line))
    return pd.DataFrame(tweets)

In [4]:
df = load_tweets('data/tioberny.txt')

In [5]:
df.head()

Unnamed: 0,id,id_str,url,date,user,lang,rawContent,replyCount,retweetCount,likeCount,...,coordinates,inReplyToTweetId,inReplyToTweetIdStr,inReplyToUser,source,sourceUrl,sourceLabel,media,card,_type
0,1834281080029110288,1834281080029110288,https://x.com/La_ReVoluZzion/status/1834281080...,2024-09-12 17:21:03+00:00,"{'id': 1435062946598694914, 'id_str': '1435062...",es,"_\nConfirmado Compañeres,\n\nEl impuesto por l...",0,0,0,...,,,,,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,"{'photos': [], 'videos': [], 'animated': []}",,snscrape.modules.twitter.Tweet
1,1834252464092069901,1834252464092069901,https://x.com/XelaNewsGt/status/18342524640920...,2024-09-12 15:27:20+00:00,"{'id': 956650778634145792, 'id_str': '95665077...",es,#URGENTE Lo que los medios #faferos no informa...,12,80,142,...,,,,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,"{'photos': [], 'videos': [{'thumbnailUrl': 'ht...",,snscrape.modules.twitter.Tweet
2,1834280919336976681,1834280919336976681,https://x.com/M24095273/status/183428091933697...,2024-09-12 17:20:25+00:00,"{'id': 1087057038755143680, 'id_str': '1087057...",es,@IvanDuque @BArevalodeLeon Con que usaste PEGA...,0,0,0,...,,1.834197e+18,1.834197215415599e+18,"{'id': 77653794, 'id_str': '77653794', 'userna...","<a href=""https://mobile.twitter.com"" rel=""nofo...",https://mobile.twitter.com,Twitter Web App,"{'photos': [], 'videos': [], 'animated': []}",,snscrape.modules.twitter.Tweet
3,1834280512933732694,1834280512933732694,https://x.com/carlosalbesc/status/183428051293...,2024-09-12 17:18:48+00:00,"{'id': 2881001877, 'id_str': '2881001877', 'ur...",es,@IvanDuque @BArevalodeLeon Entre Ellos se enti...,0,0,0,...,,1.834197e+18,1.834197215415599e+18,"{'id': 77653794, 'id_str': '77653794', 'userna...","<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,"{'photos': [], 'videos': [], 'animated': []}",,snscrape.modules.twitter.Tweet
4,1834279986254987428,1834279986254987428,https://x.com/Brenda_AGN/status/18342799862549...,2024-09-12 17:16:42+00:00,"{'id': 3013862206, 'id_str': '3013862206', 'ur...",es,El presidente @BArevalodeLeon y la vicepreside...,0,0,0,...,,,,,"<a href=""http://twitter.com/download/android"" ...",http://twitter.com/download/android,Twitter for Android,{'photos': [{'url': 'https://pbs.twimg.com/med...,,snscrape.modules.twitter.Tweet


## 3. Limpieza y Preprocesamiento de Datos

Convertir los JSONs anidados en columnas

In [6]:
nested_columns = ['user', 'retweetedTweet', 'quotedTweet', 'mentionedUsers', 'inReplyToUser', 'media']

In [7]:
def parse_nested_json(x):
    if isinstance(x, str):
        try:
            return json.loads(x.replace("'", '"'))
        except json.JSONDecodeError:
            return None
    else:
        return x

In [8]:
for column in nested_columns:
    if column in df.columns:
        df[column] = df[column].apply(parse_nested_json)

Convertir la columna de fecha a formato datetime

In [9]:
df['date'] = pd.to_datetime(df['date'])

In [10]:
stop_words = set(stopwords.words('spanish'))

Definimos el preprocesamiento de texto

In [11]:
def preprocess_text(text):
    # Convertir el texto a minúsculas
    text = text.lower()

    # Quitar URLs
    text = re.sub(r'http\S+', '', text)

    # Quitar caracteres especiales como `#`, `@` y apóstrofes
    text = re.sub(r'[@#\'’]', '', text)

    # Quitar números
    text = re.sub(r'\d+', '', text)

    # Quitar signos de puntuación
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Normalizar caracteres acentuados
    text = unidecode(text)

    # Quitar stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])

    return text

Aplicar preprocesamiento de texto

In [12]:
df['processed_text'] = df['rawContent'].apply(preprocess_text)

In [13]:
# Extracción de menciones de usuarios
df['mentioned_users'] = df['mentionedUsers'].apply(lambda users: [user['username'] for user in users])

# Identificar si es un retweet o respuesta y las interacciones
df['is_retweet'] = df['retweetedTweet'].apply(lambda x: pd.notnull(x))
df['is_reply'] = df['inReplyToTweetId'].apply(lambda x: pd.notnull(x))

In [14]:
df = df.copy()

# Normalizar nombres de usuario
df.loc[:, 'user_normalized'] = df['user'].apply(lambda x: 
                                                x['username'].lower() 
                                                if isinstance(x, dict) # Si es un diccionario
                                                else x
                                                )

# Normalizar las menciones de usuarios
df.loc[:, 'mentioned_users_normalized'] = df['mentioned_users'].apply(lambda users: 
                                                                      [user.lower() for user in users]
                                                                      )

In [15]:
edges = []

# Crear relaciones de menciones y respuestas
for _, row in df.iterrows():
    if row['mentioned_users_normalized']:
        for mentioned in row['mentioned_users_normalized']:
            edges.append((row['user_normalized'], mentioned, 'mention'))

    if row['is_reply']:
        edges.append((row['user_normalized'], row['inReplyToTweetId'], 'reply'))

edges_df = pd.DataFrame(edges, columns=['source', 'target', 'type'])

In [16]:
G = nx.from_pandas_edgelist(edges_df, source='source', target='target', edge_attr='type', create_using=nx.DiGraph())

In [17]:
print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

Graph has 3831 nodes and 14742 edges
