In [None]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import os
from sentence_transformers import SentenceTransformer, util
import torch
from dotenv import load_dotenv
from mongo_wrapper.mongo_wrapper import MongoWrapper

# states to categorize
# The 32 state names that posts are bucketed by. load_state_posts() compiles
# one case-insensitive, whole-word regex per entry and searches each post's
# text with it, so every entry must be spelled exactly as it is expected to
# appear in the posts (accents included).
# NOTE: "Baja California" is a prefix of "Baja California Sur"; any matching
# code has to take that overlap into account.
STATES = [
    "Guanajuato",
    "Michoacán",
    "Sinaloa",
    "Chihuahua",
    "Guerrero",
    "Tamaulipas",
    "Baja California",
    "Zacatecas",
    "Colima",
    "Jalisco",
    "Aguascalientes",
    "Baja California Sur",
    "Campeche",
    "Coahuila",
    "Durango",
    "Hidalgo",
    "Estado de México",
    "Ciudad de México",
    "Morelos",
    "Nayarit",
    "Nuevo León",
    "Oaxaca",
    "Puebla",
    "Querétaro",
    "Quintana Roo",
    "San Luis Potosí",
    "Sonora",
    "Tabasco",
    "Tlaxcala",
    "Veracruz",
    "Yucatán",
    "Chiapas"]


# channels to analyze
# Each entry is used as a MongoDB collection name by load_state_posts();
# entries that are not among the collections reported by the database are
# silently skipped there.
TARGET_CHANNELS = [
    "elpaismexico",
    "ElUniversalOnline",
    "proceso_unofficial",
    "politicomx",
    "lajornada_unofficial",
    "larazondemexico",
    "sinembargomx",
    "elpaisamerica"]

# define dimensions of poverty
# Maps a dimension label to a comma-separated keyword description (mostly
# Spanish, with a few English terms). PovertyDimensionClassifier embeds each
# description once and compares every post against these embeddings, so the
# text of the descriptions directly drives the classification.
# The keys are emitted verbatim in the printed report and in the 'dimension'
# column of the results, so renaming a key changes the output.
# NOTE(review): key naming is inconsistent (some use spaces, some use
# underscores) — confirm downstream consumers before normalizing.
POVERTY_DIMENSIONS = {
    "INCOME": """
    desempleo, sueldo mínimo, salario bajo, deudas, pobreza, falta de chamba, 
    pérdida de empleo, no hay trabajo, sin chamba, salario miserable, quedarse sin trabajo, 
    sin dinero, poco dinero, ingreso insuficiente, no alcanza, trabajo mal pagado, subempleo, 
    bajos ingresos, sustento familiar, ganar poco, sin ahorros, trabajo informal, 
    trabajo precario, dificultad para pagar 
    """,

    "ACCESS TO HEALTH SERVICES": """
    sin medicinas, hospital lejano, sin seguro, mala atención, enfermedad crónica, 
    rechazado, sin tratamiento, medicinas caras, sin doctores, seguro médico, 
    sistema de salud deficiente, falta de médicos, hospitales saturados, 
    emergencias médicas, clínicas rurales, falta de especialistas, citas médicas, 
    tratamientos costosos, sin acceso a medicamentos, sin posibilidad de tratamiento, 
    colapso hospitalario, saturación médica, falta de camas, desabastecimiento de medicinas,
    atención médica, falta de atención, salud pública, servicio médico malo, 
    no hay doctores, no hay medicina, atención deficiente, centro de salud cerrado, 
    consultorio cerrado, urgencias sin cupo, hospital sin personal, médicos ausentes, 
    morir esperando, negligencia médica, hospital sin insumos, no hay ambulancias
    """,

    "EDUCATIONAL_LAG": """
    sin escuela, analfabetismo, deserción, escuela lejana, sin útiles, ausentismo, 
    sin maestros, rezago escolar, bachillerato incompleto, primaria incompleta, 
    baja escolaridad, educación deficiente, escuelas sin recursos, sin materiales escolares, 
    fracaso escolar, repetir curso, escuelas rurales, transporte escolar, sin computadoras, 
    sin internet, brecha digital, sin educación, alfabetización, estudiantes vulnerables,
    no puede estudiar, abandono escolar, escuela cerrada, maestros faltantes, 
    escuela sin luz, escuela sin agua, escuela insegura, clases suspendidas, 
    educación interrumpida, niños sin clases, jóvenes sin estudio, no hay educación, 
    falta de acceso a la educación, no terminé la escuela, no pude estudiar
    """,

    "ACCESS TO SOCIAL SECURITY": """
    sin contrato, economía informal, sin pensión, sin derechos, sin ahorro, 
    sin prestaciones, desprotección, trabajo ilegal, sin seguro, IMSS, 
    informal job, desprotección social, trabajo sin contrato, empleo informal, 
    sin cotizar, sin jubilación, sistema de pensiones, derechos laborales, 
    protección laboral, trabajo en negro, trabajo sin seguridad social, 
    precariedad laboral, aportes sociales, trabajadores vulnerables
    """,
    
    "HOUSING": """
    vivienda precaria, sin techo, casa insegura, sin baño, techos de lámina, cuartos de cartón, 
    viviendas improvisadas, viviendas informales, vivienda inadecuada, vivienda indigna,
    barrios marginales, asentamientos irregulares, colonias populares, 
    terrenos irregulares, viviendas sin servicios básicos, casas abandonadas, 
    desalojos forzosos, ocupación ilegal, chabolas, tugurios, slum, bad housing
    """,

    "ACCESS TO FOOD": """
    hambre, desnutrición, comida escasa, sin alimentos, comida cara, ayuda alimentaria, 
    dieta pobre, inseguridad alimentaria, canasta básica, acceso a alimentos, nutrición infantil,
    desnutrición crónica, malnutrición, hambruna, bancos de alimentos, comedores sociales,
    comedores populares, comedores comunitarios, programas alimentarios, despensas,
    ayuda alimenticia, costo de alimentos, suministro de alimentos, canasta alimentaria,
    seguridad alimentaria, crisis alimentaria, hambre infantil, falta de comida,
    no hay comida, no alcanza para comer, solo arroz y frijoles, sin cena, sin desayuno,
    comer una vez al día, raciones reducidas, niños con hambre, pasar hambre, 
    no puedo comprar comida, sobrevivir con poco, falta de leche, comida vencida,
    comida donada, filas por comida, buscar comida en la basura, mendigar comida,
    pedir comida, trueque por comida, robar por hambre, sin acceso a comida nutritiva
    """,

    "SOCIAL_COHESION": """
    exclusión social, discriminación, conflicto, desconfianza, marginalización, 
    estigmatización, segregación, grupos vulnerables, minorías, sentido de comunidad, 
    cohesión comunitaria, aislamiento social, marginación, racismo, clasismo, xenofobia, 
    discriminación étnica, discriminación racial, pueblos indígenas, afrodescendientes, 
    migrantes, desplazados, refugiados
    """}


class PovertyDimensionClassifier:
    """Assign a text to the poverty dimension whose description it resembles most.

    On construction the sentence-embedding model is loaded and every
    description in POVERTY_DIMENSIONS is embedded once; classify_text() then
    compares a single text against those stored embeddings.
    """

    def __init__(self):
        # pretrained sentence-embedding model used for both sides of the comparison
        self.model = SentenceTransformer('hiiamsid/sentence_similarity_spanish_es')

        # names and embeddings share the same order, so an index into one
        # is valid for the other
        self.dimension_names = list(POVERTY_DIMENSIONS.keys())
        descriptions = list(POVERTY_DIMENSIONS.values())
        self.dimension_embeddings = self.model.encode(descriptions, convert_to_tensor=True)

    def clean_text(self, text):
        """Return a lower-cased, whitespace-normalized copy of *text*.

        Markup tags become spaces, URLs are removed, and every character
        that is neither a word character nor whitespace becomes a space.
        Anything that is not a string yields "".
        """
        if not isinstance(text, str):
            return ""

        # applied strictly in this order: tags, links, punctuation, whitespace runs
        substitutions = (
            (r'<.*?>', ' '),
            (r'http\S+', ''),
            (r'[^\w\sáéíóúüñÁÉÍÓÚÜÑ]', ' '),
            (r'\s+', ' '),
        )
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return text.strip().lower()

    # classify the text into a poverty dimension using sentence embeddings
    def classify_text(self, text, threshold=0.10):
        """Pick the dimension with the highest cosine similarity to *text*.

        Returns a ``(dimension_name, score)`` pair. The name is ``None`` when
        the best score is below *threshold*; falsy input, or input shorter
        than 10 characters after cleaning, returns ``(None, 0.0)`` without
        running the model.
        """
        if not text:
            return None, 0.0

        normalized = self.clean_text(text)
        if len(normalized) < 10:  # avoid too short texts
            return None, 0.0

        query_embedding = self.model.encode(normalized, convert_to_tensor=True)

        # one similarity value per dimension, in dimension_names order
        similarities = util.cos_sim(query_embedding, self.dimension_embeddings)[0]

        best_idx = torch.argmax(similarities).item()
        best_score = similarities[best_idx].item()

        label = self.dimension_names[best_idx] if best_score >= threshold else None
        return label, best_score

def _build_state_patterns(states):
    """Compile one case-insensitive, whole-word regex per state name.

    When a name is the beginning of a longer name in *states* (for example
    "Baja California" and "Baja California Sur"), the shorter name's pattern
    gets a negative lookahead for the remainder of the longer name, so a
    mention of the longer state is not also counted for the shorter one.
    """
    patterns = {}
    for state in states:
        lowered = state.lower()
        # remainders of longer names that start with this name, e.g. " Sur"
        tails = [
            other[len(state):]
            for other in states
            if other != state and other.lower().startswith(lowered)
        ]
        regex = r'\b' + re.escape(state) + r'\b'
        if tails:
            excluded = '|'.join(re.escape(tail) + r'\b' for tail in tails)
            regex += '(?!' + excluded + ')'
        patterns[state] = re.compile(regex, re.IGNORECASE)
    return patterns


# load only data of interest from MongoDB
def load_state_posts():
    """Fetch posts from the target channels and group them by mentioned state.

    Connection settings are read from the MONGO_IP, MONGO_PORT, MONGO_DB,
    MONGO_USERNAME and MONGO_PASSWORD environment variables. Only channels
    from TARGET_CHANNELS that exist as collections are read. A post is
    attached to every state whose name occurs in its text, so one post can
    appear under several states.

    Returns:
        dict mapping each name in STATES to a DataFrame with the columns
        'text', 'author', 'posting_ts' and 'channel'. States without any
        matching post get an empty DataFrame that still has those columns.
    """
    # NOTE(review): load_dotenv is imported at file level but never called in
    # the visible source, so these variables must already be present in the
    # process environment — confirm how the .env file gets loaded.
    MONGO_IP = os.getenv("MONGO_IP")
    MONGO_PORT = os.getenv("MONGO_PORT")
    MONGO_DB = os.getenv("MONGO_DB")
    MONGO_USERNAME = os.getenv("MONGO_USERNAME")
    MONGO_PASSWORD = os.getenv("MONGO_PASSWORD")

    mongo_client = MongoWrapper(
        db=MONGO_DB,
        user=MONGO_USERNAME,
        password=MONGO_PASSWORD,
        ip=MONGO_IP,
        port=MONGO_PORT)

    all_channels = mongo_client.get_all_collections()
    available_target_channels = [channel for channel in TARGET_CHANNELS if channel in all_channels]

    # fixed column set so that empty results still produce a usable frame
    post_columns = ['text', 'author', 'posting_ts', 'channel']

    # initialize a dictionary to store posts for each state
    state_posts = {state: [] for state in STATES}

    # classify posts by using regex patterns - if a post contains a state name, it will be classified as that state
    state_patterns = _build_state_patterns(STATES)

    for channel in tqdm(available_target_channels, desc="loading channels"):
        posts = mongo_client.get_collection_entries(collection=channel)

        print(f"channel: {channel} - {len(posts)} post found")

        for post in tqdm(posts, desc=f"analysis {channel}", leave=False):
            post_text = post.get('text', '')
            # a stored None / non-string text cannot be searched; nothing to match
            if not isinstance(post_text, str) or not post_text:
                continue

            for state, pattern in state_patterns.items():
                if pattern.search(post_text):
                    state_posts[state].append({
                        'text': post_text,
                        'author': post.get('author', ''),
                        'posting_ts': post.get('posting_ts', ''),
                        'channel': channel})

    return {
        state: pd.DataFrame(state_posts[state], columns=post_columns)
        for state in STATES
    }

# classify posts by dimensions of poverty
def analyze_poverty_dimensions(state_posts):
    """Classify every post of every state and tabulate the dimension shares.

    Args:
        state_posts: dict mapping a state name to a DataFrame whose 'text'
            column holds the post texts. A frame with zero rows is accepted
            (with or without a 'text' column).

    Returns:
        A long-format DataFrame with one row per (state, dimension) pair and
        the columns 'state', 'dimension', 'count', 'percentage' and
        'total_posts'. Posts that match no dimension are counted under
        "OTHER". For a state without posts all counts and percentages are 0.

    A per-state summary is printed as a side effect.
    """
    classifier = PovertyDimensionClassifier()

    # "OTHER" collects posts the classifier could not assign to a dimension
    dimensions = list(POVERTY_DIMENSIONS.keys()) + ["OTHER"]

    results = []

    for state, df in state_posts.items():
        total_posts = len(df)
        print(f"\nanalyzing {state} ({total_posts} posts)...")

        dimension_counts = {dim: 0 for dim in dimensions}

        # an empty frame may have no 'text' column at all, so only touch it
        # when there is something to classify
        texts = df['text'] if total_posts else []

        for text in tqdm(texts, total=total_posts, desc=f"Classifying {state}"):
            dimension, _ = classifier.classify_text(text)
            dimension_counts[dimension or "OTHER"] += 1

        # guard against states with no posts (would otherwise divide by zero)
        dimension_percentages = {
            dim: (count / total_posts) * 100 if total_posts else 0.0
            for dim, count in dimension_counts.items()}

        print(f"\nresults for {state}:")
        print(f"total posts: {total_posts}")
        print("\ndistribution of posts across dimensions of poverty:")

        for dim, count in dimension_counts.items():
            dim_name = dim if dim != "OTHER" else "non-poverty posts"
            pct = dimension_percentages[dim]
            print(f"- {dim_name}: {count} post ({pct:.1f}%)")

        for dim in dimensions:
            results.append({
                'state': state,
                'dimension': dim,
                'count': dimension_counts[dim],
                'percentage': dimension_percentages[dim],
                'total_posts': total_posts})

    results_df = pd.DataFrame(results)
    return results_df

def main():
    """Run the pipeline: load posts, classify them, save and print the summary.

    The long-format result table is written to "tg_results_reduced.csv";
    afterwards a state-by-dimension view of the counts and of the
    percentages (rounded to one decimal) is printed.
    """
    posts_by_state = load_state_posts()
    summary = analyze_poverty_dimensions(posts_by_state)

    summary.to_csv("tg_results_reduced.csv", index=False)

    # wide view: one row per state, one column per dimension
    count_table = summary.pivot(index='state', columns='dimension', values='count')
    print("\ncount of posts per dimension:")
    print(count_table)

    share_table = summary.pivot(index='state', columns='dimension', values='percentage')
    print("\npercentage of posts per dimension:")
    print(share_table.round(1))

# run the whole pipeline only when this file is executed directly,
# not when it is imported as a module
if __name__ == "__main__":
    main()

2025-05-12 09:00:37,671 INFO Connected to thesis database on 206.81.16.39
loading channels:   0%|          | 0/8 [00:00<?, ?it/s]

channel: elpaismexico - 1750 post found


loading channels:  12%|█▎        | 1/8 [00:00<00:04,  1.67it/s]

channel: ElUniversalOnline - 2435 post found


loading channels:  25%|██▌       | 2/8 [00:02<00:06,  1.13s/it]

channel: proceso_unofficial - 3141 post found


loading channels:  38%|███▊      | 3/8 [00:02<00:03,  1.33it/s]

channel: politicomx - 5103 post found


loading channels:  50%|█████     | 4/8 [00:02<00:02,  1.67it/s]

channel: lajornada_unofficial - 18673 post found


loading channels:  62%|██████▎   | 5/8 [00:03<00:02,  1.32it/s]

channel: larazondemexico - 4248 post found


loading channels:  75%|███████▌  | 6/8 [00:04<00:01,  1.20it/s]

channel: sinembargomx - 9525 post found


loading channels:  88%|████████▊ | 7/8 [00:05<00:00,  1.12it/s]

channel: elpaisamerica - 1411 post found


loading channels: 100%|██████████| 8/8 [00:06<00:00,  1.28it/s]



analyzing Guanajuato (219 posts)...


Classifying Guanajuato: 100%|██████████| 219/219 [00:21<00:00, 10.10it/s]



results for Guanajuato:
total posts: 219

distribution of posts across dimensions of poverty:
- INCOME: 91 post (41.6%)
- ACCESS TO HEALTH SERVICES: 3 post (1.4%)
- EDUCATIONAL_LAG: 7 post (3.2%)
- ACCESS TO SOCIAL SECURITY: 60 post (27.4%)
- HOUSING: 50 post (22.8%)
- ACCESS TO FOOD: 1 post (0.5%)
- SOCIAL_COHESION: 1 post (0.5%)
- non-poverty posts: 6 post (2.7%)

analyzing Michoacán (317 posts)...


Classifying Michoacán: 100%|██████████| 317/317 [00:33<00:00,  9.48it/s]



results for Michoacán:
total posts: 317

distribution of posts across dimensions of poverty:
- INCOME: 72 post (22.7%)
- ACCESS TO HEALTH SERVICES: 23 post (7.3%)
- EDUCATIONAL_LAG: 15 post (4.7%)
- ACCESS TO SOCIAL SECURITY: 130 post (41.0%)
- HOUSING: 59 post (18.6%)
- ACCESS TO FOOD: 3 post (0.9%)
- SOCIAL_COHESION: 6 post (1.9%)
- non-poverty posts: 9 post (2.8%)

analyzing Sinaloa (265 posts)...


Classifying Sinaloa: 100%|██████████| 265/265 [00:40<00:00,  6.59it/s]



results for Sinaloa:
total posts: 265

distribution of posts across dimensions of poverty:
- INCOME: 95 post (35.8%)
- ACCESS TO HEALTH SERVICES: 9 post (3.4%)
- EDUCATIONAL_LAG: 11 post (4.2%)
- ACCESS TO SOCIAL SECURITY: 134 post (50.6%)
- HOUSING: 8 post (3.0%)
- ACCESS TO FOOD: 3 post (1.1%)
- SOCIAL_COHESION: 2 post (0.8%)
- non-poverty posts: 3 post (1.1%)

analyzing Chihuahua (230 posts)...


Classifying Chihuahua: 100%|██████████| 230/230 [00:29<00:00,  7.70it/s]



results for Chihuahua:
total posts: 230

distribution of posts across dimensions of poverty:
- INCOME: 48 post (20.9%)
- ACCESS TO HEALTH SERVICES: 3 post (1.3%)
- EDUCATIONAL_LAG: 35 post (15.2%)
- ACCESS TO SOCIAL SECURITY: 98 post (42.6%)
- HOUSING: 27 post (11.7%)
- ACCESS TO FOOD: 5 post (2.2%)
- SOCIAL_COHESION: 4 post (1.7%)
- non-poverty posts: 10 post (4.3%)

analyzing Guerrero (343 posts)...


Classifying Guerrero:  13%|█▎        | 44/343 [00:12<01:24,  3.53it/s]


KeyboardInterrupt: 