In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter

import os
from dotenv import load_dotenv

from yt_client.yt_client import YouTubeClient

from googleapiclient.discovery import build
from time import sleep
from pprint import pprint
import re
import spacy
from collections import Counter
from langdetect import detect
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import json

In [None]:
# Load variables from a local .env file into the process environment, then
# read the YouTube Data API key from it (None when the variable is not set).
load_dotenv()
YT_API_KEY = os.environ.get("YT_API_KEY")

In [None]:
# Mexican states whose YouTube content is scraped; main() processes them in this order.
STATES = [
    "Tamaulipas",
    "Baja California",
    "Zacatecas",
    "Colima",
    "Jalisco",
]

In [None]:
# Publication window passed to the video search, as UTC timestamps
# in the "YYYY-MM-DDTHH:MM:SSZ" form.
PUBLISHED_AFTER, PUBLISHED_BEFORE = (
    "2022-01-01T00:00:00Z",  # start of the window
    "2022-12-31T23:59:59Z",  # end of the window
)

In [None]:
# Make sure the folder that receives the CSV and EDA outputs exists
# (no error if it is already there).
os.makedirs(name="yt_data", exist_ok=True)

In [None]:
# Sentiment classifier: tokenizer and model are loaded from the same checkpoint
# and are later handed to get_bert_sentiment().
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# spaCy pipelines used for lemmatization and stop-word lists (English and Spanish).
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")

In [None]:
# Extra stop words applied on top of spaCy's built-in lists.
custom_stopwords = {
    # Spanish function words and chat fillers
    "que", "yo", "eh", "si", "pa", "x", "xd", "el", "y", "la",
} | {
    # English function words
    "the", "i", "and", "you", "this", "to", "is", "it",
    "of", "in", "on", "for", "me", "my", "do", "at",
}

In [None]:
# Poverty dimensions mapped to the keywords/phrases (Spanish and English) that
# signal them. analyze_state() lowercases each entry and counts whole-word
# matches per comment, so every phrase must appear only ONCE per dimension:
# a repeated entry would count each hit twice. Three such repeats were removed
# ("escuelas en mal estado", "sin beneficios laborales", "comunidades divididas").
poverty_dimensions = {
    "INCOME": [
        "desempleo", "sueldo mínimo", "salario bajo", "inflación", "deudas",
        "crisis", "préstamos", "despidos", "recortes", "quiebra", "pobreza",
        "falta de chamba", "pérdida de empleo", "no hay trabajo", "sin chamba",
        "unemployment", "low wages", "bankrupt", "jobless", "struggling",
        "salario miserable", "jodido", "quedarse sin trabajo",
        "inestabilidad económica", "dificultades financieras", "falta de recursos",
        "sin ingresos suficientes", "incertidumbre laboral", "situación precaria",
        "bajos ingresos", "falta de oportunidades", "dificultad para ahorrar",
        "necesidades básicas insatisfechas", "problemas económicos", "vivir al día",
        "inseguridad laboral", "trabajos temporales", "sin ingreso fijo",
        "economía familiar reducida", "salario insuficiente", "búsqueda de empleo",
        "sin un peso", "estar en la ruina", "no me alcanza",
        "estar quebrado", "ganar una miseria", "no hay chamba",
        "sueldo de hambre", "vivir con lo que cae", "quedé sin trabajo",
        "buscando trabajo"],
    "ACCESS TO HEALTH SERVICES": [
        "sin medicinas", "hospital lejano", "esperas", "sin seguro", "mala atención",
        "enfermedad", "rechazado", "sin tratamiento", "medicinas caras", "sin doctores",
        "no hay medicinas", "no hay doctores", "centro de salud cerrado", "hospital saturado",
        "no hay atención médica", "healthcare crisis", "expensive medicine", "medical neglect",
        "mal servicio médico", "broncas de salud", "te dejan morir",
        "deficiencia en el sistema de salud", "falta de personal médico",
        "servicios médicos inaccesibles", "centros médicos distantes",
        "tratamientos costosos", "falta de atención preventiva",
        "servicios de urgencias insuficientes", "espera excesiva para atención",
        "falta de especialistas", "medicamentos inaccesibles", "atención médica deficiente",
        "personas sin cobertura médica", "salud pública colapsada", "sistemas de salud precarios",
        "personal médico insuficiente", "instalaciones sanitarias inadecuadas",
        "horas esperando", "no hay ni aspirinas", "no hay ni paracetamol",
        "hospitales lejanos", "pura negligencia", "no hay citas",
        "citas médicas muy tardías", "salud pública deficiente",
        "automedicación", "no hay médicos especialistas"],
    "EDUCATIONAL LAG": [
        "sin escuela", "analfabetismo", "deserción", "acceso a la educación",
        "calidad educativa", "recursos didácticos", "infraestructura escolar",
        "escuela lejana", "escuela lejos", "sin útiles", "ausentismo",
        "sin maestros", "rezago escolar", "niños sin clases", "poca educación",
        "school dropout", "no teachers", "poor education", "no school supplies",
        "ni estudian ni trabajan", "escuela en ruinas",
        "falta de acceso a educación", "baja tasa de escolaridad", "educación de baja calidad",
        "jóvenes sin preparación", "falta de oportunidades educativas", "formación académica limitada",
        "problemas de aprendizaje", "abandono escolar temprano", "educación incompleta",
        "infraestructura educativa deficiente", "falta de material didáctico",
        "educación discontinua", "escuelas en mal estado", "escolaridad interrumpida",
        "recursos pedagógicos insuficientes", "desigualdad educativa",
        "no hay profes", "maestros fantasma", "se caen los techos",
        "escuelas sin recursos", "no hay clases", "jóvenes sin estudiar",
        "faltan materiales escolares",
        "escuelas sin agua", "difícil acceso a la escuela"],
    "ACCESS TO SOCIAL SECURITY": [
        "sin contrato", "economía informal", "informal", "sin pensión",
        "sin derechos", "explotación", "sin ahorro", "sin prestaciones",
        "desprotección", "trabajo ilegal", "sin seguro", "sin IMSS",
        "chamba sin contrato", "trabajo mal pagado", "explotado",
        "no benefits", "no retirement", "informal jobs", "unprotected workers",
        "sin aguinaldo", "trabajo en negro",
        "empleo sin protección social", "falta de seguridad laboral", "sin beneficios laborales",
        "carencia de servicios sociales", "trabajadores desprotegidos", "jubilación insuficiente",
        "falta de apoyo institucional", "vulnerabilidad social", "falta de cobertura social",
        "desprotección laboral", "servicios sociales inaccesibles", "sin acceso a ayudas sociales",
        "empleo precario", "trabajadores marginados", "condiciones laborales precarias",
        "empleos de subsistencia", "sin acceso a beneficios sociales",
        "sin papeles", "trabajo por fuera", "trabajo no registrado",
        "te dan de alta con menos", "trabajo informal", "sin jubilación",
        "condiciones abusivas", "sin finiquito",
        "sin protección laboral"],
    "HOUSING": [
        "sin agua", "sin luz", "hacinamiento", "desalojo", "vivienda precaria",
        "sin techo", "goteras", "renta cara", "casa insegura", "sin baño",
        "techos de lámina", "cuartos de cartón", "casas abandonadas", "inundaciones",
        "bad housing", "slum", "no electricity", "unsafe home", "eviction notice",
        "vivir entre ratas", "se les cae la casa",
        "vivienda inadecuada", "condiciones habitacionales deficientes", "falta de servicios básicos",
        "asentamientos irregulares", "déficit habitacional", "falta de acceso a vivienda digna",
        "viviendas en zonas de riesgo", "condiciones insalubres", "viviendas sin servicios esenciales",
        "problemas de habitabilidad", "viviendas improvisadas", "hogares sin infraestructura básica",
        "viviendas en mal estado", "construcciones vulnerables", "viviendas en zonas marginales",
        "imposibilidad de acceder a vivienda", "viviendas no seguras",
        "casa de lámina", "viviendo en la calle", "casas de cartón", "barrio marginal",
        "casa sin drenaje", "colonia sin pavimentar", "renta muy cara",
        "sin agua potable", "se mete el agua cuando llueve", "apagones frecuentes",
        "casa sin ventilación", "techo que gotea"],
    "ACCESS TO FOOD": [
        "hambre", "desnutrición", "comida escasa", "sin alimentos", "comida cara",
        "ayuda alimentaria", "escasez", "comida mala", "dieta pobre", "inseguridad alimentaria",
        "no hay comida", "ni pa' frijoles", "colas para comida", "falta de comida",
        "food insecurity", "starving", "malnutrition", "no food on table",
        "tragando aire", "comer una vez al día",
        "alimentación insuficiente", "nutrición inadecuada", "falta de acceso a alimentos",
        "déficit nutricional", "alimentos de mala calidad", "dieta insuficiente",
        "carencia alimentaria", "pobreza alimentaria", "falta de variedad en la dieta",
        "aumento de precios alimentarios", "falta de alimentos básicos", "crisis alimentaria",
        "problemas de desnutrición", "hambruna", "dependencia de asistencia alimentaria",
        "alimentos inaccesibles", "problemas de alimentación",
        "morirse de hambre", "ni tortillas hay", "comer cada tercer día",
        "no alcanza para comida", "niños con hambre", "sin comida suficiente",
        "no hay ni para huevos", "a puro arroz", "comiendo solo una vez al día",
        "todo muy caro", "precios imposibles"],
    "SOCIAL COHESION": [
        "fragmentación", "polarización", "exclusión", "discriminación", "conflicto",
        "desconfianza", "marginalización", "tensiones", "estigmatización",
        "racismo", "odio de clase", "no hay comunidad", "violencia entre vecinos",
        "division social", "hate speech", "segregation", "marginalized", "resentimiento social",
        "pandillas", "se odian entre barrios",
        "falta de integración social", "ruptura del tejido social", "división comunitaria",
        "falta de solidaridad", "problemas de convivencia", "falta de inclusión",
        "desigualdad social", "sectores sociales aislados", "segregación urbana",
        "comunidades divididas", "aislamiento social", "falta de pertenencia",
        "grupos sociales antagónicos", "conflictos comunitarios", "desintegración social",
        "falta de cohesión", "degradación de relaciones sociales", "fracturas sociales",
        "nadie se habla", "no confiar en nadie",
        "barrios enemistados", "zonas peligrosas", "sociedades cerradas",
        "discriminación por origen", "separación entre ricos y pobres",
        "barrios conflictivos", "falta de unión", "intolerancia social"]}

In [None]:
def clean_comment(text):
    """Normalize a raw comment and return its lemmatized, stop-word-free tokens.

    Steps: drop URLs, drop every character that is not an (accented) letter or
    whitespace, collapse whitespace runs, lowercase, detect the language and
    lemmatize with the matching spaCy pipeline (English if detected as "en",
    Spanish otherwise).

    Args:
        text: Raw comment string.

    Returns:
        List of lemma strings. Empty when nothing is left after cleaning.
    """
    # Remove links and special characters
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ\s]", "", text)
    # Stripping links/symbols leaves runs of spaces behind; collapse them so
    # the pipeline does not turn them into whitespace tokens.
    text = re.sub(r"\s+", " ", text).lower().strip()

    # Nothing to detect or lemmatize (e.g. the comment was only emojis/digits)
    if not text:
        return []

    # Detect language (Spanish as default). Catch Exception rather than using a
    # bare except so KeyboardInterrupt/SystemExit still propagate.
    try:
        lang = detect(text)
    except Exception:
        lang = "es"

    nlp = nlp_en if lang == "en" else nlp_es

    # Remove stop words, punctuation and whitespace tokens, then lemmatize
    tokens = []
    for token in nlp(text):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemma = token.lemma_.strip()
        if lemma and lemma not in custom_stopwords:
            tokens.append(lemma)

    return tokens


def get_video_comments(api_key, video_id, max_comments=400):
    """Fetch up to `max_comments` top-level comments of one YouTube video.

    Pages through `commentThreads().list` (plain-text format) until enough
    comments are collected or there are no more pages. Any error while
    requesting or parsing a page is printed and ends the fetch; the comments
    gathered so far are still returned (best effort).

    Args:
        api_key: YouTube Data API key handed to `build`.
        video_id: ID of the video whose comment threads are listed.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        List of comment texts, never longer than `max_comments`.
    """
    comments = []
    if max_comments <= 0:
        return comments

    youtube = build("youtube", "v3", developerKey=api_key)
    next_page_token = None

    while len(comments) < max_comments:
        remaining = max_comments - len(comments)
        try:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                # Only ask for what is still needed (pages hold at most 100)
                maxResults=min(100, remaining),
                pageToken=next_page_token,
                textFormat="plainText"
            ).execute()

            comments.extend(
                item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                for item in response["items"]
            )

            next_page_token = response.get("nextPageToken")
        except Exception as e:
            print(f"Error in video {video_id}: {e}")
            break

        if not next_page_token:
            break

        # Pause between page requests, but not after the last one we need
        if len(comments) < max_comments:
            sleep(0.5)

    # Guard against a page that returned more items than requested
    return comments[:max_comments]


def get_all_comments(video_ids, api_key, max_comments_per_video=400):
    """Collect comments for several videos.

    Calls get_video_comments() once per entry of `video_ids`, showing a tqdm
    progress bar, and returns a dict mapping each video ID to its comment list.
    """
    progress = tqdm(video_ids, desc="Fetching comments")
    return {
        vid: get_video_comments(api_key, vid, max_comments_per_video)
        for vid in progress
    }


def get_bert_sentiment(text, tokenizer, model):
    """Score the sentiment of `text` on a -1..1 scale.

    The classifier's highest-scoring class index (0-4) is read as a 1-5 star
    rating and mapped linearly: 1 star -> -1.0, 3 stars -> 0.0, 5 stars -> 1.0.

    Args:
        text: A string, or a list/tuple of tokens that is joined with spaces.
        tokenizer: Callable that encodes the text into model inputs.
        model: Callable whose output exposes `.logits`.

    Returns:
        Float in {-1.0, -0.5, 0.0, 0.5, 1.0}; 0.0 for empty or too-short text.
    """
    # Join token lists BEFORE the length check: otherwise len() counts tokens
    # and every one-token list would be discarded as "too short".
    if isinstance(text, (list, tuple)):
        text = " ".join(text)

    # Return 0 if text is empty, whitespace-only or too short
    if not text or len(text.strip()) < 2:
        return 0.0

    # Tokenize and encode the text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Inference only; no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Class index 0..4 -> stars 1..5 -> score -1..1
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    stars = predicted_class + 1
    sentiment_score = (stars - 3) / 2

    return sentiment_score



def get_videos_for_state(state_name, yt_client, published_after, published_before, limit=100):
    """Search videos about a state across all poverty-related topics.

    Runs one keyword search per topic ("<state> <topic>") through
    `yt_client.get_videos_by_keyword` and returns the results without
    repeats, in first-seen order.

    Args:
        state_name: State name prepended to every topic.
        yt_client: Object exposing `get_videos_by_keyword(keyword=...,
            published_after=..., published_before=..., limit=...)`.
        published_after: Lower bound of the publication window.
        published_before: Upper bound of the publication window.
        limit: Maximum number of results requested per search (default 100).

    Returns:
        List of video dicts that carry `video['_id']['videoId']`, unique by
        that ID. Results without that field are dropped.
    """
    # Search topics, grouped by poverty dimension
    topics = (
        # income
        "economía", "trabajo", "empleo",
        # health
        "salud", "hospital", "servicios médicos", "médico",
        # education
        "educación", "escuela", "universidad", "estudiantes",
        # social security
        "seguridad social", "beneficios", "protección social", "seguro social",
        # housing ("habitaciones" was misspelled "habitaciónes")
        "vivienda", "hogar", "habitaciones", "servicios básicos",
        # food
        "alimentación", "comida", "alimentos",
        # social cohesion
        "comunidad", "sociedad", "integración",
    )

    unique_videos = []
    seen_ids = set()

    for topic in topics:
        context_videos = yt_client.get_videos_by_keyword(
            keyword=f"{state_name} {topic}",
            published_after=published_after,
            published_before=published_before,
            limit=limit)

        # Deduplicate by video ID while collecting; tolerate a None result
        for video in context_videos or []:
            if '_id' in video and 'videoId' in video['_id']:
                video_id = video['_id']['videoId']
                if video_id not in seen_ids:
                    seen_ids.add(video_id)
                    unique_videos.append(video)

    return unique_videos


def _state_slug(state_name):
    """Return `state_name` lowercased with spaces replaced by underscores (file-name form)."""
    return state_name.lower().replace(' ', '_')


def _match_dimension_keywords(comments, keywords):
    """Count whole-word keyword hits and collect the comments that contain at least one.

    Keywords are compared case-insensitively and deduplicated first, so a
    phrase listed twice is not counted twice. Returns (total_hits, matching_comments).
    """
    unique_keywords = dict.fromkeys(kw.lower() for kw in keywords)
    # Compile once per dimension instead of once per keyword per comment
    patterns = [re.compile(r'\b' + re.escape(kw) + r'\b') for kw in unique_keywords]

    total_hits = 0
    matching_comments = []
    for comment in comments:
        comment_lower = comment.lower()
        hits = sum(len(pattern.findall(comment_lower)) for pattern in patterns)
        if hits > 0:
            total_hits += hits
            matching_comments.append(comment)
    return total_hits, matching_comments


def _save_dimension_bar_chart(sorted_df, value_col, color, title, ylabel, out_path, zero_line=False):
    """Save a bar chart of `value_col` per dimension (rows plotted in the given order) to `out_path`."""
    fig = plt.figure(figsize=(12, 6))
    try:
        plt.bar(sorted_df['dimension'], sorted_df[value_col], color=color)
        plt.title(title)
        plt.xticks(rotation=45, ha='right')
        plt.ylabel(ylabel)
        if zero_line:
            plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        plt.tight_layout()
        plt.savefig(out_path)
    finally:
        # Always release the figure, even if drawing or saving failed
        plt.close(fig)


def _top_words(comments, stopwords, top_n=10):
    """Return the `top_n` most frequent words (longer than 2 letters, not in `stopwords`) in `comments`."""
    if not comments:
        return []
    text = " ".join(comments).lower()
    words = re.findall(r'\b[a-zA-ZáéíóúüñÁÉÍÓÚÜÑ]+\b', text)
    return Counter(
        word for word in words if word not in stopwords and len(word) > 2
    ).most_common(top_n)


def analyze_state(state_name, published_after, published_before, yt_client, tokenizer, model):
    """Collect YouTube comments for a state and summarize them per poverty dimension.

    Pipeline: search videos (get_videos_for_state), fetch their comments
    (get_all_comments, using the module-level YT_API_KEY), count keyword hits
    for every entry of `poverty_dimensions`, average the BERT sentiment of the
    matching comments, then write the outputs.

    Files written (slug = lowercased state name with underscores):
        yt_data/<slug>_new.csv                      summary table
        yt_data/<slug>_eda/video_ids.txt            one video ID per line
        yt_data/<slug>_eda/dimension_keyword_frequency.png
        yt_data/<slug>_eda/dimension_comment_count.png
        yt_data/<slug>_eda/dimension_sentiment.png
        yt_data/<slug>_eda/dimension_word_freq.txt  top 10 words per dimension

    Args:
        state_name: State to analyze.
        published_after: Lower bound of the video publication window.
        published_before: Upper bound of the video publication window.
        yt_client: Client passed to get_videos_for_state.
        tokenizer: Tokenizer passed to get_bert_sentiment.
        model: Model passed to get_bert_sentiment.

    Returns:
        DataFrame with columns state, dimension, word_count, comments_count,
        avg_sentiment (one row per dimension), or None when no videos or no
        valid video IDs were found.
    """
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Processing state: {state_name}")
    print(banner)

    # Create directory for EDA results
    slug = _state_slug(state_name)
    eda_dir = os.path.join("yt_data", f"{slug}_eda")
    os.makedirs(eda_dir, exist_ok=True)

    # Get videos for the state
    print(f"Fetching videos for {state_name}...")
    keyword_videos = get_videos_for_state(
        state_name=state_name,
        yt_client=yt_client,
        published_after=published_after,
        published_before=published_before)
    print(f"Found {len(keyword_videos)} videos for {state_name}")

    if not keyword_videos:
        print(f"No videos found for {state_name}. Skipping analysis.")
        return None

    # Keep only results that are actual videos and carry an ID
    video_ids = [
        video['_id']['videoId']
        for video in keyword_videos
        if '_id' in video
        and 'kind' in video['_id']
        and video['_id']['kind'] == 'youtube#video'
        and 'videoId' in video['_id']
    ]

    # Save video IDs for later use (written even when the list is empty)
    with open(os.path.join(eda_dir, "video_ids.txt"), "w", encoding="utf-8") as f:
        f.writelines(f"{video_id}\n" for video_id in video_ids)

    if not video_ids:
        print(f"No valid video IDs found for {state_name}. Skipping analysis.")
        return None

    # Get comments for all videos and flatten them into one list
    print(f"Fetching comments for {len(video_ids)} videos...")
    state_comments = get_all_comments(video_ids, YT_API_KEY, max_comments_per_video=400)
    all_comments_raw = [
        comment for comments in state_comments.values() for comment in comments
    ]
    print(f"Total comments collected: {len(all_comments_raw)}")

    records = []
    comments_by_dimension = {}
    # A comment can match several dimensions; score each distinct text once
    sentiment_cache = {}

    print("Analyzing comments for each poverty dimension...")
    for dimension, keywords in tqdm(poverty_dimensions.items(), desc="Dimensions"):
        dimension_word_count, comments_with_dimension = _match_dimension_keywords(
            all_comments_raw, keywords)
        comments_by_dimension[dimension] = comments_with_dimension

        # Compute average sentiment for the comments within the dimension
        sentiment_scores = []
        for comment in comments_with_dimension:
            if comment not in sentiment_cache:
                try:
                    sentiment_cache[comment] = get_bert_sentiment(comment, tokenizer, model)
                except Exception as e:
                    # Best effort: report and leave this comment out of the average
                    print(f"Error analyzing sentiment for dimension {dimension}: {e}")
                    continue
            sentiment_scores.append(sentiment_cache[comment])

        records.append({
            "state": state_name,
            "dimension": dimension,
            "word_count": dimension_word_count,
            "comments_count": len(comments_with_dimension),
            "avg_sentiment": np.mean(sentiment_scores) if sentiment_scores else 0.0,
        })

    # Create a DataFrame with the results (explicit columns keep the schema
    # stable even if there are no dimensions)
    results_df = pd.DataFrame(
        records,
        columns=["state", "dimension", "word_count", "comments_count", "avg_sentiment"])

    # Save the results to CSV
    results_df.to_csv(os.path.join("yt_data", f"{slug}_new.csv"), index=False)

    # Now perform additional EDA
    print("Performing EDA on collected comments...")

    # 1. Keyword Distribution by Poverty Dimension
    _save_dimension_bar_chart(
        results_df.sort_values('word_count', ascending=False),
        'word_count',
        'skyblue',
        f'Keyword Frequency by Poverty Dimension in {state_name}',
        'Keyword Count',
        os.path.join(eda_dir, "dimension_keyword_frequency.png"))

    # 2. Comments Count by Dimension
    _save_dimension_bar_chart(
        results_df.sort_values('comments_count', ascending=False),
        'comments_count',
        'lightgreen',
        f'Number of Comments Mentioning Each Poverty Dimension in {state_name}',
        'Comment Count',
        os.path.join(eda_dir, "dimension_comment_count.png"))

    # 3. Sentiment Analysis by Dimension (red = negative, green = zero or positive)
    by_sentiment = results_df.sort_values('avg_sentiment')
    _save_dimension_bar_chart(
        by_sentiment,
        'avg_sentiment',
        ['red' if s < 0 else 'green' for s in by_sentiment['avg_sentiment']],
        f'Average Sentiment by Poverty Dimension in {state_name}',
        'Sentiment Score (-1 to 1)',
        os.path.join(eda_dir, "dimension_sentiment.png"),
        zero_line=True)

    # 4. Word Frequency Analysis by Dimension
    all_stopwords = (
        set(nlp_es.Defaults.stop_words)
        | set(nlp_en.Defaults.stop_words)
        | set(custom_stopwords)
    )
    dimension_top_words = {
        dimension: _top_words(comments_list, all_stopwords)
        for dimension, comments_list in comments_by_dimension.items()
    }

    # Save word frequency by dimension to a file
    with open(os.path.join(eda_dir, "dimension_word_freq.txt"), "w", encoding="utf-8") as f:
        for dimension, word_freqs in dimension_top_words.items():
            f.write(f"Top 10 words for {dimension}:\n")
            for word, freq in word_freqs:
                f.write(f"  {word}: {freq}\n")
            f.write("\n")

    print(f"EDA completed for {state_name}. Results saved to {eda_dir}")

    return results_df

In [None]:
# Main execution
def main():
    """Run analyze_state() for every entry of STATES and save each result as CSV.

    States for which analyze_state() returns None are skipped. The CSV name
    uses the state name lowercased with spaces replaced by underscores, the
    same form analyze_state() uses for its own outputs. Previously the spaces
    were kept, so multi-word states such as "Baja California" ended up with a
    second, differently named copy of the same file.
    """
    # Initialize the YouTube client
    yt_client = YouTubeClient(api_key=YT_API_KEY)

    # Process each state
    for state in STATES:
        result_df = analyze_state(
            state_name=state,
            published_after=PUBLISHED_AFTER,
            published_before=PUBLISHED_BEFORE,
            yt_client=yt_client,
            tokenizer=tokenizer,
            model=model)

        if result_df is None:
            continue

        # Save individual state results
        state_slug = state.lower().replace(" ", "_")
        output_path = os.path.join("yt_data", f"{state_slug}_new.csv")
        result_df.to_csv(output_path, index=False)
        print(f"Saved results for {state} to {output_path}")

if __name__ == "__main__":
    main()