## newest: search at state level things such as 'jalisco news' 

In [1]:
import pandas as pd
import numpy as np
import os
import re
import json
from datetime import datetime
from googleapiclient.discovery import build
from time import sleep
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

# Load environment variables
load_dotenv()
YT_API_KEY = os.getenv("YT_API_KEY")

# Define states and search terms
STATES_SEARCH_TERMS = {
    "Guanajuato": [
        "Guanajuato noticias", 
        "Guanajuato news", 
        "Guanajuato economía"
    ],
    "Michoacán": [
        "Michoacán noticias", 
        "Michoacán new", 
        "Michoacán economía"
    ],
    "Sinaloa": [
        "Sinaloa noticias", 
        "Sinaloa news", 
        "Sinaloa economía"
    ],
    "Chihuahua": [
        "Chihuahua noticias", 
        "Chihuahua news", 
        "Chihuahua economía"
    ],
    "Guerrero": [
        "Guerrero noticias", 
        "Guerrero news", 
        "Guerrero economía"
    ],
    "Tamaulipas": [
        "Tamaulipas noticias", 
        "Tamaulipas news", 
        "Tamaulipas economía"
    ],
    "Baja California": [
        "Baja California noticias", 
        "Baja California news", 
        "Baja California economía"
    ],
    "Zacatecas": [
        "Zacatecas noticias", 
        "Zacatecas new", 
        "Zacatecas economía"
    ],
    "Colima": [
        "Colima noticias", 
        "Colima news", 
        "Colima economía"
    ],
    "Jalisco": [
        "Jalisco noticias", 
        "Jalisco news", 
        "Jalisco economía"]}

# Neutral keyword-based descriptions for poverty dimensions
POVERTY_DIMENSIONS = {
    "INCOME": "empleo trabajo ingreso dinero salario estabilidad ocupación oportunidades",
    "ACCESS TO HEALTH SERVICES": "salud hospital médico medicina tratamiento atención clínica seguro",
    "EDUCATIONAL LAG": "educación escuela maestro estudiante aprendizaje clases universidad formación",
    "ACCESS TO SOCIAL SECURITY": "seguridad social pensión jubilación contrato derechos prestaciones protección laboral",
    "HOUSING": "vivienda casa habitación servicios básicos infraestructura hogar alquiler agua luz",
    "ACCESS TO FOOD": "alimentación comida nutrición alimentos dieta mercado hambre acceso",
    "SOCIAL COHESION": "comunidad inclusión integración participación convivencia respeto diversidad solidaridad"
}

# limits for scraping
MAX_VIDEOS_PER_SEARCH = 100  
MAX_COMMENTS_PER_VIDEO = 300  
API_SLEEP_TIME = 0.5  

class TextProcessor:
    def __init__(self):
        self.embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
        self.tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        self.model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        self.dimension_names = list(POVERTY_DIMENSIONS.keys())
        self.dimension_embeddings = self.embedder.encode(list(POVERTY_DIMENSIONS.values()), convert_to_tensor=True)

    def clean_text(self, text):
        text = re.sub(r'<.*?>', ' ', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'[^\w\sáéíóúüñÁÉÍÓÚÜÑ]', ' ', text)
        return re.sub(r'\s+', ' ', text).strip().lower()

    def classify_dimension(self, text):
        if not text:
            return None, 0.0
        embedding = self.embedder.encode(text, convert_to_tensor=True)
        cosine_scores = util.cos_sim(embedding, self.dimension_embeddings)[0]
        max_idx = torch.argmax(cosine_scores).item()
        return self.dimension_names[max_idx], cosine_scores[max_idx].item()

    def get_sentiment_score(self, text):
        if not text:
            return 0.0
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        stars = torch.argmax(outputs.logits, dim=1).item() + 1
        return (stars - 3) / 2  # Normalize to [-1, 1]

class YouTubeAnalyzer:
    def __init__(self, api_key):
        self.api_key = api_key
        self.youtube = build("youtube", "v3", developerKey=api_key)
        self.processor = TextProcessor()

    def search_videos(self, query, published_after, published_before, max_results=MAX_VIDEOS_PER_SEARCH):
        """Search for videos using a keyword query."""
        videos = []
        next_page_token = None
        
        try:
            while len(videos) < max_results:
                response = self.youtube.search().list(
                    q=query,
                    part="snippet",
                    maxResults=min(50, max_results - len(videos)),  # YouTube API allows max 50 per request
                    pageToken=next_page_token,
                    type="video",
                    order="relevance",
                    publishedAfter=published_after,
                    publishedBefore=published_before,
                    relevanceLanguage="es"
                ).execute()
                
                for item in response.get("items", []):
                    if item["id"]["kind"] == "youtube#video":
                        videos.append({
                            "id": item["id"]["videoId"],
                            "title": item["snippet"]["title"],
                            "description": item["snippet"].get("description", ""),
                            "published_at": item["snippet"]["publishedAt"]
                        })
                
                next_page_token = response.get("nextPageToken")
                if not next_page_token or len(videos) >= max_results:
                    break
                
                sleep(API_SLEEP_TIME)  # Avoid quota exceeded errors
                
        except Exception as e:
            print(f"Error searching for '{query}': {e}")
        
        print(f"Found {len(videos)} videos for query '{query}'")
        return videos

    def get_video_comments(self, video_id, max_comments=MAX_COMMENTS_PER_VIDEO):
        """Get comments for a specific video."""
        comments = []
        next_page_token = None
        
        try:
            while len(comments) < max_comments:
                response = self.youtube.commentThreads().list(
                    part="snippet",
                    videoId=video_id,
                    maxResults=min(100, max_comments - len(comments)),  # YouTube API allows max 100 per request
                    pageToken=next_page_token
                ).execute()
                
                for item in response.get("items", []):
                    comment_text = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                    comments.append(comment_text)
                
                next_page_token = response.get("nextPageToken")
                if not next_page_token or len(comments) >= max_comments:
                    break
                
                sleep(API_SLEEP_TIME)  # Avoid quota exceeded errors
                
        except Exception as e:
            # Many videos have comments disabled, so we'll just pass silently
            pass
        
        return comments

    def analyze_state_by_keywords(self, state_name, search_terms, date_range):
        """Analyze a state by searching for videos using specified search terms."""
        print(f"\nAnalyzing {state_name}...")
        dimension_stats = {dim: {"sentiment_sum": 0.0, "count": 0} for dim in POVERTY_DIMENSIONS}
        total_videos = 0
        total_comments = 0
        
        # Search for videos with each search term
        for search_term in search_terms:
            print(f"  Searching for '{search_term}'...")
            videos = self.search_videos(
                query=search_term,
                published_after=date_range["published_after"],
                published_before=date_range["published_before"],
                max_results=MAX_VIDEOS_PER_SEARCH
            )
            
            if not videos:
                continue
                
            total_videos += len(videos)
            
            # Process videos
            for video in tqdm(videos, desc=f"Processing videos for '{search_term}'"):
                # Get video comments
                comments = self.get_video_comments(video["id"], MAX_COMMENTS_PER_VIDEO)
                total_comments += len(comments)
                
                # Concatenate title, description and comments for analysis
                all_texts = [video["title"] + ". " + video["description"]] + comments
                
                # Analyze each text
                for text in all_texts:
                    clean = self.processor.clean_text(text)
                    if len(clean) < 10:  # Skip very short texts
                        continue
                        
                    dimension, confidence = self.processor.classify_dimension(clean)
                    if confidence > 0.1:  # Only count if confidence is high enough
                        sentiment = self.processor.get_sentiment_score(clean)
                        dimension_stats[dimension]["sentiment_sum"] += sentiment
                        dimension_stats[dimension]["count"] += 1
        
        print(f"  Analyzed {total_videos} videos and {total_comments} comments for {state_name}")
        return dimension_stats, total_videos, total_comments

def analyze_all_states():
    analyzer = YouTubeAnalyzer(YT_API_KEY)
    date_range = {
        "published_after": "2022-01-01T00:00:00Z",
        "published_before": "2022-12-31T23:59:59Z"
    }
    
    # Create directories for results
    os.makedirs("yt_keyword_sentiment", exist_ok=True)
    
    # Store overall stats for summary
    all_results = []
    
    for state, search_terms in STATES_SEARCH_TERMS.items():
        stats, total_videos, total_comments = analyzer.analyze_state_by_keywords(
            state_name=state,
            search_terms=search_terms,
            date_range=date_range
        )
        
        # Create dataframe for this state
        df = pd.DataFrame([
            {
                "state": state,
                "dimension": dim.replace("_", " ").title(),
                "avg_sentiment": v["sentiment_sum"] / v["count"] if v["count"] else 0,
                "mentions_count": v["count"],
                "videos_analyzed": total_videos,
                "comments_analyzed": total_comments
            }
            for dim, v in stats.items()
        ])
        
        # Save state-specific results
        output_file = f"yt_keyword_sentiment/{state.replace(' ', '_').lower()}.csv"
        df.to_csv(output_file, index=False)
        print(f"Saved results to {output_file}")
        
        # Add to overall results
        all_results.append(df)
    
    # Combine all results into one dataframe
    if all_results:
        all_df = pd.concat(all_results)
        all_df.to_csv("yt_keyword_sentiment/all_states_results.csv", index=False)
        print("Saved combined results to yt_keyword_sentiment/all_states_results.csv")
    

if __name__ == "__main__":
    analyze_all_states()




Analyzing Guanajuato...
  Searching for 'Guanajuato noticias'...
Found 100 videos for query 'Guanajuato noticias'


Processing videos for 'Guanajuato noticias': 100%|██████████| 100/100 [04:10<00:00,  2.51s/it]


  Searching for 'Guanajuato news'...
Found 100 videos for query 'Guanajuato news'


Processing videos for 'Guanajuato news': 100%|██████████| 100/100 [12:31<00:00,  7.51s/it]


  Searching for 'Guanajuato economía'...
Found 100 videos for query 'Guanajuato economía'


Processing videos for 'Guanajuato economía': 100%|██████████| 100/100 [01:33<00:00,  1.06it/s]


  Analyzed 300 videos and 18446 comments for Guanajuato
Saved results to yt_keyword_sentiment/guanajuato.csv

Analyzing Michoacán...
  Searching for 'Michoacán noticias'...
Found 100 videos for query 'Michoacán noticias'


Processing videos for 'Michoacán noticias': 100%|██████████| 100/100 [04:26<00:00,  2.67s/it]


  Searching for 'Michoacán new'...
Found 100 videos for query 'Michoacán new'


Processing videos for 'Michoacán new': 100%|██████████| 100/100 [05:49<00:00,  3.50s/it]


  Searching for 'Michoacán economía'...
Found 100 videos for query 'Michoacán economía'


Processing videos for 'Michoacán economía': 100%|██████████| 100/100 [01:24<00:00,  1.18it/s]


  Analyzed 300 videos and 13810 comments for Michoacán
Saved results to yt_keyword_sentiment/michoacán.csv

Analyzing Sinaloa...
  Searching for 'Sinaloa noticias'...
Found 100 videos for query 'Sinaloa noticias'


Processing videos for 'Sinaloa noticias': 100%|██████████| 100/100 [02:06<00:00,  1.26s/it]


  Searching for 'Sinaloa news'...
Found 100 videos for query 'Sinaloa news'


Processing videos for 'Sinaloa news': 100%|██████████| 100/100 [07:14<00:00,  4.34s/it]


  Searching for 'Sinaloa economía'...
Found 100 videos for query 'Sinaloa economía'


Processing videos for 'Sinaloa economía': 100%|██████████| 100/100 [01:32<00:00,  1.08it/s]


  Analyzed 300 videos and 12499 comments for Sinaloa
Saved results to yt_keyword_sentiment/sinaloa.csv

Analyzing Chihuahua...
  Searching for 'Chihuahua noticias'...
Found 100 videos for query 'Chihuahua noticias'


Processing videos for 'Chihuahua noticias': 100%|██████████| 100/100 [02:14<00:00,  1.35s/it]


  Searching for 'Chihuahua news'...
Found 100 videos for query 'Chihuahua news'


Processing videos for 'Chihuahua news': 100%|██████████| 100/100 [02:57<00:00,  1.77s/it]


  Searching for 'Chihuahua economía'...
Found 100 videos for query 'Chihuahua economía'


Processing videos for 'Chihuahua economía': 100%|██████████| 100/100 [06:01<00:00,  3.62s/it]


  Analyzed 300 videos and 10891 comments for Chihuahua
Saved results to yt_keyword_sentiment/chihuahua.csv

Analyzing Guerrero...
  Searching for 'Guerrero noticias'...
Found 100 videos for query 'Guerrero noticias'


Processing videos for 'Guerrero noticias': 100%|██████████| 100/100 [03:07<00:00,  1.87s/it]


  Searching for 'Guerrero news'...
Found 100 videos for query 'Guerrero news'


Processing videos for 'Guerrero news': 100%|██████████| 100/100 [05:14<00:00,  3.14s/it]


  Searching for 'Guerrero economía'...
Found 100 videos for query 'Guerrero economía'


Processing videos for 'Guerrero economía': 100%|██████████| 100/100 [04:10<00:00,  2.50s/it]


  Analyzed 300 videos and 12278 comments for Guerrero
Saved results to yt_keyword_sentiment/guerrero.csv

Analyzing Tamaulipas...
  Searching for 'Tamaulipas noticias'...
Found 100 videos for query 'Tamaulipas noticias'


Processing videos for 'Tamaulipas noticias': 100%|██████████| 100/100 [03:23<00:00,  2.04s/it]


  Searching for 'Tamaulipas news'...
Found 100 videos for query 'Tamaulipas news'


Processing videos for 'Tamaulipas news':  33%|███▎      | 33/100 [01:43<03:29,  3.12s/it]


KeyboardInterrupt: 

## 1) sentiment conditional on the dimension - define dimensions just in a descrptive/neutral way 
Define the 7 dimensions of poverty using neutral words - for instance for income use words such as employment, work, income, money, salary, financial stability, opportunities - and then do the word embedding for them to find comments that talk about the specific dimension. Once all comments are categorized into the corresponding dimension, compute the average sentiment per dimension. 

In this way we avoid any bias as we are just categorizing by dimension of poverty and then computing the sentiment score. 

In [None]:
import pandas as pd
import numpy as np
import os
import re
import json
from datetime import datetime
from googleapiclient.discovery import build
from time import sleep
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load environment variables
load_dotenv()
YT_API_KEY = os.getenv("YT_API_KEY")

# Define states and channel names to search for
STATES_CHANNELS_NAMES = {
    "Guanajuato": [
        {"name": "TV4 Guanajuato"},
        {"name": "Periódico Correo"},
        {"name": "Gobierno de Guanajuato"}
    ],
    "Michoacán": [
        {"name": "CB Televisión"},
        {"name": "Noticias Michoacán"},
        {"name": "Gobierno de Michoacán"}
    ],
    "Sinaloa": [
        {"name": "Noticiero Altavoz"},
        {"name": "TVP Culiacán"},
        {"name": "Gobierno de Sinaloa"}
    ],
    "Chihuahua": [
        {"name": "Canal 28 Chihuahua"},
        {"name": "Noticias de Chihuahua"},
        {"name": "Gobierno de Chihuahua"}
    ],
    "Guerrero": [
        {"name": "Noticiero Acapulco"},
        {"name": "Televisa Acapulco"},
        {"name": "Gobierno de Guerrero"}
    ],
    "Tamaulipas": [
        {"name": "Noticias Tamaulipas"},
        {"name": "Televisa Tamaulipas"},
        {"name": "Gobierno de Tamaulipas"}
    ],
    "Baja California": [
        {"name": "Síntesis TV"},
        {"name": "PSN Televisión"},
        {"name": "Gobierno de Baja California"}
    ],
    "Zacatecas": [
        {"name": "NTR Zacatecas"},
        {"name": "Zacatecas Online"},
        {"name": "Gobierno de Zacatecas"}
    ],
    "Colima": [
        {"name": "AF Medios"},
        {"name": "Colima Noticias"},
        {"name": "Gobierno de Colima"}
    ],
    "Jalisco": [
        {"name": "Canal 44"},
        {"name": "Televisa Guadalajara"},
        {"name": "Gobierno de Jalisco"}]}

# Neutral keyword-based descriptions for poverty dimensions
POVERTY_DIMENSIONS = {
    "INCOME": "empleo trabajo ingreso dinero salario estabilidad ocupación oportunidades",
    "ACCESS TO HEALTH SERVICES": "salud hospital médico medicina tratamiento atención clínica seguro",
    "EDUCATIONAL LAG": "educación escuela maestro estudiante aprendizaje clases universidad formación",
    "ACCESS TO SOCIAL SECURITY": "seguridad social pensión jubilación contrato derechos prestaciones protección laboral",
    "HOUSING": "vivienda casa habitación servicios básicos infraestructura hogar alquiler agua luz",
    "ACCESS TO FOOD": "alimentación comida nutrición alimentos dieta mercado hambre acceso",
    "SOCIAL COHESION": "comunidad inclusión integración participación convivencia respeto diversidad solidaridad"
}

class TextProcessor:
    def __init__(self):
        self.embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
        self.tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        self.model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
        self.dimension_names = list(POVERTY_DIMENSIONS.keys())
        self.dimension_embeddings = self.embedder.encode(list(POVERTY_DIMENSIONS.values()), convert_to_tensor=True)

    def clean_text(self, text):
        text = re.sub(r'<.*?>', ' ', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'[^\w\sáéíóúüñÁÉÍÓÚÜÑ]', ' ', text)
        return re.sub(r'\s+', ' ', text).strip().lower()

    def classify_dimension(self, text):
        if not text:
            return None, 0.0
        embedding = self.embedder.encode(text, convert_to_tensor=True)
        cosine_scores = util.cos_sim(embedding, self.dimension_embeddings)[0]
        max_idx = torch.argmax(cosine_scores).item()
        return self.dimension_names[max_idx], cosine_scores[max_idx].item()

    def get_sentiment_score(self, text):
        if not text:
            return 0.0
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        stars = torch.argmax(outputs.logits, dim=1).item() + 1
        return (stars - 3) / 2  # Normalize to [-1, 1]

class YouTubeAnalyzer:
    def __init__(self, api_key):
        self.api_key = api_key
        self.youtube = build("youtube", "v3", developerKey=api_key)
        self.processor = TextProcessor()

    def get_channel_id_by_name(self, name, state):
        query = f"{name} {state}"
        response = self.youtube.search().list(q=query, part="id", maxResults=1, type="channel").execute()
        if response['items']:
            return response['items'][0]['id']['channelId']
        return None

    def get_channel_videos(self, channel_id, published_after, published_before):
        videos = []
        uploads_id = self.youtube.channels().list(part="contentDetails", id=channel_id).execute()['items'][0]['contentDetails']['relatedPlaylists']['uploads']
        next_page_token = None
        while True:
            response = self.youtube.playlistItems().list(
                playlistId=uploads_id, part="snippet", maxResults=50, pageToken=next_page_token
            ).execute()
            for item in response['items']:
                published = item['snippet']['publishedAt']
                if published_after <= published <= published_before:
                    videos.append({
                        "id": item['snippet']['resourceId']['videoId'],
                        "title": item['snippet']['title'],
                        "description": item['snippet'].get('description', '')
                    })
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
            sleep(0.5)
        return videos

    def get_video_comments(self, video_id):
        comments = []
        next_page_token = None
        while True:
            try:
                response = self.youtube.commentThreads().list(
                    part="snippet", videoId=video_id, maxResults=100, pageToken=next_page_token
                ).execute()
                for item in response.get("items", []):
                    comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
                next_page_token = response.get("nextPageToken")
                if not next_page_token:
                    break
                sleep(0.5)
            except Exception:
                break
        return comments

    def analyze_state(self, state_name, channel_infos, date_range):
        print(f"Analyzing {state_name}...")
        dimension_stats = {dim: {"sentiment_sum": 0.0, "count": 0} for dim in POVERTY_DIMENSIONS}
        for channel in channel_infos:
            channel_id = self.get_channel_id_by_name(channel["name"], state_name)
            if not channel_id:
                continue
            videos = self.get_channel_videos(channel_id, date_range["published_after"], date_range["published_before"])
            for video in videos:
                all_texts = [video["title"] + ". " + video["description"]] + self.get_video_comments(video["id"])
                for text in all_texts:
                    clean = self.processor.clean_text(text)
                    dimension, confidence = self.processor.classify_dimension(clean)
                    if confidence > 0.1:
                        sentiment = self.processor.get_sentiment_score(clean)
                        dimension_stats[dimension]["sentiment_sum"] += sentiment
                        dimension_stats[dimension]["count"] += 1
        return dimension_stats

def analyze_all_states():
    analyzer = YouTubeAnalyzer(YT_API_KEY)
    date_range = {
        "published_after": "2022-01-01T00:00:00Z",
        "published_before": "2022-12-31T23:59:59Z"
    }
    os.makedirs("yt_channels_sentiment", exist_ok=True)
    for state, channels in STATES_CHANNELS_NAMES.items():
        stats = analyzer.analyze_state(state, channels, date_range)
        df = pd.DataFrame([
            {
                "dimension": dim.replace("_", " ").title(),
                "avg_sentiment": v["sentiment_sum"] / v["count"] if v["count"] else 0
            }
            for dim, v in stats.items()
        ])
        df.to_csv(f"yt_channels_sentiment/{state.replace(' ', '_').lower()}.csv", index=False)
        print(f"Saved yt_channels_sentiment/{state.replace(' ', '_').lower()}.csv")

if __name__ == "__main__":
    analyze_all_states()



Analyzing Tamaulipas...
Saved yt_channels_sentiment/tamaulipas.csv
Analyzing Baja California...
Saved yt_channels_sentiment/baja_california.csv
Analyzing Zacatecas...


KeyboardInterrupt: 

## 2) only count of comments related to each dimension of poverty to avoid any bias 

Here we are categorizing by negative attributes related to each poverty dimension - for instance for income we are now using unemployment, economic crisis, low salary, unstable jobs - and then we are just counting the occurences of 'negative' words per dimension.

We avoid the bias since we don't do the sentiment analysis - which would lean towards negative scores as we are filtering for negative things in the first place - but we just count how much each dimension of poverty is discussed. We could assume that, the more a dimension of poverty is discussed, the higher that 'type' of poverty is. 


In [None]:
import pandas as pd
import os
import re
from time import sleep
from dotenv import load_dotenv
from googleapiclient.discovery import build
from sentence_transformers import SentenceTransformer, util
import torch

load_dotenv()
YT_API_KEY = os.getenv("YT_API_KEY")

# Expanded poverty dimensions
POVERTY_DIMENSIONS = {
    "INCOME": "Desempleo, salario bajo, crisis económica, sin ingresos suficientes, trabajos temporales, vivir al día, situación precaria, inflación, deuda, sueldo de hambre, precariedad laboral, no alcanza, buscar trabajo, sin chamba.",
    "ACCESS TO HEALTH SERVICES": "Sin medicinas, hospital lejano, largas esperas, sin seguro médico, mala atención, falta de doctores, centros de salud cerrados, salud pública colapsada, servicios de urgencia deficientes, tratamientos caros, automedicación.",
    "EDUCATIONAL LAG": "Rezago escolar, analfabetismo, sin maestros, abandono escolar, escuelas en mal estado, falta de útiles, deserción, educación de baja calidad, falta de acceso educativo, desigualdad educativa, jóvenes sin estudiar.",
    "ACCESS TO SOCIAL SECURITY": "Trabajo informal, sin contrato, sin prestaciones, sin IMSS, falta de protección laboral, empleo sin derechos, sin jubilación, condiciones precarias, trabajadores explotados, empleo sin seguridad social.",
    "HOUSING": "Vivienda precaria, sin agua o luz, hacinamiento, casa insegura, techos de lámina, casas de cartón, renta cara, falta de drenaje, zonas de riesgo, sin baño, construcciones vulnerables, viviendas abandonadas.",
    "ACCESS TO FOOD": "Inseguridad alimentaria, hambre, comida escasa, sin alimentos básicos, malnutrición, dieta pobre, precios altos, ni para frijoles, dependencia alimentaria, comer una vez al día, alimentos inaccesibles.",
    "SOCIAL COHESION": "Fragmentación social, discriminación, exclusión, desigualdad, tensiones comunitarias, racismo, violencia entre barrios, marginación, falta de integración, odio de clase, polarización social."
}

STATES_CHANNELS_NAMES = {
    "Guanajuato": [
        {"name": "TV4 Guanajuato"},
        {"name": "Periódico Correo"},
        {"name": "Gobierno de Guanajuato"}
    ],
    "Michoacán": [
        {"name": "CB Televisión"},
        {"name": "Noticias Michoacán"},
        {"name": "Gobierno de Michoacán"}
    ],
    "Sinaloa": [
        {"name": "Noticiero Altavoz"},
        {"name": "TVP Culiacán"},
        {"name": "Gobierno de Sinaloa"}
    ],
    "Chihuahua": [
        {"name": "Canal 28 Chihuahua"},
        {"name": "Noticias de Chihuahua"},
        {"name": "Gobierno de Chihuahua"}
    ],
    "Guerrero": [
        {"name": "Noticiero Acapulco"},
        {"name": "Televisa Acapulco"},
        {"name": "Gobierno de Guerrero"}
    ],
    "Tamaulipas": [
        {"name": "Noticias Tamaulipas"},
        {"name": "Televisa Tamaulipas"},
        {"name": "Gobierno de Tamaulipas"}
    ],
    "Baja California": [
        {"name": "Síntesis TV"},
        {"name": "PSN Televisión"},
        {"name": "Gobierno de Baja California"}
    ],
    "Zacatecas": [
        {"name": "NTR Zacatecas"},
        {"name": "Zacatecas Online"},
        {"name": "Gobierno de Zacatecas"}
    ],
    "Colima": [
        {"name": "AF Medios"},
        {"name": "Colima Noticias"},
        {"name": "Gobierno de Colima"}
    ],
    "Jalisco": [
        {"name": "Canal 44"},
        {"name": "Televisa Guadalajara"},
        {"name": "Gobierno de Jalisco"}]}

class TextProcessor:
    def __init__(self):
        self.embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
        self.dimensions = list(POVERTY_DIMENSIONS.keys())
        self.embeddings = self.embedder.encode(list(POVERTY_DIMENSIONS.values()), convert_to_tensor=True)

    def clean(self, text):
        text = re.sub(r"<.*?>", " ", text)
        text = re.sub(r"http\\S+", "", text)
        text = re.sub(r"[^\w\sáéíóúüñÁÉÍÓÚÜÑ]", " ", text)
        return re.sub(r"\s+", " ", text).strip().lower()

    def classify(self, text):
        if not text:
            return None, 0.0
        emb = self.embedder.encode(text, convert_to_tensor=True)
        scores = util.cos_sim(emb, self.embeddings)[0]
        best_idx = torch.argmax(scores).item()
        return self.dimensions[best_idx], scores[best_idx].item()

class YouTubeAnalyzer:
    def __init__(self, key):
        self.youtube = build("youtube", "v3", developerKey=key)
        self.processor = TextProcessor()

    def get_channel_id(self, name, state):
        q = f"{name} {state}"
        res = self.youtube.search().list(q=q, part="id", maxResults=1, type="channel").execute()
        return res["items"][0]["id"]["channelId"] if res["items"] else None

    def get_videos(self, channel_id, after, before):
        vids = []
        uploads_id = self.youtube.channels().list(part="contentDetails", id=channel_id).execute()["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
        token = None
        while True:
            res = self.youtube.playlistItems().list(playlistId=uploads_id, part="snippet", maxResults=50, pageToken=token).execute()
            for item in res["items"]:
                pub = item["snippet"]["publishedAt"]
                if after <= pub <= before:
                    vids.append({
                        "id": item["snippet"]["resourceId"]["videoId"],
                        "title": item["snippet"]["title"],
                        "description": item["snippet"].get("description", "")
                    })
            token = res.get("nextPageToken")
            if not token:
                break
            sleep(0.5)
        return vids

    def get_comments(self, video_id):
        coms = []
        token = None
        while True:
            try:
                res = self.youtube.commentThreads().list(videoId=video_id, part="snippet", maxResults=100, pageToken=token).execute()
                for item in res.get("items", []):
                    coms.append(item["snippet"]["topLevelComment"]["snippet"]["textDisplay"])
                token = res.get("nextPageToken")
                if not token:
                    break
                sleep(0.5)
            except Exception:
                break
        return coms

    def analyze(self, state, channels, drange):
        print(f"\nAnalyzing {state}...")
        counts = {d: 0 for d in POVERTY_DIMENSIONS}
        for ch in channels:
            cid = self.get_channel_id(ch["name"], state)
            if not cid:
                continue
            videos = self.get_videos(cid, drange["after"], drange["before"])
            for v in videos:
                texts = [v["title"] + ". " + v["description"]] + self.get_comments(v["id"])
                for t in texts:
                    dim, conf = self.processor.classify(self.processor.clean(t))
                    if conf > 0.1:
                        counts[dim] += 1
        return counts

def run_analysis():
    yt = YouTubeAnalyzer(YT_API_KEY)
    drange = {"after": "2022-01-01T00:00:00Z", "before": "2022-12-31T23:59:59Z"}
    os.makedirs("yt_channels", exist_ok=True)
    for state, chs in STATES_CHANNELS_NAMES.items():
        results = yt.analyze(state, chs, drange)
        df = pd.DataFrame([{"dimension": k, "comment_count": v} for k, v in results.items()])
        df.to_csv(f"yt_channels/{state.lower().replace(' ', '_')}.csv", index=False)
        print(f"Saved yt_channels/{state.lower().replace(' ', '_')}.csv")

if __name__ == "__main__":
    run_analysis()

## 3) standard approach of filtering for negative words + sentiment score + counts of words 
This could potentially lead to bias results, although I compared results of this approach with results from approach 1 and they are more or less aligned. 

In [None]:
import pandas as pd
import numpy as np
import os
import re
import json
from datetime import datetime
from googleapiclient.discovery import build
from time import sleep
from collections import Counter
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
import torch

# Load environment variables
load_dotenv()
YT_API_KEY = os.getenv("YT_API_KEY")

# Define states and channel names to search for
STATES_CHANNELS_NAMES = {
    "Guanajuato": [
        {"name": "TV4 Guanajuato"},
        {"name": "Periódico Correo"},
        {"name": "Gobierno de Guanajuato"}
    ],
    "Michoacán": [
        {"name": "CB Televisión"},
        {"name": "Noticias Michoacán"},
        {"name": "Gobierno de Michoacán"}
    ],
    "Sinaloa": [
        {"name": "Noticiero Altavoz"},
        {"name": "TVP Culiacán"},
        {"name": "Gobierno de Sinaloa"}
    ],
    "Chihuahua": [
        {"name": "Canal 28 Chihuahua"},
        {"name": "Noticias de Chihuahua"},
        {"name": "Gobierno de Chihuahua"}
    ],
    "Guerrero": [
        {"name": "Noticiero Acapulco"},
        {"name": "Televisa Acapulco"},
        {"name": "Gobierno de Guerrero"}
    ],
    "Tamaulipas": [
        {"name": "Noticias Tamaulipas"},
        {"name": "Televisa Tamaulipas"},
        {"name": "Gobierno de Tamaulipas"}
    ],
    "Baja California": [
        {"name": "Síntesis TV"},
        {"name": "PSN Televisión"},
        {"name": "Gobierno de Baja California"}
    ],
    "Zacatecas": [
        {"name": "NTR Zacatecas"},
        {"name": "Zacatecas Online"},
        {"name": "Gobierno de Zacatecas"}
    ],
    "Colima": [
        {"name": "AF Medios"},
        {"name": "Colima Noticias"},
        {"name": "Gobierno de Colima"}
    ],
    "Jalisco": [
        {"name": "Canal 44"},
        {"name": "Televisa Guadalajara"},
        {"name": "Gobierno de Jalisco"}]}

# Spanish descriptions of poverty dimensions
POVERTY_DIMENSIONS = {
    "INGRESOS": "Empleo, salarios, estabilidad financiera, desigualdad de ingresos, oportunidades económicas, seguridad laboral.",
    "ACCESO A SALUD": "Acceso a servicios de salud, calidad médica, medicamentos, seguros, infraestructura hospitalaria.",
    "REZAGO EDUCATIVO": "Acceso a la educación, calidad escolar, alfabetización, abandono escolar, recursos educativos.",
    "SEGURIDAD SOCIAL": "Derechos laborales, protección social, empleo informal, prestaciones, jubilación, seguridad en el empleo.",
    "VIVIENDA": "Calidad de vivienda, acceso a servicios, hacinamiento, asequibilidad, condiciones, desalojos, instalaciones.",
    "ALIMENTACIÓN": "Seguridad alimentaria, hambre, asequibilidad de alimentos, calidad alimentaria, malnutrición, disponibilidad, asistencia.",
    "COHESIÓN SOCIAL": "Integración social, exclusión, marginación, discriminación, confianza, apoyo comunitario."
}

SPANISH_STOPWORDS = ["de", "la", "que", "el", "en"]

class TextProcessor:
    def __init__(self):
        self.embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
        self.dimension_names = list(POVERTY_DIMENSIONS.keys())
        self.dimension_embeddings = self.embedder.encode(list(POVERTY_DIMENSIONS.values()), convert_to_tensor=True)
        self.sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained(self.sentiment_model_name)
        self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(self.sentiment_model_name)

    def clean_text(self, text):
        text = re.sub(r'<.*?>', ' ', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'[^\w\sáéíóúüñÁÉÍÓÚÜÑ]', ' ', text)
        return re.sub(r'\s+', ' ', text).strip().lower()

    def get_sentiment_score(self, text):
        if not text:
            return 0.0
        inputs = self.sentiment_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.sentiment_model(**inputs)
        stars = torch.argmax(outputs.logits, dim=1).item() + 1
        return (stars - 3) / 2

    def classify_dimension(self, text):
        if not text:
            return None, 0.0
        embedding = self.embedder.encode(text, convert_to_tensor=True)
        cosine_scores = util.cos_sim(embedding, self.dimension_embeddings)[0]
        max_idx = torch.argmax(cosine_scores).item()
        return self.dimension_names[max_idx], cosine_scores[max_idx].item()

class YouTubeAnalyzer:
    def __init__(self, api_key):
        self.api_key = api_key
        self.youtube = build("youtube", "v3", developerKey=api_key)
        self.processor = TextProcessor()

    def get_channel_id_by_name(self, name, state):
        query = f"{name} {state}"
        response = self.youtube.search().list(q=query, part="id", maxResults=1, type="channel").execute()
        if response['items']:
            return response['items'][0]['id']['channelId']
        return None

    def get_channel_videos(self, channel_id, published_after, published_before):
        videos = []
        uploads_id = self.youtube.channels().list(part="contentDetails", id=channel_id).execute()['items'][0]['contentDetails']['relatedPlaylists']['uploads']
        next_page_token = None
        while True:
            response = self.youtube.playlistItems().list(
                playlistId=uploads_id, part="snippet", maxResults=50, pageToken=next_page_token
            ).execute()
            for item in response['items']:
                published = item['snippet']['publishedAt']
                if published_after <= published <= published_before:
                    videos.append({
                        "id": item['snippet']['resourceId']['videoId'],
                        "title": item['snippet']['title'],
                        "description": item['snippet'].get('description', '')
                    })
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
            sleep(0.5)
        return videos

    def get_video_comments(self, video_id):
        comments = []
        next_page_token = None
        while True:
            try:
                response = self.youtube.commentThreads().list(
                    part="snippet", videoId=video_id, maxResults=100, pageToken=next_page_token
                ).execute()
                for item in response.get("items", []):
                    comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
                next_page_token = response.get("nextPageToken")
                if not next_page_token:
                    break
                sleep(0.5)
            except Exception:
                break
        return comments

    def analyze_state(self, state_name, channel_infos, date_range):
        print(f"Analyzing {state_name}...")
        dimension_stats = {dim: {"count": 0, "sentiment_sum": 0.0} for dim in POVERTY_DIMENSIONS}
        for channel in channel_infos:
            channel_id = self.get_channel_id_by_name(channel["name"], state_name)
            if not channel_id:
                continue
            videos = self.get_channel_videos(channel_id, date_range["published_after"], date_range["published_before"])
            for video in videos:
                all_texts = [video["title"] + ". " + video["description"]] + self.get_video_comments(video["id"])
                for text in all_texts:
                    clean = self.processor.clean_text(text)
                    sentiment = self.processor.get_sentiment_score(clean)
                    dimension, confidence = self.processor.classify_dimension(clean)
                    if confidence > 0.1:
                        dimension_stats[dimension]["count"] += 1
                        dimension_stats[dimension]["sentiment_sum"] += sentiment
        return dimension_stats


def analyze_all_states():
    analyzer = YouTubeAnalyzer(YT_API_KEY)
    date_range = {
        "published_after": "2022-01-01T00:00:00Z",
        "published_before": "2022-12-31T23:59:59Z"
    }
    os.makedirs("yt_channels", exist_ok=True)
    for state, channels in STATES_CHANNELS_NAMES.items():
        stats = analyzer.analyze_state(state, channels, date_range)
        df = pd.DataFrame([{ "dimension": dim, "comment_count": v["count"], "avg_sentiment": v["sentiment_sum"]/v["count"] if v["count"] else 0 } for dim, v in stats.items()])
        df.to_csv(f"yt_channels/{state.replace(' ', '_').lower()}.csv", index=False)
        print(f"Saved yt_channels/{state.replace(' ', '_').lower()}.csv")

if __name__ == "__main__":
    analyze_all_states()

## 4) same as above but with just more words to define the embedding 
Technically the embedding, since takes the context, should be able to generalize and so shouldn't be necessary to give too many words. But still this might improve the generalization. 

In [None]:
import pandas as pd
import numpy as np
import os
import re
import json
from datetime import datetime
from googleapiclient.discovery import build
from time import sleep
from collections import Counter
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
import torch

# Load environment variables
load_dotenv()
YT_API_KEY = os.getenv("YT_API_KEY")

# Define states and channel names to search for
STATES_CHANNELS_NAMES = {
    "Guanajuato": [
        {"name": "TV4 Guanajuato"},
        {"name": "Periódico Correo"},
        {"name": "Gobierno de Guanajuato"}
    ],
    "Michoacán": [
        {"name": "CB Televisión"},
        {"name": "Noticias Michoacán"},
        {"name": "Gobierno de Michoacán"}
    ],
    "Sinaloa": [
        {"name": "Noticiero Altavoz"},
        {"name": "TVP Culiacán"},
        {"name": "Gobierno de Sinaloa"}
    ],
    "Chihuahua": [
        {"name": "Canal 28 Chihuahua"},
        {"name": "Noticias de Chihuahua"},
        {"name": "Gobierno de Chihuahua"}
    ],
    "Guerrero": [
        {"name": "Noticiero Acapulco"},
        {"name": "Televisa Acapulco"},
        {"name": "Gobierno de Guerrero"}
    ],
    "Tamaulipas": [
        {"name": "Noticias Tamaulipas"},
        {"name": "Televisa Tamaulipas"},
        {"name": "Gobierno de Tamaulipas"}
    ],
    "Baja California": [
        {"name": "Síntesis TV"},
        {"name": "PSN Televisión"},
        {"name": "Gobierno de Baja California"}
    ],
    "Zacatecas": [
        {"name": "NTR Zacatecas"},
        {"name": "Zacatecas Online"},
        {"name": "Gobierno de Zacatecas"}
    ],
    "Colima": [
        {"name": "AF Medios"},
        {"name": "Colima Noticias"},
        {"name": "Gobierno de Colima"}
    ],
    "Jalisco": [
        {"name": "Canal 44"},
        {"name": "Televisa Guadalajara"},
        {"name": "Gobierno de Jalisco"}]}

# Expanded poverty dimensions 
POVERTY_DIMENSIONS = {
    "INCOME": "Desempleo, salario bajo, crisis económica, sin ingresos suficientes, trabajos temporales, vivir al día, situación precaria, inflación, deuda, sueldo de hambre, precariedad laboral, no alcanza, buscar trabajo, sin chamba.",
    "ACCESS TO HEALTH SERVICES": "Sin medicinas, hospital lejano, largas esperas, sin seguro médico, mala atención, falta de doctores, centros de salud cerrados, salud pública colapsada, servicios de urgencia deficientes, tratamientos caros, automedicación.",
    "EDUCATIONAL LAG": "Rezago escolar, analfabetismo, sin maestros, abandono escolar, escuelas en mal estado, falta de útiles, deserción, educación de baja calidad, falta de acceso educativo, desigualdad educativa, jóvenes sin estudiar.",
    "ACCESS TO SOCIAL SECURITY": "Trabajo informal, sin contrato, sin prestaciones, sin IMSS, falta de protección laboral, empleo sin derechos, sin jubilación, condiciones precarias, trabajadores explotados, empleo sin seguridad social.",
    "HOUSING": "Vivienda precaria, sin agua o luz, hacinamiento, casa insegura, techos de lámina, casas de cartón, renta cara, falta de drenaje, zonas de riesgo, sin baño, construcciones vulnerables, viviendas abandonadas.",
    "ACCESS TO FOOD": "Inseguridad alimentaria, hambre, comida escasa, sin alimentos básicos, malnutrición, dieta pobre, precios altos, ni para frijoles, dependencia alimentaria, comer una vez al día, alimentos inaccesibles.",
    "SOCIAL COHESION": "Fragmentación social, discriminación, exclusión, desigualdad, tensiones comunitarias, racismo, violencia entre barrios, marginación, falta de integración, odio de clase, polarización social."
}

SPANISH_STOPWORDS = ["de", "la", "que", "el", "en"]

class TextProcessor:
    def __init__(self):
        self.embedder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
        self.dimension_names = list(POVERTY_DIMENSIONS.keys())
        self.dimension_embeddings = self.embedder.encode(list(POVERTY_DIMENSIONS.values()), convert_to_tensor=True)
        self.sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained(self.sentiment_model_name)
        self.sentiment_model = AutoModelForSequenceClassification.from_pretrained(self.sentiment_model_name)

    def clean_text(self, text):
        text = re.sub(r'<.*?>', ' ', text)
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'[^\w\sáéíóúüñÁÉÍÓÚÜÑ]', ' ', text)
        return re.sub(r'\s+', ' ', text).strip().lower()

    def get_sentiment_score(self, text):
        if not text:
            return 0.0
        inputs = self.sentiment_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.sentiment_model(**inputs)
        stars = torch.argmax(outputs.logits, dim=1).item() + 1
        return (stars - 3) / 2

    def classify_dimension(self, text):
        if not text:
            return None, 0.0
        embedding = self.embedder.encode(text, convert_to_tensor=True)
        cosine_scores = util.cos_sim(embedding, self.dimension_embeddings)[0]
        max_idx = torch.argmax(cosine_scores).item()
        return self.dimension_names[max_idx], cosine_scores[max_idx].item()

class YouTubeAnalyzer:
    def __init__(self, api_key):
        self.api_key = api_key
        self.youtube = build("youtube", "v3", developerKey=api_key)
        self.processor = TextProcessor()

    def get_channel_id_by_name(self, name, state):
        query = f"{name} {state}"
        response = self.youtube.search().list(q=query, part="id", maxResults=1, type="channel").execute()
        if response['items']:
            return response['items'][0]['id']['channelId']
        return None

    def get_channel_videos(self, channel_id, published_after, published_before):
        videos = []
        uploads_id = self.youtube.channels().list(part="contentDetails", id=channel_id).execute()['items'][0]['contentDetails']['relatedPlaylists']['uploads']
        next_page_token = None
        while True:
            response = self.youtube.playlistItems().list(
                playlistId=uploads_id, part="snippet", maxResults=50, pageToken=next_page_token
            ).execute()
            for item in response['items']:
                published = item['snippet']['publishedAt']
                if published_after <= published <= published_before:
                    videos.append({
                        "id": item['snippet']['resourceId']['videoId'],
                        "title": item['snippet']['title'],
                        "description": item['snippet'].get('description', '')
                    })
            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
            sleep(0.5)
        return videos

    def get_video_comments(self, video_id):
        comments = []
        next_page_token = None
        while True:
            try:
                response = self.youtube.commentThreads().list(
                    part="snippet", videoId=video_id, maxResults=100, pageToken=next_page_token
                ).execute()
                for item in response.get("items", []):
                    comments.append(item['snippet']['topLevelComment']['snippet']['textDisplay'])
                next_page_token = response.get("nextPageToken")
                if not next_page_token:
                    break
                sleep(0.5)
            except Exception:
                break
        return comments

    def analyze_state(self, state_name, channel_infos, date_range):
        print(f"Analyzing {state_name}...")
        dimension_stats = {dim: {"count": 0, "sentiment_sum": 0.0} for dim in POVERTY_DIMENSIONS}
        for channel in channel_infos:
            channel_id = self.get_channel_id_by_name(channel["name"], state_name)
            if not channel_id:
                continue
            videos = self.get_channel_videos(channel_id, date_range["published_after"], date_range["published_before"])
            for video in videos:
                all_texts = [video["title"] + ". " + video["description"]] + self.get_video_comments(video["id"])
                for text in all_texts:
                    clean = self.processor.clean_text(text)
                    sentiment = self.processor.get_sentiment_score(clean)
                    dimension, confidence = self.processor.classify_dimension(clean)
                    if confidence > 0.1:
                        dimension_stats[dimension]["count"] += 1
                        dimension_stats[dimension]["sentiment_sum"] += sentiment
        return dimension_stats


def analyze_all_states():
    analyzer = YouTubeAnalyzer(YT_API_KEY)
    date_range = {
        "published_after": "2022-01-01T00:00:00Z",
        "published_before": "2022-12-31T23:59:59Z"
    }
    os.makedirs("yt_channels", exist_ok=True)
    for state, channels in STATES_CHANNELS_NAMES.items():
        stats = analyzer.analyze_state(state, channels, date_range)
        df = pd.DataFrame([{ "dimension": dim.replace("_", " ").title(), "comment_count": v["count"], "avg_sentiment": v["sentiment_sum"]/v["count"] if v["count"] else 0 } for dim, v in stats.items()])
        df.to_csv(f"yt_channels/{state.replace(' ', '_').lower()}.csv", index=False)
        print(f"Saved yt_channels/{state.replace(' ', '_').lower()}.csv")

if __name__ == "__main__":
    analyze_all_states()
