In [1]:
from confluent_kafka import Consumer, KafkaError
import socket
import json
import pandas as pd
from datetime import datetime, timedelta

BROKER = 'localhost:9092'
GROUP_ID = 'analytics'
TOPIC = 'youtube_topic'

# change to path for players_22.csv
PLAYER_DATASET_PATH = 'Project FIFA Dataset/players_22.csv'

players_df = pd.read_csv(PLAYER_DATASET_PATH)


player_names = []
name_to_long_name = {}
for idx, row in players_df.iterrows():
    # .strip for whitespace
    short_name = row['short_name'].lower().strip()
    long_name = row['long_name'].lower().strip()
    player_names.append(short_name)
    player_names.append(long_name)
    name_to_long_name[short_name] = row['long_name']
    name_to_long_name[long_name] = row['long_name']

def find_player_mentions(text):
    text_lower = text.lower()
    words = text_lower.split()
    mentioned_players = []
    for player_name in player_names:
        if player_name in words:
            mentioned_players.append(player_name)
    return mentioned_players

def create_kafka_consumer(broker, group_id, topic):
    conf = {
        'bootstrap.servers': broker,
        'group.id': group_id,
        'auto.offset.reset': 'earliest',
        'client.id': socket.gethostname()
    }
    consumer = Consumer(conf)
    consumer.subscribe([topic])
    return consumer

player_mentions = {}
all_comments = []

consumer = create_kafka_consumer(BROKER, GROUP_ID, TOPIC)
start_time = datetime.now()
end_time = start_time + timedelta(minutes=5)

while datetime.now() < end_time:
    msg = consumer.poll(timeout=1.0)
    if msg is None:
        continue
    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        break
    
    key = msg.key().decode('utf-8')
    value = msg.value().decode('utf-8')
    comment_data = json.loads(value)
    comment_text = comment_data.get('text', '')

    all_comments.append(comment_data)
    mentioned_players = find_player_mentions(comment_text)
    for player_name in mentioned_players:
        if player_name in player_mentions:
            player_mentions[player_name] += 1
        else:
            player_mentions[player_name] = 1

consumer.close()

sorted_players = sorted(player_mentions.items(), key=lambda x: x[1], reverse=True)

print("Most Popular Player in YouTube Comments")
print("=" * 39)
if len(sorted_players) > 0:
    most_popular_name = sorted_players[0][0]
    most_popular_count = sorted_players[0][1]
    
    # Get the full name (long_name) from the mapping
    full_name = name_to_long_name.get(most_popular_name, most_popular_name.title())
    
    print(f"Player: {full_name}")
    print(f"Total Mentions: {most_popular_count}")
else:
    print("No player mentions found.")

# dump all comments into a json file
with open('youtube_comments_dump.json', 'w') as f:
    json.dump(all_comments, f)

  players_df = pd.read_csv(PLAYER_DATASET_PATH)


Most Popular Player in YouTube Comments
Player: Rodrigo Sánchez Rodríguez
Total Mentions: 12
