Question 1:
How has the public perception of anti-vaccine conspiracy theories, as seen on YouTube, changed over the last five years?

In [None]:
import sys

!{sys.executable} -m pip install spacy

!{sys.executable} -m spacy download en_core_web_sm

!{sys.executable} -m pip install --upgrade scipy --no-cache-dir

!{sys.executable} -m pip uninstall numpy -y
!{sys.executable} -m pip install numpy==1.24.4 --no-cache-dir

!{sys.executable} pip install pandas==1.5.3 --no-cache-dir


In [None]:
import numpy as np
# Print the NumPy version this kernel actually imported; useful for checking it
# against the numpy==1.24.4 pin requested in the install cell.
# NOTE: a module that was already imported before a reinstall is not reloaded,
# so a kernel restart may be needed before the new version shows up here.
print(np.__version__)

In [None]:
import pandas as pd
import json
import ast
from collections import Counter
from keybert import KeyBERT
import re


# Load the raw export and discard rows that are exact duplicates.
data = pd.read_csv('one.csv').drop_duplicates()

# Parse the publication timestamps so they can be compared against date bounds.
data = data.assign(**{'Published At': pd.to_datetime(data['Published At'])})

# One (label, first day, last day) triple per calendar year, 2020 through 2025.
time_periods = [
    (str(year), f'{year}-01-01', f'{year}-12-31')
    for year in range(2020, 2026)
]

# Collected output, keyed by period label.
result = dict()

def _shared_keybert_model():
    """Return one lazily created KeyBERT instance shared by all calls."""
    model = getattr(_shared_keybert_model, "_instance", None)
    if model is None:
        model = KeyBERT()
        _shared_keybert_model._instance = model
    return model


def extract_keywords(data, model=None, ignore_words=None):
    """Count single-word keywords across a collection of texts.

    Each item is converted with ``str`` and passed to the model's
    ``extract_keywords`` with ``keyphrase_ngram_range=(1, 1)``; the returned
    ``(keyword, score)`` pairs are tallied by keyword (scores are discarded).

    Parameters
    ----------
    data : iterable
        Texts to analyse, one document per item.
    model : object, optional
        Anything exposing ``extract_keywords(text, keyphrase_ngram_range=...)``
        that returns ``(keyword, score)`` pairs. When omitted, a single
        ``KeyBERT()`` instance is created on first use and reused by later
        calls instead of being rebuilt on every call.
    ignore_words : iterable of str, optional
        Keywords to leave out of the tally, matched case-insensitively.
        Defaults to ignoring nothing.

    Returns
    -------
    pandas.DataFrame
        Columns ``words`` and ``numbers``, restricted to keywords seen more
        than once and sorted by ``numbers`` in descending order.
    """
    if model is None:
        model = _shared_keybert_model()
    ignored = {str(word).lower() for word in (ignore_words or ())}

    keyword_counts = Counter()
    for item in data:
        # Extract single-word keywords
        scored = model.extract_keywords(str(item), keyphrase_ngram_range=(1, 1))
        keyword_counts.update(kw for kw, _score in scored if kw.lower() not in ignored)

    keywords_df = pd.DataFrame(list(keyword_counts.items()), columns=["words", "numbers"])
    # A stable sort keeps the ordering of equally frequent keywords deterministic,
    # which matters when a caller truncates the result (e.g. with ``head``).
    frequent_words = keywords_df.loc[keywords_df['numbers'] > 1].sort_values(
        by='numbers', ascending=False, kind='mergesort'
    )
    return frequent_words

# Build the per-period keyword summary and persist it as JSON.
for name, start, end in time_periods:
    # `end` is meant as an inclusive calendar date, but comparing a datetime
    # column with a bare date string compares against midnight at the start of
    # that day, which would drop everything published later on the final day.
    # Use an exclusive upper bound of "the day after `end`" instead.
    end_exclusive = (pd.Timestamp(end) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')
    in_period = (data['Published At'] >= start) & (data['Published At'] < end_exclusive)
    filtered = data[in_period]

    # Drop missing comment cells first; otherwise astype(str) would turn them
    # into the literal text "nan" and feed that to the keyword model.
    comments = filtered['TOP 10 comments'].dropna().astype(str).tolist()

    frequent_words = extract_keywords(comments)
    # Keep only the 15 most frequent keywords for each period.
    result[name] = {
        'keywords': frequent_words.head(15).to_dict(orient="records")
    }

with open('answer_one.json', 'w', encoding='utf-8') as file:
    json.dump(result, file, indent=2)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import json
from collections import defaultdict

# Read back the per-year keyword summary from disk.
with open('answer_one.json', 'r', encoding='utf-8') as file:
    keywords = json.load(file)

# Add up each keyword's count over every year in which it appears.
frequencies_all_years = defaultdict(int)
for content in keywords.values():
    for kw in content['keywords']:
        frequencies_all_years[kw["words"]] += int(kw["numbers"])

# Ascending order places the most frequent keyword at the top of the
# horizontal bar chart.
data = pd.DataFrame(
    frequencies_all_years.items(), columns=['Keyword', 'Frequency']
).sort_values(by='Frequency', ascending=True)

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(data['Keyword'], data['Frequency'], color='green')
ax.set_xlabel('Frequency')
ax.set_title('Top keywords')
fig.tight_layout()
plt.show()

