In [None]:
import requests
import csv
from datetime import datetime
import time

import os

# YouTube Data API v3 key. It is read from the YOUTUBE_API_KEY environment variable so the
# real key never has to be saved inside the notebook; when the variable is unset or empty
# the original placeholder value is used.
API_KEY = os.environ.get("YOUTUBE_API_KEY") or 'YOUR_YOUTUBE_API_KEY_HERE'

# Search phrases passed one by one to the YouTube search endpoint to discover videos.
SEARCH_QUERIES = [
    "polusi udara jakarta", "kualitas udara jakarta", "cuaca jakarta",
    "banjir jakarta", "debu jakarta", "asap jakarta", "kabut jakarta",
    "hujan jakarta", "banjir jkt", "polusi jkt"
]

# Hand-picked video ids that are always scraped, in addition to the search results.
VIDEO_IDS_MANUAL = [
    "Xmd_6ZXl6lI","imlCPA5Vu2c","OGw2Jhu4KlU","fZU9Q1_GHLE",
    "kJsBTQkZV0M","Mp4hDSnpjQ4","_41KtEUiZT8","E4nuMUg96aM",
    "frPoXlkkOXM","zdhm5OuH5Qs","52BE41TcHu4","bnb8RHwortQ"
]

# A comment is kept only if its text contains at least one of these substrings.
COMMENT_KEYWORDS = [
    "banjir","cuaca","polusi","polusi udara","debu","asap","kabut",
    "kualitas udara","hujan","angin","kotor","pabrik","abu","jkt",
    "jakarta","sungai"
]

# Normalise to lower case because comments are lower-cased before matching.
COMMENT_KEYWORDS = [k.lower() for k in COMMENT_KEYWORDS]


def safe_get(url, params, max_retries=3, backoff=2):
    """GET ``url`` and return its decoded JSON body, retrying transient failures.

    Parameters
    ----------
    url : str
        Endpoint to request.
    params : dict
        Query-string parameters forwarded to ``requests.get``.
    max_retries : int
        Maximum number of attempts.
    backoff : int or float
        Base delay in seconds; after failed attempt ``k`` (1-based) the function
        waits ``backoff * k`` seconds before trying again.

    Returns
    -------
    The parsed JSON body on HTTP 200. Otherwise a dict with an ``"error"`` key:
    ``{"error": "HTTP <code>"}`` for client errors that are not retried, or
    ``{"error": "max retries"}`` once every attempt has failed.
    """
    for attempt in range(max_retries):
        try:
            r = requests.get(url, params=params, timeout=20)
            if r.status_code == 200:
                return r.json()
            print(f"HTTP {r.status_code} - {r.text[:200]}")
            # Client errors (bad request, quota/forbidden, not found, ...) will not
            # succeed on a retry; only 408 (timeout) and 429 (rate limit) are transient.
            if 400 <= r.status_code < 500 and r.status_code not in (408, 429):
                return {"error": f"HTTP {r.status_code}"}
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print("Request error:", e)
        # No point in sleeping after the final attempt: nothing follows it.
        if attempt < max_retries - 1:
            time.sleep(backoff * (attempt + 1))
    return {"error": "max retries"}

def search_videos(query, max_results=25):
    """Return the video ids that the YouTube search endpoint yields for ``query``.

    Parameters
    ----------
    query : str
        Free-text search phrase.
    max_results : int
        Value sent as the ``maxResults`` request parameter.

    Returns
    -------
    list of str
        Video ids in response order; an empty list when the response carries
        no ``"items"`` key (a message is printed in that case).
    """
    response = safe_get(
        "https://www.googleapis.com/youtube/v3/search",
        {
            "key": API_KEY,
            "q": query,
            "part": "id",
            "maxResults": max_results,
            "type": "video",
        },
    )

    if "items" not in response:
        print("Search error or no items for query:", query, response.get("error"))
        return []

    # Entries without a (non-empty) videoId are skipped.
    candidates = (entry.get("id", {}).get("videoId") for entry in response["items"])
    return [video_id for video_id in candidates if video_id]

def _parse_published_at(published):
    """Parse a ``publishedAt`` timestamp string into a datetime, or return None."""
    if not isinstance(published, str) or not published:
        return None
    try:
        # fromisoformat does not accept a trailing "Z" on older Python versions.
        return datetime.fromisoformat(published.replace("Z", "+00:00"))
    except ValueError:
        pass
    try:
        return datetime.strptime(published, "%Y-%m-%dT%H:%M:%S.%fZ")
    except ValueError:
        return None


def scrape_comments_for_video(video_id, year=2023, keywords=None):
    """Collect the top-level comments of one video that match a year and keyword filter.

    Every page of the ``commentThreads`` endpoint is fetched through ``safe_get``.
    A comment is kept when its ``publishedAt`` year equals ``year`` and its
    lower-cased text contains at least one keyword as a substring.

    Parameters
    ----------
    video_id : str
        Id of the video whose comment threads are read.
    year : int
        Calendar year (as written in the timestamp) a comment must have. Defaults to 2023.
    keywords : iterable of str, optional
        Substrings to look for; they are lower-cased before matching.
        Defaults to the module-level ``COMMENT_KEYWORDS``.

    Returns
    -------
    tuple
        ``(results, checked, matched)`` where ``results`` is a list of dicts with the
        keys ``tanggal`` (YYYY-MM-DD), ``komentar`` and ``video_id``; ``checked`` is the
        number of comments inspected and ``matched`` the number kept.
    """
    if keywords is None:
        keywords = COMMENT_KEYWORDS
    else:
        keywords = [kw.lower() for kw in keywords]

    url = "https://www.googleapis.com/youtube/v3/commentThreads"
    results = []
    page_token = None
    checked = 0
    matched = 0

    while True:
        params = {
            "key": API_KEY,
            "part": "snippet",
            "videoId": video_id,
            "maxResults": 100,
            "textFormat": "plainText"
        }
        if page_token:
            params["pageToken"] = page_token

        data = safe_get(url, params)
        if "error" in data:
            # Whatever was collected from earlier pages is still returned.
            print("API error while fetching comments for", video_id, data.get("error"))
            break

        items = data.get("items", [])
        if not items and page_token is None:
            print(f"-> No commentThreads returned for video {video_id} (comments may be disabled or none).")
            break

        for it in items:
            try:
                c = it["snippet"]["topLevelComment"]["snippet"]
            except (KeyError, TypeError):
                # Malformed thread entry: skip it without counting it as checked.
                continue
            checked += 1
            text = c.get("textDisplay", "")

            dt = _parse_published_at(c.get("publishedAt"))
            if dt is None or dt.year != year:
                continue

            text_l = text.lower()
            if any(kw in text_l for kw in keywords):
                matched += 1
                results.append({
                    "tanggal": dt.strftime("%Y-%m-%d"),
                    "komentar": text,
                    "video_id": video_id
                })

        page_token = data.get("nextPageToken")
        if not page_token:
            break

    return results, checked, matched

import os

# Start from the hand-picked videos, then add whatever each search query returns;
# the set removes ids found by more than one query.
video_ids = set(VIDEO_IDS_MANUAL)

print("Searching videos from queries...")
for q in SEARCH_QUERIES:
    video_ids.update(search_videos(q, max_results=25))

# Sorted so the processing order (and the row order of the CSV) is the same on every run;
# iterating a set of strings directly is not reproducible across kernel restarts.
video_ids = sorted(video_ids)
print("Total videos to check:", len(video_ids))

all_rows = []
summary = []

for vid in video_ids:
    print("\n--- Processing video:", vid)
    rows, checked, matched = scrape_comments_for_video(vid)
    print(f"Checked comments: {checked}, Matched: {matched} for video {vid}")
    all_rows.extend(rows)
    summary.append({"video_id": vid, "checked": checked, "matched": matched})

outfile = "data/komentar_jakarta_2023_debug.csv"
if all_rows:
    # Create the target folder first so a long scrape is not lost to a missing directory.
    os.makedirs(os.path.dirname(outfile) or ".", exist_ok=True)
    with open(outfile, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=["tanggal", "komentar", "video_id"])
        writer.writeheader()
        writer.writerows(all_rows)
    print(f"\nSelesai. Disimpan: {outfile}. Total matched rows: {len(all_rows)}")
else:
    # Nothing matched: print the per-video counters to help diagnose why.
    print("\nTidak ditemukan komentar yang cocok di semua video (hasil matched = 0).")
    print("Ringkasan per video (checked, matched):")
    for s in summary:
        print(f" - {s['video_id']}: checked={s['checked']}, matched={s['matched']}")


SCRAPING KOMENTAR YOUTUBE TENTANG RESPONS PUBLIK TERHADAP CUACA DAN KUALITAS UDARA

Cleaning hasil scraping sentimen 

In [None]:
import pandas as pd
import re

# Load the scraped comments and discard rows in which every column is empty.
data = pd.read_csv("data/komentar_jakarta_2023_debug.csv").dropna(how="all")

def bersihkan_komentar(teks):
    """Return ``teks`` reduced to ASCII letters, digits and basic punctuation.

    Missing values become an empty string. Every run of characters outside
    ``a-z A-Z 0-9 . , ! ? ( ) / % -`` and the space is replaced by one space,
    repeated whitespace is collapsed, and the result is trimmed.
    """
    if pd.isna(teks):
        return ""

    hanya_diizinkan = re.sub(r"[^a-zA-Z0-9.,!?()/%\- ]+", " ", str(teks).strip())
    return re.sub(r"\s+", " ", hanya_diizinkan).strip()


import os

# Clean every comment, then drop the ones that end up empty and exact repeats on the same date.
data["komentar"] = data["komentar"].apply(bersihkan_komentar)
data = data[data["komentar"] != ""]
data = data.drop_duplicates(subset=["tanggal", "komentar"])

# One row per date holding the list of that date's comments.
kelompok_tanggal = data.groupby("tanggal")["komentar"].apply(list).reset_index()

# Widest row decides how many sentimen_N columns are needed; 0 when nothing survived
# cleaning (max() of an empty Series is NaN, which range() cannot accept).
maksimal_isi = int(kelompok_tanggal["komentar"].apply(len).max()) if len(kelompok_tanggal) else 0

# Build all sentimen_N columns in one go, padding shorter dates with empty strings,
# instead of inserting columns one at a time.
sentimen_lebar = pd.DataFrame(
    [daftar + [""] * (maksimal_isi - len(daftar)) for daftar in kelompok_tanggal["komentar"]],
    columns=[f"sentimen_{i + 1}" for i in range(maksimal_isi)],
    index=kelompok_tanggal.index,
)
kelompok_tanggal = pd.concat([kelompok_tanggal[["tanggal"]], sentimen_lebar], axis=1)

display(kelompok_tanggal.head())

# The output folder may not exist yet on a fresh checkout.
os.makedirs("data_clean", exist_ok=True)
kelompok_tanggal.to_csv("data_clean/data_sentimen_bersih.csv", index=False)


CLEANING CUACA

In [None]:
import pandas as pd

import os

df = pd.read_excel('data/dataset_prediksi_cuaca.xlsx')

df.columns = df.columns.str.strip()

# Trim whitespace in string cells only; non-string values pass through unchanged.
# Series.map is used per column because DataFrame.applymap is deprecated in recent pandas.
df = df.apply(lambda kolom: kolom.map(lambda x: x.strip() if isinstance(x, str) else x))
df = df.drop(columns=['Lokasi'])

kolom_numerik = [
    'Suhu Maks (deg of C)', 'Suhu Min (deg of C)', 'Kelembaban (%)',
    'Kecepatan Angin (km/jam)', 'Arah Angin (deg)', 'Tekanan Udara (hPa)',
    'Tutupan Awan (%)', 'Curah Hujan Hari Ini (mm)', 'Curah Hujan Besok (mm)'
]

# Numeric codes for the weather labels.
# NOTE(review): the codes are not monotonic in weather severity ('Cerah' sits between
# 'Hujan Ringan' and 'Hujan Sedang') - confirm this ordering is intended before the
# column is treated as ordinal.
mapping_cuaca = {
    'Cerah Berawan':2,
    'Hujan Ringan':1,
    'Cerah':0,
    'Hujan Sedang':-1,
    'Hujan Lebat':-2
}


for kolom in kolom_numerik:
    # Accept decimal commas, turn anything unparsable into NaN ...
    df[kolom] = df[kolom].astype(str).str.replace(',', '.', regex=False)
    df[kolom] = pd.to_numeric(df[kolom], errors='coerce')
    # ... then fill gaps by linear interpolation; bfill/ffill cover leading/trailing gaps.
    # (.bfill()/.ffill() replace the deprecated fillna(method=...) spelling.)
    df[kolom] = df[kolom].interpolate(method='linear').bfill().ffill()

df['Tanggal'] = pd.to_datetime(df['Tanggal'], errors='coerce')
# .copy() so the column assignments below act on an independent frame, not a slice of df.
df_2023 = df[df['Tanggal'].dt.year == 2023].copy()
df_2023 = df_2023.drop_duplicates()
df_2023 = df_2023.dropna(subset=['Tanggal'])


# Labels missing from mapping_cuaca are left as text by replace(), so to_numeric raises on them.
df_2023['Cuaca Hari Ini'] = pd.to_numeric(df_2023['Cuaca Hari Ini'].replace(mapping_cuaca))
df_2023['Cuaca Besok'] = pd.to_numeric(df_2023['Cuaca Besok'].replace(mapping_cuaca))
df_2023 = df_2023.reset_index(drop=True)

# The output folder may not exist yet on a fresh checkout.
os.makedirs("data_clean", exist_ok=True)
df_2023.to_csv("data_clean/data_cuaca_jakarta_2023_bersih.csv", index=False)

display(df_2023.head())




CLEANING DATA KUALITAS UDARA

In [None]:
import pandas as pd
import re

import os

data = pd.read_csv("data/ispu_dki4.csv")

data.columns = data.columns.str.strip().str.lower()

# Numeric codes for the air-quality category labels.
# NOTE(review): 'TIDAK ADA DATA' shares code 0 with 'SANGAT TIDAK SEHAT', so missing
# readings become indistinguishable from the worst category - confirm this is intended.
mapping_udara = {
'TIDAK ADA DATA':0,
'SANGAT TIDAK SEHAT':0,
'TIDAK SEHAT':1,
'SEDANG':2,
'BAIK':3
}
data = data.drop(columns=['stasiun', 'critical', 'max'])


kolom_angka = ["pm25","pm10","so2","co","o3","no2"]

for kolom in kolom_angka:
    # Accept decimal commas and stray whitespace; unparsable values become NaN.
    data[kolom] = (
        data[kolom]
        .astype(str)
        .str.replace(",", ".", regex=False)
        .str.strip()
    )
    data[kolom] = pd.to_numeric(data[kolom], errors="coerce")

    # Fill gaps by linear interpolation; bfill/ffill cover leading/trailing gaps.
    # (.bfill()/.ffill() replace the deprecated fillna(method=...) spelling.)
    data[kolom] = data[kolom].interpolate(method='linear').bfill().ffill()

data["tanggal"] = pd.to_datetime(data["tanggal"], errors="coerce")
# .copy() so the column assignment below acts on an independent frame, not a slice.
data = data[data['tanggal'].dt.year == 2023].copy()
data = data.reset_index(drop=True)

# Labels missing from mapping_udara are left as text by replace(), so to_numeric raises on them.
data['categori'] = pd.to_numeric(data['categori'].replace(mapping_udara))

data = data.dropna(subset=["tanggal"])
data = data.drop_duplicates()

display(data.head())
# The output folder may not exist yet on a fresh checkout.
os.makedirs("data_clean", exist_ok=True)
data.to_csv("data_clean/kualitas_udara_jakarta_2023_bersih.csv", index=False)



INTEGRASI BERDASARKAN TANGGAL

In [None]:
import pandas as pd

import os

data_sentimen = pd.read_csv("data_clean/data_sentimen_bersih.csv")
data_cuaca = pd.read_csv("data_clean/data_cuaca_jakarta_2023_bersih.csv")
data_udara = pd.read_csv("data_clean/kualitas_udara_jakarta_2023_bersih.csv")


# Give all three tables a datetime join key with the same name, "tanggal".
data_sentimen["tanggal"] = pd.to_datetime(data_sentimen["tanggal"], errors="coerce")

data_cuaca["tanggal"] = pd.to_datetime(data_cuaca["Tanggal"], errors="coerce")

data_udara["tanggal"] = pd.to_datetime(data_udara["tanggal"], errors="coerce")

data_cuaca = data_cuaca.drop(columns=["Tanggal"])

# Weather is the base table: left joins keep every weather row and attach
# air-quality and comment columns where the date matches.
gabung_cuaca_udara = pd.merge(
    data_cuaca,
    data_udara,
    on="tanggal",
    how="left"
)

gabungan_akhir = pd.merge(
    gabung_cuaca_udara,
    data_sentimen,
    on="tanggal",
    how="left"
)


if "tanggal" not in gabungan_akhir.columns:
    raise ValueError("Kolom 'tanggal' hilang. Periksa merge.")

gabungan_akhir = gabungan_akhir.sort_values("tanggal")

# Move the date to the first column, keeping the other columns in their current order.
kolom_akhir = ["tanggal"] + [col for col in gabungan_akhir.columns if col != "tanggal"]
gabungan_akhir = gabungan_akhir[kolom_akhir]

# Number of non-missing sentimen_N cells per row = number of comments on that date.
sentimen_cols = [col for col in gabungan_akhir.columns if col.startswith("sentimen_")]
gabungan_akhir["total_sentimen"] = gabungan_akhir[sentimen_cols].count(axis=1)

# Drop a leftover index column if one of the inputs was saved with it.
gabungan_akhir = gabungan_akhir.drop(columns=["unnamed: 0", "Unnamed: 0"], errors="ignore")

# The output folder may not exist yet on a fresh checkout.
os.makedirs("data_clean", exist_ok=True)
gabungan_akhir.to_csv("data_clean/integrasi_cuaca_udara_sentimen_2023.csv", index=False)


# Display options must be set before display() for them to affect the preview below.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

display(gabungan_akhir.head())


print("\nJumlah baris:", len(gabungan_akhir))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import matplotlib.dates as mdates



def min_max_normalize(series):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    A Series whose minimum equals its maximum yields NaN (division by zero).
    """
    terendah = series.min()
    return (series - terendah) / (series.max() - terendah)


# The integration cell writes this file under data_clean/, so read it from there.
# parse_dates gives the x axis real datetimes, which the month locator below needs.
df = pd.read_csv('data_clean/integrasi_cuaca_udara_sentimen_2023.csv', parse_dates=['tanggal'])

numeric_cols = [
    'Suhu Maks (deg of C)',
    'Suhu Min (deg of C)',
    'Kelembaban (%)',
    'Kecepatan Angin (km/jam)',
    'Tekanan Udara (hPa)',
    'Tutupan Awan (%)',
    'Curah Hujan Hari Ini (mm)',
    'pm25', 'pm10', 'so2', 'co', 'o3', 'no2',
    'total_sentimen'
]

display(df[numeric_cols].describe())

df[numeric_cols].hist(bins=20, figsize=(15, 12))
plt.suptitle('Distribusi Variabel Numerik')
plt.tight_layout()
plt.show()


plt.figure(figsize=(14, 6))

# Put the three series on a common 0-1 scale so they can share one axis.
suhu_maks_norm = min_max_normalize(df['Suhu Maks (deg of C)'])
pm25_norm = min_max_normalize(df['pm25'])
total_sentimen_norm = min_max_normalize(df['total_sentimen'])

plt.plot(df['tanggal'], suhu_maks_norm, label='Suhu Maks (normalized)', alpha=0.7)
plt.plot(df['tanggal'], pm25_norm, label='PM2.5 (normalized)', alpha=0.7)
plt.plot(df['tanggal'], total_sentimen_norm, label='Total Sentimen/Komentar (normalized)', alpha=0.7)
plt.title('Tren Harian: Suhu Maks, PM2.5, dan Volume Komentar')
plt.xlabel('Tanggal')
plt.ylabel('Nilai')
plt.legend()
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))  # one major tick per month
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
def detect_outlier_iqr(df, col):
    """Return the rows of ``df`` whose ``col`` value lies outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, with Q1/Q3 the 25th and
    75th percentiles of the column. Rows with a missing value are never returned.
    """
    nilai = df[col]
    kuartil_bawah, kuartil_atas = nilai.quantile(0.25), nilai.quantile(0.75)
    jarak = 1.5 * (kuartil_atas - kuartil_bawah)

    terlalu_rendah = nilai.lt(kuartil_bawah - jarak)
    terlalu_tinggi = nilai.gt(kuartil_atas + jarak)
    return df[terlalu_rendah | terlalu_tinggi]


# IQR-based outliers for each variable of interest.
outlier_suhu = detect_outlier_iqr(df, 'Suhu Maks (deg of C)')
outlier_hujan = detect_outlier_iqr(df, 'Curah Hujan Hari Ini (mm)')
outlier_pm25 = detect_outlier_iqr(df, 'pm25')
outlier_sentimen = detect_outlier_iqr(df, 'total_sentimen')

for judul, hasil, kolom in [
    ("Outlier Suhu Maks:", outlier_suhu, 'Suhu Maks (deg of C)'),
    ("Outlier Curah Hujan:", outlier_hujan, 'Curah Hujan Hari Ini (mm)'),
    ("Outlier PM2.5:", outlier_pm25, 'pm25'),
    ("Outlier Komentar/Sentimen:", outlier_sentimen, 'total_sentimen'),
]:
    print(judul)
    display(hasil[['tanggal', kolom]])

# outlier_sentimen is a row subset of df, so the environmental readings are already
# on those rows; selecting them avoids a self-merge that duplicated every shared
# column with _x/_y suffixes.
anomaly_merge = outlier_sentimen[
    ['tanggal', 'total_sentimen', 'Suhu Maks (deg of C)', 'Curah Hujan Hari Ini (mm)', 'pm25']
].reset_index(drop=True)
print("Merge Outlier Sentimen dan Info Lingkungan:")
display(anomaly_merge)

plt.figure(figsize=(15, 6))
# Convert to datetimes for plotting so the month locator works even if the column holds strings.
plt.plot(pd.to_datetime(df['tanggal']), df['total_sentimen'], label='Total Sentimen')
plt.scatter(pd.to_datetime(outlier_sentimen['tanggal']), outlier_sentimen['total_sentimen'], color='red', label='Outlier Sentimen')
plt.title('Ledakan Volume Komentar (Outlier)')
plt.ylabel('Total Komentar/Sentimen')
plt.xlabel('Tanggal')
plt.legend()
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))  # one major tick per month
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(14,6))

# Min-max scale both series to 0-1 so they can share one axis.
sentimen_norm = (df['total_sentimen'] - df['total_sentimen'].min()) / (df['total_sentimen'].max() - df['total_sentimen'].min())
pm25_norm = (df['pm25'] - df['pm25'].min()) / (df['pm25'].max() - df['pm25'].min())

# Convert to datetimes for plotting so the month locator works even if the column holds strings.
tanggal_plot = pd.to_datetime(df['tanggal'])

plt.plot(tanggal_plot, sentimen_norm, label='Volume Sentimen (normalized)')
plt.plot(tanggal_plot, pm25_norm, label='PM2.5 (normalized)', alpha=0.7)
plt.title('Tren Volume Sentimen dan PM2.5 Harian (Normalized)')
plt.xlabel('Tanggal')
plt.ylabel('Nilai Normalized (0-1)')
plt.legend()
ax = plt.gca()
ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))  # one major tick per month
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# The integration cell writes this file under data_clean/, so read it from there.
df_final = pd.read_csv('data_clean/integrasi_cuaca_udara_sentimen_2023.csv')


# Pairwise correlation between all numeric columns.
kolom_numerik = df_final.select_dtypes(include="number")
corr = kolom_numerik.corr()


plt.figure(figsize=(14, 12))
plt.imshow(corr, cmap="viridis")
plt.colorbar()

plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.title("Heatmap Korelasi dengan Nilai Angka")

# Write each coefficient into its cell: row index -> y position, column index -> x position.
for (i, j), value in np.ndenumerate(corr.to_numpy()):
    plt.text(j, i, f"{value:.2f}", ha='center', va='center', color='white', fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt


import seaborn as sns

# Derive the calendar month (1-12) from the date column.
df['tanggal'] = pd.to_datetime(df['tanggal'])
df['bulan'] = df['tanggal'].dt.month

fitur = ['Suhu Maks (deg of C)', 'Curah Hujan Hari Ini (mm)', 'pm25', 'total_sentimen']

# Mean of each feature per month.
bulanan = df.groupby('bulan')[fitur].mean()

# One line per feature across the months.
fig_tren, ax_tren = plt.subplots(figsize=(12, 6))
for nama_fitur in fitur:
    ax_tren.plot(bulanan.index, bulanan[nama_fitur], marker='o', label=nama_fitur)
ax_tren.legend()
ax_tren.set_title('Trend Bulanan Cuaca, Polusi, dan Volume Sentimen')
ax_tren.set_xlabel('Bulan')
ax_tren.set_ylabel('Rata-rata')
ax_tren.set_xticks(range(1,13))
plt.show()

# 2 x 2 grid of per-month box plots, one panel per feature.
plt.figure(figsize=(12,8))
for posisi, nama_fitur in enumerate(fitur, start=1):
    panel = plt.subplot(2, 2, posisi)
    sns.boxplot(x='bulan', y=nama_fitur, data=df, palette='tab20', ax=panel)
    panel.set_title(f'Persebaran {nama_fitur} per Bulan')
plt.tight_layout()
plt.show()

print("Statistik Bulanan:")
print(bulanan)

# Seasonal view using a local rule of thumb: rainy season Nov-Apr, dry season May-Oct.
BULAN_HUJAN = [11,12,1,2,3,4]
df['musim'] = df['bulan'].map(lambda nomor_bulan: 'Hujan' if nomor_bulan in BULAN_HUJAN else 'Kemarau')
musiman = df.groupby('musim')[fitur].mean()
print("\nStatistik Musiman:")
print(musiman)


In [None]:

# Show the per-column maximum of `kolom_numerik`. This expects the DataFrame of numeric
# columns built in the correlation cell; the weather-cleaning cell binds the same name
# to a plain list, on which .max() would fail.
kolom_numerik.max()
