<a href="https://colab.research.google.com/github/FardhanErfandyar/EarlyDetectionAnxietyTweets-CapstoneProject/blob/main/Capstone_Project_Fardhan_Erfandyar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install langchain_community
%pip install replicate
import os
import pandas as pd
from langchain_community.llms import Replicate
from getpass import getpass
import logging
from tqdm.auto import tqdm
import numpy as np



In [None]:
# ==============================================================================
# 1. ENVIRONMENT AND API TOKEN SETUP
# ==============================================================================
# Configure logging so progress can be followed in the notebook output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# First try to read the token from Colab Secrets (keeps it out of the notebook).
try:
    from google.colab import userdata
    REPLICATE_API_TOKEN = userdata.get('REPLICATE_API_TOKEN')  # Secret name must match exactly
except Exception:
    # Deliberately broad: outside Colab the import raises ImportError, and
    # inside Colab a missing or inaccessible secret may raise Colab-specific
    # errors whose classes cannot be imported (and therefore named) when
    # google.colab is unavailable. Any failure here means "use the fallback".
    REPLICATE_API_TOKEN = None

if REPLICATE_API_TOKEN:
    os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN
    logging.info("Token Replicate berhasil dimuat dari Colab Secrets.")
else:
    logging.error("Token Replicate tidak ditemukan di Colab Secrets. Harap ikuti panduan untuk menambahkannya.")
    # Fallback: ask for the token interactively without echoing it.
    try:
        REPLICATE_API_TOKEN = getpass("Harap masukkan Replicate API Token Anda: ")
    except Exception as e:
        # Keep the original cause attached so the traceback stays useful.
        raise ValueError(f"Gagal mendapatkan REPLICATE_API_TOKEN: {e}") from e

    # Strip stray whitespace from a pasted token and reject an empty answer,
    # which would otherwise only fail later at the first API call.
    REPLICATE_API_TOKEN = REPLICATE_API_TOKEN.strip()
    if not REPLICATE_API_TOKEN:
        raise ValueError("Gagal mendapatkan REPLICATE_API_TOKEN: token kosong.")
    os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN

In [None]:
# ==============================================================================
# 2. MODEL INITIALISATION
# ==============================================================================
# NOTE(review): the original section title referred to a Granite model, but the
# identifier below selects "openai/o4-mini". The sampling options look like the
# ones used for Granite-style models; confirm that this model accepts them.
model_id = "openai/o4-mini"

# Generation settings tuned for the analysis task.
generation_settings = dict(
    temperature=0.5,
    max_new_tokens=150,  # Leaves room for the reasoning and the classification
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
)

llm = Replicate(model=model_id, model_kwargs=generation_settings)
logging.info(f"Model {model_id} berhasil diinisialisasi.")

In [None]:
# ==============================================================================
# 3. MEMUAT DAN MEMPERSIAPKAN DATA
# ==============================================================================
def prepare_data(train_path, test_path):
    """Load the training and test CSV files and group the tweets per user.

    Args:
        train_path: Path to the labelled training CSV. It must provide the
            columns ``username``, ``full_text`` and ``label``.
        test_path: Path to the unlabelled test CSV. It must provide the
            columns ``username`` and ``full_text``.

    Returns:
        A ``(train_grouped, test_grouped)`` tuple of DataFrames with one row
        per username. ``full_text`` holds the list of that user's tweets;
        the training frame also carries the first ``label`` seen for the
        user. ``(None, None)`` is returned when a file cannot be loaded or a
        required column is missing (the problem is logged).
    """
    try:
        df_train = pd.read_csv(train_path)
        df_test = pd.read_csv(test_path)

        # Header cells sometimes carry stray spaces; normalise them.
        df_train.columns = df_train.columns.str.strip()
        df_test.columns = df_test.columns.str.strip()

        logging.info("File training dan testing berhasil dimuat.")
    except FileNotFoundError as e:
        logging.error(f"Error: {e}. Pastikan kedua file CSV ada di direktori.")
        return None, None
    except Exception as e:
        logging.error(f"Error saat memuat data: {e}")
        return None, None

    # Validate every column used below so that a malformed file follows the
    # same (None, None) error path instead of raising KeyError in groupby.
    required_columns = (
        ("training", df_train, ("username", "full_text", "label")),
        ("testing", df_test, ("username", "full_text")),
    )
    for dataset_name, frame, columns in required_columns:
        missing = [column for column in columns if column not in frame.columns]
        if missing:
            logging.error(f"Kolom {missing} tidak ditemukan di file {dataset_name}.")
            return None, None

    # One row per training user: all tweets as a list plus the user's label.
    train_grouped = df_train.groupby('username').agg({
        'full_text': list,
        'label': 'first',
    }).reset_index()

    # One row per test user: all tweets as a list.
    test_grouped = df_test.groupby('username')['full_text'].apply(list).reset_index()

    logging.info(f"{len(train_grouped)} pengguna unik di data latih.")
    logging.info(f"{len(test_grouped)} pengguna unik di data uji.")

    return train_grouped, test_grouped

In [None]:
# ==============================================================================
# 4. MEMBUAT PROMPT KOMPREHENSIF DENGAN CONTOH
# ==============================================================================
def create_few_shot_prompt(train_df, user_to_classify_tweets):
    """Assemble the few-shot classification prompt for a single user.

    Up to two label-1 ("Anxious") rows followed by up to two label-0
    ("Not Anxious") rows are taken from ``train_df`` and rendered as worked
    examples. The tweets in ``user_to_classify_tweets`` are then appended as
    the case the model has to analyse.

    Args:
        train_df: DataFrame with a ``label`` column and a ``full_text``
            column whose cells are iterables of tweets.
        user_to_classify_tweets: Iterable of tweets of the user to classify.

    Returns:
        The complete prompt as a single string.
    """

    def as_bullets(tweets):
        # One "- tweet" line per tweet.
        return "\n".join(f"- {tweet}" for tweet in tweets)

    def render_example(tweets, label):
        # Text of one solved example, terminated by a newline.
        if label == 1:
            verdict, pattern = "Anxious", "worry and self-doubt."
        else:
            verdict, pattern = "Not Anxious", "general daily life topics."
        return (
            "---\n"
            "**Example (Known Classification):**\n"
            "**User's Tweets:**\n"
            f"{as_bullets(tweets)}\n"
            "**Analysis:**\n"
            f"- **Reasoning:** This user's tweets show a pattern of {pattern}\n"
            f"- **Classification:** {verdict}\n"
        )

    # First two rows of each class, anxious examples first.
    labels = train_df['label']
    shots = pd.concat([train_df[labels == 1].head(2), train_df[labels == 0].head(2)])

    example_text = "".join(
        render_example(tweets, label)
        for tweets, label in zip(shots['full_text'], shots['label'])
    )

    # Tweets of the account under analysis, formatted like the examples.
    target_bullets = as_bullets(user_to_classify_tweets)

    return f"""You are a highly skilled psychological analyst specializing in detecting signs of anxiety from social media text.
Your task is to analyze a collection of recent tweets from a single user and determine if their overall posting pattern indicates potential anxiety.

**Definitions:**
- **Anxious:** The user's tweets, when viewed together, show a recurring pattern of worry, fear, panic, overthinking, social withdrawal, or direct mentions of an anxiety diagnosis. The sentiment is often negative or tense.
- **Not Anxious:** The user's tweets cover a variety of general topics without a recurring pattern of the anxious traits mentioned above.

**Instructions:**
1. Carefully read all the provided tweets from the user.
2. Analyze the overall pattern, tone, and topics. Do not judge based on a single tweet.
3. Provide your reasoning in one short sentence.
4. Provide your final classification in the specified format.

**LEARNING EXAMPLES:**
{example_text}
---
**USER TO CLASSIFY:**
**User's Tweets:**
{target_bullets}

**YOUR ANALYSIS:**
- **Reasoning:**
- **Classification:**"""


In [None]:
# ==============================================================================
# 5. FUNGSI UTAMA UNTUK MENJALANKAN PROYEK
# ==============================================================================
def _clean_model_field(raw_value):
    """Remove markdown/list decoration (``**``, ``-``, backticks) around a value."""
    return raw_value.strip().strip("*-_` \t").strip()


def _normalise_label(label_text):
    """Map a free-form label to 'Anxious' / 'Not Anxious' when it is recognisable."""
    lowered = label_text.casefold()
    # Check the negative form first: "not anxious" also contains "anxious".
    if "not anxious" in lowered:
        return "Not Anxious"
    if "anxious" in lowered:
        return "Anxious"
    return label_text


def _parse_analysis(response):
    """Extract ``(reasoning, classification)`` from the model's text reply.

    The defaults "N/A" and "Unknown" are kept when a field is absent or empty.
    When a field occurs on several lines, the last non-empty one wins.
    """
    reasoning = "N/A"
    classification = "Unknown"
    for line in str(response).splitlines():
        _, found, tail = line.partition("Reasoning:")
        if found:
            reasoning = _clean_model_field(tail) or reasoning
        _, found, tail = line.partition("Classification:")
        if found:
            cleaned = _clean_model_field(tail)
            if cleaned:
                classification = _normalise_label(cleaned)
    return reasoning, classification


def run_project():
    """Classify every account in the test set and write the results to CSV.

    Loads the datasets with ``prepare_data``, builds a few-shot prompt per
    test account, sends it to ``llm`` and parses the reply. A failed model
    call is recorded as an ``Error`` row instead of aborting the run, and the
    loop stops early after several consecutive failures (typically an
    account-level problem such as billing or authentication). Whatever was
    collected is always saved to ``classified_accounts_results.csv`` and a
    short summary is printed.

    Returns:
        None.
    """
    train_df, test_df = prepare_data(
        '/content/final_training_dataset.csv',  # Labelled training data
        '/content/final_testing_dataset.csv'  # Unlabelled test data
    )

    if train_df is None or test_df is None:
        logging.error("Data preparation failed. Exiting.")
        return

    # Stop hammering the API when it keeps failing; isolated errors are skipped.
    max_consecutive_failures = 3
    consecutive_failures = 0

    results = []
    # tqdm renders a progress bar while the accounts are processed.
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Classifying Accounts"):
        username = row['username']
        tweets = row['full_text']

        print(f"\n---> Menganalisis akun: @{username}...")

        # Prompt specific to this user.
        prompt = create_few_shot_prompt(train_df, tweets)

        try:
            response = llm.invoke(prompt)
        except Exception as exc:
            # Top-level boundary: the concrete exception types belong to the
            # LLM client, so catch broadly, log the traceback and carry on so
            # that the results gathered so far are not lost.
            logging.exception("LLM call failed for @%s", username)
            consecutive_failures += 1
            results.append({
                'username': username,
                'predicted_condition': 'Error',
                'reasoning': f"LLM call failed: {exc}",
                'tweet_count': len(tweets)
            })
            if consecutive_failures >= max_consecutive_failures:
                logging.error(
                    "Stopping after %d consecutive LLM failures; saving partial results.",
                    consecutive_failures,
                )
                break
            continue
        consecutive_failures = 0

        # Pull the two answer fields out of the model output.
        reasoning, classification = _parse_analysis(response)

        results.append({
            'username': username,
            'predicted_condition': classification,
            'reasoning': reasoning,
            'tweet_count': len(tweets)
        })
        print(f"  -> Hasil untuk @{username}: {classification} (Alasan: {reasoning})")

    # Persist the final (possibly partial) results. Explicit columns keep the
    # summary below working even when no account was processed.
    results_df = pd.DataFrame(
        results,
        columns=['username', 'predicted_condition', 'reasoning', 'tweet_count'],
    )
    output_filename = "classified_accounts_results.csv"
    results_df.to_csv(output_filename, index=False)

    print("\n" + "="*20 + " PROSES SELESAI " + "="*20)
    print(f"Hasil klasifikasi untuk {len(results_df)} akun disimpan ke '{output_filename}'")
    print("\nRingkasan Prediksi:")
    print(results_df['predicted_condition'].value_counts())
    print("\nContoh Hasil:")
    print(results_df.head())

# Run the main function.
if __name__ == '__main__':
    run_project()



Classifying Accounts:   0%|          | 0/92 [00:00<?, ?it/s]


---> Menganalisis akun: @167FM_...
  -> Hasil untuk @167FM_: **** Not Anxious (Alasan: **** The user’s tweets are casual day-to-day chatter and problem-solving without recurring worry, panic, or overthinking.)

---> Menganalisis akun: @Dnashf...
  -> Hasil untuk @Dnashf: Not Anxious (Alasan: The user’s posts are a mix of casual daily updates, excitement, and typical student stress without a persistent pattern of worry or fear.)

---> Menganalisis akun: @PamulangBased...
  -> Hasil untuk @PamulangBased: -  Not Anxious (Alasan: -  The user’s posts are dominated by commentary on films, sports, and casual venting rather than recurring worry, fear, or self-doubt.)

---> Menganalisis akun: @_bomnall...
  -> Hasil untuk @_bomnall: **** Anxious (Alasan: **** The user repeatedly expresses stress, worry, exhaustion and overthinking triggered by everyday events.)

---> Menganalisis akun: @_cazhrr...
  -> Hasil untuk @_cazhrr: **** Not Anxious (Alasan: **** The user’s tweets are predominantly cas

ReplicateError: ReplicateError Details:
title: Your account has been temporarily disabled.
status: 402
detail: To re-enable your account, please make sure you have a working payment method setup and all your outstanding charges are paid up. Go to https://replicate.com/account/billing#billing to see billing status. You may contact us at https://replicate.com/support if you are having trouble paying.