<a href="https://colab.research.google.com/github/FardhanErfandyar/EarlyDetectionAnxietyTweets-CapstoneProject/blob/main/Capstone_Project_Fardhan_Erfandyar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import Library
%pip install langchain_community
%pip install replicate
import os
import pandas as pd
from langchain_community.llms import Replicate
from getpass import getpass
import logging
from tqdm.auto import tqdm
import numpy as np

In [None]:
# ==============================================================================
# 1. Set Up Environment and API Token
# ==============================================================================

# Prefer the token stored in Colab Secrets; fall back to an interactive prompt
# when running outside Colab or when the secret cannot be read.
REPLICATE_API_TOKEN = None
try:
    from google.colab import userdata
    REPLICATE_API_TOKEN = userdata.get('REPLICATE_API_TOKEN')
except ImportError:
    # Not running inside Colab: go straight to the interactive prompt below.
    pass
except Exception as e:
    # userdata.get() reports a missing secret or a notebook without access via
    # its own exception classes rather than KeyError, so catch broadly here to
    # make sure the getpass fallback is still reached.
    logging.warning("Gagal membaca token dari Colab Secrets: %s", e)

if REPLICATE_API_TOKEN:
    os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN
    logging.info("Token Replicate berhasil dimuat dari Colab Secrets.")
else:
    logging.error("Token Replicate tidak ditemukan di Colab Secrets. Harap ikuti panduan untuk menambahkannya.")

    try:
        REPLICATE_API_TOKEN = getpass("Harap masukkan Replicate API Token Anda: ")
    except Exception as e:
        raise ValueError(f"Gagal mendapatkan REPLICATE_API_TOKEN: {e}") from e

    # An empty answer would only fail later, at the first model call.
    if not REPLICATE_API_TOKEN or not REPLICATE_API_TOKEN.strip():
        raise ValueError("Gagal mendapatkan REPLICATE_API_TOKEN: token kosong")
    os.environ["REPLICATE_API_TOKEN"] = REPLICATE_API_TOKEN

In [None]:
# ==============================================================================
# 2. Initialize the OpenAI o4-mini Model
# ==============================================================================
model_id = "openai/o4-mini"

# Model tuning parameters, handed to the Replicate wrapper as model_kwargs
llm = Replicate(
    model=model_id,
    model_kwargs=dict(
        temperature=0.5,
        max_new_tokens=150,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.1,
    ),
)
logging.info(f"Model {model_id} berhasil diinisialisasi.")

In [None]:
# ==============================================================================
# 3. Preparing Data
# ==============================================================================
def prepare_data(train_path, test_path):
    """Load the training and testing CSVs and group the tweets per user.

    Args:
        train_path: Path to the training CSV. It must provide the columns
            ``username``, ``full_text`` and ``label``.
        test_path: Path to the testing CSV. It must provide the columns
            ``username`` and ``full_text``.

    Returns:
        A tuple ``(train_grouped, test_grouped)``. ``train_grouped`` has one
        row per username with the list of that user's tweets in ``full_text``
        and the first ``label`` seen for the user. ``test_grouped`` has one
        row per username with the list of tweets in ``full_text``.
        ``(None, None)`` is returned when a file cannot be loaded or a
        required column is missing; the reason is logged.
    """
    try:
        df_train = pd.read_csv(train_path)
        df_test = pd.read_csv(test_path)
    except FileNotFoundError as e:
        logging.error(f"Error: {e}. Pastikan kedua file CSV ada di direktori.")
        return None, None
    except Exception as e:
        logging.error(f"Error saat memuat data: {e}")
        return None, None

    # Header cells may carry stray whitespace; normalise before any lookup.
    df_train.columns = df_train.columns.str.strip()
    df_test.columns = df_test.columns.str.strip()
    logging.info("File training dan testing berhasil dimuat.")

    if 'label' not in df_train.columns:
        logging.error("Kolom 'label' tidak ditemukan di file training.")
        return None, None

    # The grouping below needs these columns in both files; report a missing
    # one through the same (None, None) path instead of raising KeyError.
    for file_kind, frame in (("training", df_train), ("testing", df_test)):
        missing = [col for col in ('username', 'full_text') if col not in frame.columns]
        if missing:
            logging.error("Kolom %s tidak ditemukan di file %s.", missing, file_kind)
            return None, None

    # Group training data: one row per user, tweets collected into a list.
    train_grouped = df_train.groupby('username').agg({
        'full_text': list,
        'label': 'first',
    }).reset_index()

    # Group testing data the same way (no label available).
    test_grouped = df_test.groupby('username')['full_text'].apply(list).reset_index()

    logging.info(f"{len(train_grouped)} pengguna unik di data latih.")
    logging.info(f"{len(test_grouped)} pengguna unik di data uji.")

    return train_grouped, test_grouped

In [None]:
# ==============================================================================
# 4. Create Few-Shot Prompt
# ==============================================================================
def create_few_shot_prompt(train_df, user_to_classify_tweets):
    """Build the few-shot classification prompt for a single user.

    Up to two users with ``label == 1`` followed by up to two users with
    ``label == 0`` are taken from ``train_df`` and rendered as worked
    examples. The tweets in ``user_to_classify_tweets`` are then appended as
    the case the model has to analyse.

    Returns:
        The complete prompt as one string.
    """

    def bulleted(tweets):
        # One "- tweet" line per tweet
        return "\n".join(f"- {tweet}" for tweet in tweets)

    # Labelled examples: anxious users first, then non-anxious ones
    is_anxious = train_df['label'] == 1
    is_not_anxious = train_df['label'] == 0
    shots = pd.concat([train_df[is_anxious].head(2), train_df[is_not_anxious].head(2)])

    example_blocks = []
    for _, shot in shots.iterrows():
        if shot['label'] == 1:
            verdict = "Anxious"
            pattern = "worry and self-doubt."
        else:
            verdict = "Not Anxious"
            pattern = "general daily life topics."
        example_blocks.append(
            "---\n"
            "**Example (Known Classification):**\n"
            "**User's Tweets:**\n"
            f"{bulleted(shot['full_text'])}\n"
            "**Analysis:**\n"
            f"- **Reasoning:** This user's tweets show a pattern of {pattern}\n"
            f"- **Classification:** {verdict}\n"
        )

    # Assemble the final prompt line by line
    prompt_lines = [
        "You are a highly skilled psychological analyst specializing in detecting signs of anxiety from social media text.",
        "Your task is to analyze a collection of recent tweets from a single user and determine if their overall posting pattern indicates potential anxiety.",
        "",
        "**Definitions:**",
        "- **Anxious:** The user's tweets, when viewed together, show a recurring pattern of worry, fear, panic, overthinking, social withdrawal, or direct mentions of an anxiety diagnosis. The sentiment is often negative or tense.",
        "- **Not Anxious:** The user's tweets cover a variety of general topics without a recurring pattern of the anxious traits mentioned above.",
        "",
        "**Instructions:**",
        "1. Carefully read all the provided tweets from the user.",
        "2. Analyze the overall pattern, tone, and topics. Do not judge based on a single tweet.",
        "3. Provide your reasoning in one short sentence.",
        "4. Provide your final classification in the specified format.",
        "",
        "**LEARNING EXAMPLES:**",
        "".join(example_blocks),
        "---",
        "**USER TO CLASSIFY:**",
        "**User's Tweets:**",
        bulleted(user_to_classify_tweets),
        "",
        "**YOUR ANALYSIS:**",
        "- **Reasoning:**",
        "- **Classification:**",
    ]
    return "\n".join(prompt_lines)


In [None]:
# ==============================================================================
# 5. Main Function
# ==============================================================================
def _clean_field(line, marker):
    """Return the text after *marker* in *line*, without Markdown decoration."""
    return line.split(marker, 1)[1].strip(" \t\r*-")


def _parse_model_response(response):
    """Extract ``(reasoning, classification)`` from the raw model reply.

    Missing or empty fields keep the defaults ``"N/A"`` and ``"Unknown"``.
    """
    reasoning = "N/A"
    classification = "Unknown"
    for line in response.strip().split('\n'):
        # The prompt asks for "- **Reasoning:** ..." style lines, so the list
        # dash and bold markers around the value are stripped. An echoed,
        # empty template line must not overwrite a value or the default.
        if "Reasoning:" in line:
            reasoning = _clean_field(line, "Reasoning:") or reasoning
        if "Classification:" in line:
            classification = _clean_field(line, "Classification:") or classification
    return reasoning, classification


def run_project():
    """Classify every user in the testing dataset with the few-shot prompt.

    Loads the grouped training/testing data, builds one prompt per test user,
    sends it to the module-level ``llm`` and parses the reply.

    Returns:
        A ``pandas.DataFrame`` with the columns ``username``,
        ``predicted_condition``, ``reasoning`` and ``tweet_count`` (one row
        per test user), or ``None`` when data preparation failed.
    """
    train_df, test_df = prepare_data(
        '/content/final_training_dataset.csv',
        '/content/final_testing_dataset.csv'
    )

    if train_df is None or test_df is None:
        logging.error("Data preparation failed. Exiting.")
        return None

    results = []
    # Progress bar over the users to classify
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Classifying Accounts"):
        username = row['username']
        tweets = row['full_text']

        print(f"\n---> Menganalisis akun: @{username}...")

        prompt = create_few_shot_prompt(train_df, tweets)

        # One failed API call must not discard the results gathered so far:
        # log it and record the user with the default "Unknown" outcome.
        try:
            response = llm.invoke(prompt)
        except Exception:
            logging.exception("LLM call failed for @%s", username)
            response = ""

        reasoning, classification = _parse_model_response(response)

        results.append({
            'username': username,
            'predicted_condition': classification,
            'reasoning': reasoning,
            'tweet_count': len(tweets)
        })
        print(f"  -> Hasil untuk @{username}: {classification} (Alasan: {reasoning})")

    # Hand the predictions back to the caller; previously they were dropped.
    return pd.DataFrame(
        results,
        columns=['username', 'predicted_condition', 'reasoning', 'tweet_count'],
    )

# Entry point: run the whole pipeline when this module is executed directly.
if __name__ == "__main__":
    run_project()

