## Temp (Anything)

### Checking Missing value/ Nan dari csv

In [1]:
import pandas as pd

# Load file dengan index asli
cleaned_text_path = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/Cleaned_Text.csv'
df = pd.read_csv(cleaned_text_path, index_col=0)

# Cari baris yang kosong/NaN
mask_empty = df["clean_text"].isna() | (df["clean_text"].astype(str).str.strip() == "")
empty_indices = df.index[mask_empty].tolist()

print(f"Jumlah row kosong/NaN: {len(empty_indices)}")
print("Index yang kosong/NaN:", empty_indices)

# Drop baris yang kosong
df_cleaned = df[~mask_empty].copy()

print(f"/nJumlah row setelah drop: {len(df_cleaned)}")

# Simpan kembali (overwrite file lama atau simpan sebagai file baru)
#output_path = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/Cleaned_Text.csv'
#df_cleaned.to_csv(output_path, index=True)
#print(f"✅ File baru disimpan di: {output_path}")


Jumlah row kosong/NaN: 0
Index yang kosong/NaN: []
/nJumlah row setelah drop: 16305


## Formatting Dataset csv Cleaned
disini buat coba formatting kolom age_account, created_at, sama followers_count

#### Load Dataset

In [None]:
import pandas as pd
import re

# Path file
path = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/Cleaned_Text.csv'
print(f"Loading data from: {path}")
df = pd.read_csv(path)
print("Data loaded successfully.")



Loading data from: E:/$7th/TA/Eksploring_TF-IDF/DATA/Cleaned_Text.csv
Data loaded successfully.


#### Cleaning created_at

In [6]:
def clean_created_at(df, col="created_at"):
    # coba parsing otomatis semua format
    df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True, utc=True)
    # hilangkan timezone biar rapi
    df[col] = df[col].dt.tz_localize(None)
    # format ulang
    df[col] = df[col].dt.strftime("%Y-%m-%d %H:%M:%S")
    return df

df = clean_created_at(df)
print(df[["created_at"]].head(10))


            created_at
0  2025-01-02 14:32:02
1  2025-02-25 10:00:56
2  2025-02-28 08:56:27
3  2025-03-20 13:07:10
4  2025-02-01 14:11:09
5  2025-04-08 14:55:24
6  2025-03-02 06:07:20
7  2025-04-18 03:14:37
8  2025-02-05 08:14:33
9  2025-04-05 12:57:37


  df[col] = pd.to_datetime(df[col], errors="coerce", infer_datetime_format=True, utc=True)


#### Cleaning age_account

In [4]:
def convert_age_to_months(text):
    if pd.isna(text):
        return None
    match = re.findall(r'(/d+)', str(text))
    if not match:
        return None
    tahun = int(match[0]) if len(match) > 0 else 0
    bulan = int(match[1]) if len(match) > 1 else 0
    return tahun * 12 + bulan

def clean_age_account(df, col="age_account"):
    df[col] = df[col].apply(convert_age_to_months)
    return df

df = clean_age_account(df)
print(df[["age_account"]].head())


   age_account
0           80
1          140
2           20
3          165
4          116


#### Cleaning followers_count

In [5]:
def clean_followers_count(df, col="followers_count"):
    df[col] = df[col].fillna(0).astype(int)
    return df

df = clean_followers_count(df)
print(df[["followers_count"]].head())


   followers_count
0            91600
1              695
2              133
3              353
4           128000


#### Save hasil

In [None]:
output_path = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/Cleaned_Text.csv'
df.to_csv(output_path, index=False)
print("✅ Data berhasil dibersihkan & disimpan ke:", output_path)


✅ Data berhasil dibersihkan & disimpan ke: E:/$7th/TA/Eksploring_TF-IDF/DATA/Cleaned_Text.csv


## Making New Master DataFrame
Ini buat nambahin kolom image_path ke csv. Isi image_path adalah path local dari gambar tujuannya biar mudah train_split_test

In [None]:
# ===== MEMBUAT DATAFRAME MASTER =====

import os
import pandas as pd

def create_and_save_master_dataframe(csv_path, img_dir, output_path):
    """
    Menggabungkan data teks dari CSV dengan path file gambar, memverifikasi
    keberadaan gambar, dan menyimpan hasilnya ke file CSV baru.
    
    Args:
        csv_path (str): Path ke file CSV berisi data teks.
        img_dir (str): Path ke folder berisi file gambar.
        output_path (str): Path untuk menyimpan DataFrame master.
        
    Returns:
        pd.DataFrame: DataFrame master yang telah dibuat.
    """
    print("--- Memulai Proses Pembuatan DataFrame Master ---")
    
    # 1. Muat DataFrame Teks
    try:
        df = pd.read_csv(csv_path, index_col=0)
        print(f"Dataset teks berhasil dimuat. Jumlah baris awal: {len(df)}")
    except FileNotFoundError:
        print(f"ERROR: File tidak ditemukan di '{csv_path}'. Proses dihentikan.")
        return None

    # 2. Buat Kolom Path Gambar
    print("Menambahkan path gambar ke DataFrame...")
    df['image_path'] = df.index.to_series().apply(lambda idx: os.path.join(img_dir, f"{idx}.jpg"))

    # 3. Verifikasi Keberadaan File Gambar
    print("Memverifikasi keberadaan file gambar...")
    df['image_exists'] = df['image_path'].apply(os.path.exists)
    missing_images_count = (df['image_exists'] == False).sum()

    if missing_images_count > 0:
        print(f"/nPERINGATAN: Ditemukan {missing_images_count} baris yang tidak memiliki file gambar.")
        df = df[df['image_exists'] == True].copy() # Filter dan buat salinan
        print(f"Baris tanpa gambar telah dihapus. Jumlah baris sekarang: {len(df)}")
    else:
        print("✅ Semua baris memiliki file gambar yang sesuai.")

    # Hapus kolom verifikasi karena sudah tidak diperlukan
    df = df.drop(columns=['image_exists'])

    # 4. Simpan DataFrame Master
    try:
        df.to_csv(output_path, index=True) # index=True untuk menyimpan index unik
        print(f"/nDataFrame Master berhasil disimpan di: {output_path}")
    except Exception as e:
        print(f"ERROR: Gagal menyimpan file. Error: {e}")
        return None
        
    print("--- Proses Selesai ---")
    return df

# --- Jalankan Fungsi ---
csv_input_path = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/Cleaned_Text.csv'
image_input_dir = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/image_url'
master_output_path = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/DataFrame_Master.csv'

master_df = create_and_save_master_dataframe(csv_input_path, image_input_dir, master_output_path)


--- Memulai Proses Pembuatan DataFrame Master ---
Dataset teks berhasil dimuat. Jumlah baris awal: 16305
Menambahkan path gambar ke DataFrame...
Memverifikasi keberadaan file gambar...
✅ Semua baris memiliki file gambar yang sesuai.

DataFrame Master berhasil disimpan di: E:/$7th/TA/Eksploring_TF-IDF/DATA/DataFrame_Master.csv
--- Proses Selesai ---


#### Split Data

In [None]:
# ===== CELL 2: SPLIT DATA LATIH DAN DATA UJI =====

import pandas as pd
from sklearn.model_selection import train_test_split

def split_data(master_df_path):
    """
    Memuat DataFrame master dan membaginya menjadi set data latih dan uji.
    
    Args:
        master_df_path (str): Path ke file DataFrame_Master.csv.
        
    Returns:
        tuple: Berisi (train_df, test_df).
    """
    print("/n--- Memulai Proses Split Data ---")
    
    # 1. Muat DataFrame Master
    try:
        df = pd.read_csv(master_df_path, index_col=0)
        print(f"DataFrame Master berhasil dimuat. Total baris: {len(df)}")
    except FileNotFoundError:
        print(f"ERROR: File tidak ditemukan di '{master_df_path}'. Pastikan Cell 1 sudah dijalankan.")
        return None, None

    # 2. Pisahkan Fitur (X) dan Target (y)
    X = df
    y = df['Label']

    # 3. Lakukan Split Data dengan Stratifikasi
    print("Melakukan split data (80% train, 20% test)...")
    train_df, test_df = train_test_split(X, test_size=0.2, random_state=42, stratify=y)

    # 4. Tampilkan Ringkasan Hasil Split
    print("/n" + "="*40)
    print("      PROSES SPLIT DATA SELESAI")
    print("="*40)
    print(f"/nUkuran Data Latih (Train): {train_df.shape}")
    print(f"Ukuran Data Uji (Test): {test_df.shape}")
    print("/nDistribusi Label pada Data Latih (proporsi):")
    print(train_df['Label'].value_counts(normalize=True))
    print("/nDistribusi Label pada Data Uji (proporsi):")
    print(test_df['Label'].value_counts(normalize=True))
    
    print("/n--- Proses Selesai ---")
    return train_df, test_df

# --- Jalankan Fungsi ---
master_file_path = r'/home/spil/1Bagus/BACKUP/TA/Multimodal_Process_Exploration/DATA/DataFrame_Master.csv'
train_df, test_df = split_data(master_file_path)

if train_df is not None:
    print("/nContoh 5 baris pertama dari Data Latih (train_df):")
    print(train_df.head())


--- Memulai Proses Split Data ---
DataFrame Master berhasil dimuat. Total baris: 16305
Melakukan split data (80% train, 20% test)...

      PROSES SPLIT DATA SELESAI

Ukuran Data Latih (Train): (13044, 5)
Ukuran Data Uji (Test): (3261, 5)

Distribusi Label pada Data Latih (proporsi):
Label
0    0.699632
1    0.300368
Name: proportion, dtype: float64

Distribusi Label pada Data Uji (proporsi):
Label
0    0.699479
1    0.300521
Name: proportion, dtype: float64

--- Proses Selesai ---

Contoh 5 baris pertama dari Data Latih (train_df):
                                              full_text  /
9856  @Franken_blues George Soros yg jahat banget ba...   
7567  pergerakan IHSG 3 hari terakhir dibuka down me...   
727   Ramadan penuh berkah saatnya nonton Shopee Liv...   
4934  Rupiah menguat gaes... . . . . . . . . . . . ....   
9659  Penutupan sesi 2 Jumat 13 Juni 2025 IHSG 7 166...   

                                             clean_text  image_corelation  /
9856  george soros jahat ban