Tujuan notebook ini adalah untuk membersihkan dan mengubah data mentah menjadi format yang bersih, terstruktur, dan siap untuk diumpankan ke model AI.

# **Setup dan Import Library**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

# Download the NLTK resources used later in this notebook.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# load the word_tokenize models from that resource and raise LookupError when
# it is missing. NOTE(review): on older releases the extra request is expected
# to just report an unknown package -- confirm against the installed version.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

print("✅ All libraries imported successfully!")
print("🔧 Starting Data Preprocessing Phase...")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


✅ All libraries imported successfully!
🔧 Starting Data Preprocessing Phase...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# **Load Dataset**

In [None]:
from google.colab import files

print("📁 Please upload your AI Assistant Usage dataset (same file as Notebook 1):")
uploaded = files.upload()

# `uploaded` is keyed by file name. Fail with a clear message instead of an
# IndexError when nothing was uploaded (e.g. the dialog was dismissed).
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))
df = pd.read_csv(filename)

print(f"✅ Dataset loaded: {df.shape[0]} rows × {df.shape[1]} columns")

# Display basic info
print("\n📋 Dataset Overview:")
print(df.head())
print(f"\nColumns: {list(df.columns)}")

📁 Please upload your AI Assistant Usage dataset (same file as Notebook 1):


Saving ai_assistant_usage_student_life.csv to ai_assistant_usage_student_life.csv
✅ Dataset loaded: 10000 rows × 11 columns

📋 Dataset Overview:
      SessionID   StudentLevel        Discipline SessionDate  \
0  SESSION00001  Undergraduate  Computer Science  2024-11-03   
1  SESSION00002  Undergraduate        Psychology  2024-08-25   
2  SESSION00003  Undergraduate          Business  2025-01-12   
3  SESSION00004  Undergraduate  Computer Science  2025-05-06   
4  SESSION00005  Undergraduate        Psychology  2025-03-18   

   SessionLengthMin  TotalPrompts  TaskType  AI_AssistanceLevel  \
0             31.20            11  Studying                   2   
1             13.09             6  Studying                   3   
2             19.22             5    Coding                   3   
3              3.70             1    Coding                   3   
4             28.12             9   Writing                   3   

           FinalOutcome  UsedAgain  SatisfactionRating  
0  Assignm

# **Data Cleaning**

Kode ini melakukan proses **pembersihan data** yang mencakup tiga tahap utama:

1. **Menangani nilai hilang**

   * Mengidentifikasi kolom yang memiliki missing values.
   * Mengisi nilai hilang berdasarkan tipe data dan persentasenya:

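     * Data kategorikal/teks diisi dengan *mode* jika nilai hilangnya <10%; jika tidak (atau jika *mode* tidak tersedia), diisi dengan `"Unknown"`.
     * Data numerik diisi dengan *median* jika nilai hilangnya <10%.
     * Kolom numerik dengan nilai hilang ≥10% tidak diisi; kolom tersebut hanya diberi catatan untuk dipertimbangkan dihapus.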

2. **Menghapus duplikat**

   * Menghitung jumlah baris duplikat sebelum dan sesudah pembersihan.
   * Menghapus baris duplikat jika ditemukan.

3. **Optimisasi tipe data**

   * Mengubah kolom bertipe *object* menjadi *category* jika proporsi nilai uniknya kecil (<10%).

Hasil akhir menampilkan bentuk dataset setelah pembersihan, termasuk perubahan jumlah baris, kolom, dan distribusi tipe data.


In [None]:
print("\n" + "="*60)
print("🧹 DATA CLEANING")
print("="*60)

# Shape on entry to this cell, for comparison with the shape printed at the end.
print(f"Original dataset shape: {df.shape}")

# 1. Handle missing values
print("\n❓ Missing Values Handling:")
missing_before = df.isnull().sum().sum()
print(f"Missing values before cleaning: {missing_before}")

# Per-column null counts; only columns with at least one null are processed.
missing_data = df.isnull().sum()
missing_cols = missing_data[missing_data > 0].index.tolist()

if missing_cols:
    print(f"Columns with missing values: {missing_cols}")

    for col in missing_cols:
        missing_count = df[col].isnull().sum()
        missing_percent = (missing_count / len(df)) * 100
        print(f"  - {col}: {missing_count} ({missing_percent:.2f}%)")

        # Every fill below is assigned back to df[col]. Calling
        # fillna(..., inplace=True) on the df[col] selection is a chained
        # assignment: it is deprecated in pandas 2.x and does not update df
        # once copy-on-write is enabled.
        if df[col].dtype == 'object':
            # Text/categorical data: use the mode when fewer than 10% of the
            # values are missing, otherwise an explicit 'Unknown' placeholder.
            mode_val = df[col].mode() if missing_percent < 10 else None
            if mode_val is not None and len(mode_val) > 0:
                df[col] = df[col].fillna(mode_val[0])
                print(f"    → Filled with mode: {mode_val[0]}")
            else:
                df[col] = df[col].fillna('Unknown')
                print("    → Filled with 'Unknown'")
        elif missing_percent < 10:
            # Non-object data: fill with the column median.
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val)
            print(f"    → Filled with median: {median_val}")
        else:
            # Too many gaps to impute: the column is left as-is and only flagged.
            print(f"    → Too many missing values ({missing_percent:.2f}%), consider dropping")

missing_after = df.isnull().sum().sum()
print(f"\nMissing values after cleaning: {missing_after}")

# 2. Handle duplicates
print("\n🔄 Duplicate Handling:")
duplicates_before = df.duplicated().sum()
print(f"Duplicates before: {duplicates_before}")

if duplicates_before > 0:
    # The original index labels are kept, so the index may contain gaps afterwards.
    df = df.drop_duplicates()
    duplicates_after = df.duplicated().sum()
    print(f"Duplicates after: {duplicates_after}")
    print(f"Rows removed: {duplicates_before}")

# 3. Data type optimization
print("\n📊 Data Type Optimization:")
print("Before optimization:")
print(df.dtypes.value_counts())

# Convert object columns with few distinct values (< 10% of the rows) to category.
for col in df.columns:
    if df[col].dtype == 'object':
        unique_ratio = df[col].nunique() / len(df)
        if unique_ratio < 0.1:
            df[col] = df[col].astype('category')
            print(f"  - {col} converted to category ({df[col].nunique()} unique values)")

print("\nAfter optimization:")
print(df.dtypes.value_counts())

print(f"\n✅ Dataset shape after cleaning: {df.shape}")


🧹 DATA CLEANING
Original dataset shape: (10000, 11)

❓ Missing Values Handling:
Missing values before cleaning: 0

Missing values after cleaning: 0

🔄 Duplicate Handling:
Duplicates before: 0

📊 Data Type Optimization:
Before optimization:
float64     2
int64       2
object      1
category    1
category    1
category    1
category    1
category    1
bool        1
Name: count, dtype: int64

After optimization:
float64     2
int64       2
object      1
category    1
category    1
category    1
category    1
category    1
bool        1
Name: count, dtype: int64

✅ Dataset shape after cleaning: (10000, 11)


# **Text Preprocessing**

Kode ini menjalankan **text preprocessing** secara menyeluruh dengan langkah-langkah berikut:

1. **Identifikasi kolom teks**

   * Mencari kolom bertipe *object* atau *category* yang nilai tidak kosong pertamanya berupa teks panjang (>50 karakter), atau yang nama kolomnya mengandung kata kunci seperti `"comment"`, `"feedback"`, `"description"`, `"text"`, atau `"review"`.

2. **Persiapan alat preprocessing**

   * Menggunakan **WordNetLemmatizer** untuk normalisasi kata.
   * Menghapus **stopwords** bahasa Inggris.

3. **Fungsi `preprocess_text`** (proses pembersihan teks):

   * Mengubah teks menjadi huruf kecil.
   * Menghapus URL, email, tag HTML, karakter khusus, angka, dan spasi berlebih.
   * Tokenisasi teks menjadi kata-kata.
   * Menghapus stopwords dan kata pendek (<3 huruf).
   * Melakukan **lemmatization** agar kata kembali ke bentuk dasarnya.

4. **Penerapan preprocessing**

   * Memproses setiap kolom teks menjadi kolom baru dengan akhiran `_processed`.
   * Menampilkan contoh sebelum dan sesudah preprocessing.
   * Menghitung rata-rata panjang teks sebelum dan sesudah pembersihan.

Hasilnya adalah dataset dengan versi teks yang sudah bersih, terstruktur, dan siap untuk analisis lanjutan atau pemodelan NLP.


In [None]:
print("\n" + "="*60)
print("📝 TEXT PREPROCESSING")
print("="*60)

# A column counts as free text when it is object/category typed and either its
# first non-null value is longer than 50 characters or its name contains one
# of the hint words below.
name_hints = ['comment', 'feedback', 'description', 'text', 'review']

text_columns = []
for col in df.columns:
    if df[col].dtype not in ['object', 'category']:
        continue

    non_null_values = df[col].dropna()
    sample_text = "" if non_null_values.empty else str(non_null_values.iloc[0])

    looks_long = len(sample_text) > 50
    name_matches = any(keyword in col.lower() for keyword in name_hints)
    if looks_long or name_matches:
        text_columns.append(col)

print(f"Text columns identified: {text_columns}")

# Shared tools used by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Normalize one raw text value into a cleaned, space-joined token string.

    Steps, in order: lowercase; strip URLs, e-mail addresses and HTML tags;
    replace every non-letter character with a space; collapse whitespace;
    tokenize; drop stopwords and tokens shorter than three characters;
    lemmatize the remaining tokens.

    Removed spans are replaced with a space rather than deleted, so the words
    on either side of a tag or punctuation mark (e.g. "end.</p><p>next") are
    not fused into a single token.

    Relies on the notebook-level ``stop_words`` and ``lemmatizer`` objects.

    Args:
        text: A scalar value; anything non-missing is converted with ``str()``.

    Returns:
        The cleaned text, or "" when ``text`` is missing (``pd.isna``).
    """
    if pd.isna(text):
        return ""

    # Convert to string and lowercase
    text = str(text).lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text, flags=re.MULTILINE)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', ' ', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # Replace special characters and digits with a space (keeps word boundaries)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Collapse the whitespace introduced above
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize
    tokens = word_tokenize(text)

    # Drop stopwords and short words first, then lemmatize what is left
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    )

# Run preprocess_text over every detected text column, storing the result in a
# sibling "<column>_processed" column and printing a before/after sample.
for col in text_columns:
    print(f"\n🔧 Preprocessing text column: {col}")

    # Index label of the first non-null entry (None when the column is all null)
    non_null_values = df[col].dropna()
    sample_idx = None if non_null_values.empty else non_null_values.index[0]

    print("Sample before preprocessing:")
    if sample_idx is not None:
        print(f"  Original: {df.loc[sample_idx, col][:200]}...")

    # Cleaned copy of the column
    preprocessed_col = f"{col}_processed"
    df[preprocessed_col] = df[col].apply(preprocess_text)

    if sample_idx is not None:
        print(f"  Processed: {df.loc[sample_idx, preprocessed_col][:200]}...")

    # Mean character length before (non-null values only) and after cleaning
    avg_length_before = non_null_values.astype(str).str.len().mean()
    avg_length_after = df[preprocessed_col].str.len().mean()
    print(f"  Average length before: {avg_length_before:.1f} characters")
    print(f"  Average length after: {avg_length_after:.1f} characters")


📝 TEXT PREPROCESSING
Text columns identified: []


# **Feature Engineering**

Kode ini menjalankan **feature engineering** dengan beberapa tahap utama:

1. **Encoding variabel kategorikal**

   * Mengambil kolom kategorikal (`object`/`category`) yang bukan kolom teks.
   * Jika jumlah kategori ≤ 10 → **One-hot encoding**.
   * Jika jumlah kategori > 10 → **Label encoding**.

2. **Scaling variabel numerik**

   * Mengambil semua kolom numerik (`int64`, `float64`).
   * Menggunakan **StandardScaler** untuk standarisasi (mean=0, std=1).

3. **Ekstraksi fitur teks**

   * Menggunakan versi teks yang sudah diproses (`_processed`).
   * Membuat fitur **TF-IDF** (max 100 fitur, unigram + bigram, batas frekuensi min 2 dokumen, max 80%).

4. **Membuat matriks fitur akhir**

   * Menggabungkan semua fitur hasil encoding, scaling, dan ekstraksi teks menjadi satu DataFrame (`X_processed`).
   * Menampilkan jumlah fitur akhir (baris × kolom).

Hasil akhirnya adalah **dataset terstandardisasi dan terenkode** yang siap digunakan untuk pemodelan machine learning.


In [None]:
print("\n" + "="*60)
print("⚙️ FEATURE ENGINEERING")
print("="*60)

# 1. Categorical Encoding
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
# Leave out the raw text columns AND their "<col>_processed" counterparts:
# both are handled by the TF-IDF step below and must not be encoded as categories.
processed_text_cols = {f"{col}_processed" for col in text_columns}
categorical_cols = [
    col for col in categorical_cols
    if col not in text_columns and col not in processed_text_cols
]

print(f"\n🏷️ Categorical columns for encoding: {categorical_cols}")

# Encoded frames keyed by source column; every frame shares df's index so the
# column-wise concat at the end aligns row by row.
encoded_features = {}

for col in categorical_cols:
    print(f"\nEncoding {col}:")
    unique_values = df[col].nunique()
    print(f"  Unique values: {unique_values}")

    if unique_values <= 10:
        # One-hot encoding for low cardinality
        encoded = pd.get_dummies(df[col], prefix=col)
        encoded_features[col] = encoded
        print(f"  → One-hot encoded ({encoded.shape[1]} features)")
    else:
        # Label encoding for high cardinality.
        # NOTE(review): identifier-like columns (one distinct value per row)
        # also land here; consider excluding them from the feature set.
        le = LabelEncoder()
        encoded = le.fit_transform(df[col].astype(str))
        # index=df.index is required: with the default RangeIndex this frame
        # would misalign against the others whenever df's index has gaps
        # (e.g. after drop_duplicates), and the concat would introduce NaN rows.
        encoded_df = pd.DataFrame({f"{col}_encoded": encoded}, index=df.index)
        encoded_features[col] = encoded_df
        print(f"  → Label encoded (1 feature)")

# 2. Numerical Feature Scaling
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(f"\n🔢 Numerical columns for scaling: {numerical_cols}")

# Stays an empty dict when there is nothing to scale; later code checks len().
scaled_features = {}
if numerical_cols:
    # NOTE(review): the scaler is fitted on the full dataset, i.e. before any
    # train/test split, so test rows contribute to the mean/std used here.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df[numerical_cols])
    scaled_features = pd.DataFrame(
        scaled_data,
        columns=[f"{col}_scaled" for col in numerical_cols],
        index=df.index
    )
    print(f"  → {len(numerical_cols)} numerical features scaled")

# 3. Text Feature Extraction
text_features = {}
for col in text_columns:
    processed_col = f"{col}_processed"
    if processed_col not in df.columns:
        continue

    print(f"\n📄 Text feature extraction for {col}:")

    # TF-IDF Features
    tfidf = TfidfVectorizer(
        max_features=100,  # Limit features for demonstration
        ngram_range=(1, 2),  # Unigrams and bigrams
        min_df=2,  # Ignore terms that appear in less than 2 documents
        max_df=0.8  # Ignore terms that appear in more than 80% of documents
    )

    try:
        tfidf_features = tfidf.fit_transform(df[processed_col])
    except ValueError as e:
        # Raised by the vectorizer when no usable vocabulary remains
        # (e.g. every document is empty after preprocessing); skip the column.
        print(f"  ⚠️ TF-IDF failed: {str(e)}")
    else:
        feature_names = [f"{col}_tfidf_{name}" for name in tfidf.get_feature_names_out()]
        tfidf_df = pd.DataFrame(
            tfidf_features.toarray(),
            columns=feature_names,
            index=df.index
        )
        text_features[f"{col}_tfidf"] = tfidf_df
        print(f"  → TF-IDF features: {tfidf_df.shape[1]}")

# 4. Create Final Feature Matrix
print("\n🔨 Creating Final Feature Matrix:")

# NOTE: the matrix contains the encodings of every categorical column,
# including any column that is later chosen as a prediction target.
feature_dfs = []

# Add encoded categorical features
for col, features in encoded_features.items():
    feature_dfs.append(features)
    print(f"  + {col}: {features.shape[1]} features")

# Add scaled numerical features
if len(scaled_features) > 0:
    feature_dfs.append(scaled_features)
    print(f"  + Numerical (scaled): {scaled_features.shape[1]} features")

# Add text features
for col, features in text_features.items():
    feature_dfs.append(features)
    print(f"  + {col}: {features.shape[1]} features")

# Combine all features column-wise (rows are aligned on df's index)
if feature_dfs:
    X_processed = pd.concat(feature_dfs, axis=1)
    print(f"\n✅ Final feature matrix: {X_processed.shape[0]} rows × {X_processed.shape[1]} features")
else:
    X_processed = df[numerical_cols] if numerical_cols else pd.DataFrame()
    print(f"\n⚠️ Using original numerical features: {X_processed.shape}")


⚙️ FEATURE ENGINEERING

🏷️ Categorical columns for encoding: ['SessionID', 'StudentLevel', 'Discipline', 'SessionDate', 'TaskType', 'FinalOutcome']

Encoding SessionID:
  Unique values: 10000
  → Label encoded (1 feature)

Encoding StudentLevel:
  Unique values: 3
  → One-hot encoded (3 features)

Encoding Discipline:
  Unique values: 7
  → One-hot encoded (7 features)

Encoding SessionDate:
  Unique values: 366
  → Label encoded (1 feature)

Encoding TaskType:
  Unique values: 6
  → One-hot encoded (6 features)

Encoding FinalOutcome:
  Unique values: 4
  → One-hot encoded (4 features)

🔢 Numerical columns for scaling: ['SessionLengthMin', 'TotalPrompts', 'AI_AssistanceLevel', 'SatisfactionRating']
  → 4 numerical features scaled

🔨 Creating Final Feature Matrix:
  + SessionID: 1 features
  + StudentLevel: 3 features
  + Discipline: 7 features
  + SessionDate: 1 features
  + TaskType: 6 features
  + FinalOutcome: 4 features
  + Numerical (scaled): 4 features

✅ Final feature matrix: 

# **Target Variable Prep**

Kode ini menyiapkan **kolom target** untuk pemodelan dengan langkah-langkah yang sederhana:

1. **Mencari kolom yang cocok jadi target**

   * Mencari kolom yang namanya mengandung kata seperti *category*, *type*, *class*, *usage*, *purpose*, atau *label*.
   * Kalau tidak ada, mencari kolom kategorikal yang jumlah kategorinya antara 2–10.

2. **Menampilkan kandidat target**

   * Menunjukkan daftar kolom yang bisa dijadikan target prediksi.

3. **Mengubah target jadi angka (encoding)**

   * Menggunakan **LabelEncoder** untuk mengganti setiap kategori dengan angka unik.
   * Menyimpan encoder ini supaya bisa digunakan lagi nanti.

Hasil akhirnya: kolom target sudah dalam bentuk angka sehingga siap dipakai untuk algoritma machine learning.


In [None]:
print("\n" + "="*60)
print("🎯 TARGET VARIABLE PREPARATION")
print("="*60)

# Candidate targets: first any column whose name contains one of these keywords.
target_keywords = ['category', 'type', 'class', 'usage', 'purpose', 'label']
potential_targets = [
    col for col in df.columns
    if any(keyword in col.lower() for keyword in target_keywords)
]

if not potential_targets:
    # No name match: fall back to categorical columns with 2-10 distinct values.
    potential_targets = [
        col for col in categorical_cols
        if 2 <= df[col].nunique() <= 10
    ]

print(f"Potential target variables: {potential_targets}")

# Integer-encoded targets and the fitted encoders, both keyed by column name.
target_encoders = {}
y_encoded = {}

for target_col in potential_targets:
    print(f"\n🎯 Preparing target: {target_col}")
    print(f"  Classes: {df[target_col].value_counts().to_dict()}")

    # Encode target variable (values are compared as strings)
    le = LabelEncoder()
    y_encoded[target_col] = le.fit_transform(df[target_col].astype(str))
    target_encoders[target_col] = le

    # A class's code is its position in le.classes_, so enumerate() yields the
    # same mapping as le.transform(le.classes_) but with plain ints, which
    # keeps the printed dict free of np.int64(...) wrappers.
    class_codes = {cls: code for code, cls in enumerate(le.classes_)}
    print(f"  Encoded classes: {class_codes}")



🎯 TARGET VARIABLE PREPARATION
Potential target variables: ['TaskType']

🎯 Preparing target: TaskType
  Classes: {'Writing': 3101, 'Studying': 2040, 'Homework Help': 1959, 'Coding': 1948, 'Brainstorming': 476, 'Research': 476}
  Encoded classes: {'Brainstorming': np.int64(0), 'Coding': np.int64(1), 'Homework Help': np.int64(2), 'Research': np.int64(3), 'Studying': np.int64(4), 'Writing': np.int64(5)}


# **Train Test Split**

Kode ini membagi data menjadi **data latih** dan **data uji** dengan langkah-langkah berikut:

1. **Menentukan target utama**

   * Mengambil target pertama dari daftar kandidat yang sudah ditemukan sebelumnya untuk dipakai dalam pemisahan data.

2. **Membagi data (train-test split)**

   * Memisahkan fitur (`X_processed`) dan target (`y_encoded`) menjadi:

     * **Data latih (80%)** – untuk melatih model.
     * **Data uji (20%)** – untuk menguji performa model.
   * Menggunakan parameter `stratify` agar proporsi tiap kelas di train dan test tetap sama dengan proporsi kelas pada data keseluruhan.

3. **Menampilkan informasi pembagian data**

   * Jumlah sampel di data latih dan data uji.
   * Jumlah fitur yang digunakan.
   * Distribusi kelas target pada kedua subset.

4. **Jika tidak ada target**

   * Tetap membagi fitur menjadi train dan test, tapi tanpa target variabel.


In [None]:
print("\n" + "="*60)
print("📊 TRAIN-TEST SPLIT")
print("="*60)

if len(potential_targets) > 0:
    # Use first target for splitting
    main_target = potential_targets[0]
    print(f"Using {main_target} as main target for splitting")

    # X_processed was built from every column of df, so it still holds the
    # encodings of the target itself. Those columns must be removed before
    # splitting; otherwise a model could read the label straight from X.
    target_derived = {main_target, f"{main_target}_scaled"}
    if main_target in encoded_features:
        target_derived.update(encoded_features[main_target].columns)
    tfidf_key = f"{main_target}_tfidf"
    if tfidf_key in text_features:
        target_derived.update(text_features[tfidf_key].columns)

    leak_cols = [c for c in X_processed.columns if c in target_derived]
    if leak_cols:
        print(f"Excluding {len(leak_cols)} target-derived feature(s): {leak_cols}")
    X_features = X_processed.drop(columns=leak_cols)

    # Stratified 80/20 split: class proportions are preserved in both subsets.
    X_train, X_test, y_train, y_test = train_test_split(
        X_features,
        y_encoded[main_target],
        test_size=0.2,
        random_state=42,
        stratify=y_encoded[main_target]
    )

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")
    print(f"Features: {X_train.shape[1]}")

    # Show target distribution (encoded class -> row count)
    train_dist = pd.Series(y_train).value_counts().sort_index()
    test_dist = pd.Series(y_test).value_counts().sort_index()

    print("\nTarget distribution:")
    print("Train:", train_dist.to_dict())
    print("Test: ", test_dist.to_dict())

else:
    # No target available: split the features only.
    print("⚠️ No suitable target variable found for classification")
    X_train, X_test = train_test_split(X_processed, test_size=0.2, random_state=42)
    y_train = y_test = None


📊 TRAIN-TEST SPLIT
Using TaskType as main target for splitting
Training set: 8000 samples
Test set: 2000 samples
Features: 26

Target distribution:
Train: {0: 381, 1: 1558, 2: 1567, 3: 381, 4: 1632, 5: 2481}
Test:  {0: 95, 1: 390, 2: 392, 3: 95, 4: 408, 5: 620}


# **Data Validation**

Kode ini melakukan **validasi akhir data** sebelum digunakan untuk pemodelan, dengan langkah-langkah:

1. **Pemeriksaan nilai tak hingga (infinite values)**

   * Mengecek apakah ada nilai `∞` atau `-∞` di fitur numerik.
   * Jika ada, nilainya diganti menjadi `NaN` lalu diisi dengan `0`.

2. **Pemeriksaan nilai kosong (NaN)**

   * Mengecek apakah ada nilai `NaN` di data yang sudah diproses.
   * Jika ditemukan, semua `NaN` diisi dengan `0`.

3. **Ringkasan statistik fitur**

   * Menampilkan ukuran matriks fitur (`shape`).
   * Menghitung penggunaan memori.
   * Menyajikan jumlah masing-masing tipe data yang digunakan.


In [None]:
print("\n" + "="*60)
print("✅ DATA VALIDATION")
print("="*60)

# Sanity checks on the assembled feature matrix
print("Final data validation:")

# 1. Infinite values (numeric columns only)
numeric_part = X_processed.select_dtypes(include=[np.number])
has_infinite = numeric_part.isin([np.inf, -np.inf]).any().any()
if not has_infinite:
    print("✅ No infinite values in features")
else:
    print("⚠️ Infinite values found in features")
    # Turn +/-inf into NaN first, then zero-fill every NaN in the matrix
    X_processed = X_processed.replace([np.inf, -np.inf], np.nan).fillna(0)
    print("  → Infinite values replaced with 0")

# 2. Remaining NaN values
nan_mask = X_processed.isnull()
if not nan_mask.any().any():
    print("✅ No NaN values in processed features")
else:
    print("⚠️ NaN values found in processed features")
    nan_count = nan_mask.sum().sum()
    print(f"  → {nan_count} NaN values found and filled with 0")
    X_processed = X_processed.fillna(0)

# 3. Shape, memory footprint and dtype breakdown of the final matrix
memory_mb = X_processed.memory_usage(deep=True).sum() / 1024**2
dtype_counts = X_processed.dtypes.value_counts().to_dict()
print(f"\nFeature matrix statistics:")
print(f"  Shape: {X_processed.shape}")
print(f"  Memory usage: {memory_mb:.2f} MB")
print(f"  Data types: {dtype_counts}")



✅ DATA VALIDATION
Final data validation:
✅ No infinite values in features
✅ No NaN values in processed features

Feature matrix statistics:
  Shape: (10000, 26)
  Memory usage: 0.65 MB
  Data types: {dtype('bool'): 20, dtype('float64'): 4, dtype('int64'): 2}


# **Save Processed Data**

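Kode ini merangkum hasil akhir preprocessing data. Catatan: sel ini hanya mencetak ringkasan; tidak ada file yang ditulis ke disk, sehingga data hasil olahan tetap berada di memori notebook.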

* **Membuat ringkasan preprocessing** yang berisi:

  * **Bentuk data asli dan akhir** (jumlah baris × kolom sebelum dan sesudah pemrosesan).
  * **Kolom teks, kategorikal, dan numerikal** yang digunakan.
  * **Variabel target** yang teridentifikasi.
  * **Jumlah fitur baru** yang dihasilkan dari encoding kategorikal, scaling numerikal, dan ekstraksi teks, beserta total fitur akhir.

* **Menampilkan status akhir**:

  * Konfirmasi bahwa data siap untuk modeling, lengkap dengan jumlah fitur yang tersedia.
  * Menunjukkan target klasifikasi yang siap digunakan, jika ada.
  * Menyebutkan data teks yang siap dipakai untuk summarization.

* **Memberikan panduan langkah selanjutnya**, yaitu:

  1. Analisis klasifikasi.
  2. Ringkasan teks.
  3. Evaluasi model.
  4. Pembuatan dashboard visualisasi.

* **Informasi matriks fitur terakhir**:

  * Jumlah sampel dan fitur.
  * Nama variabel target utama beserta jumlah kelasnya.


In [None]:
print("\n" + "="*60)
print("💾 SAVE PROCESSED DATA")
print("="*60)

# Feature counts per source (column totals of the intermediate frames)
encoded_feature_total = sum(frame.shape[1] for frame in encoded_features.values())
text_feature_total = sum(frame.shape[1] for frame in text_features.values())
scaled_feature_total = scaled_features.shape[1] if len(scaled_features) > 0 else 0

# Summary of what the preprocessing steps produced.
# 'original_shape' reports df as it is at this point in the notebook.
preprocessing_summary = {
    'original_shape': df.shape,
    'final_shape': X_processed.shape,
    'text_columns': text_columns,
    'categorical_columns': categorical_cols,
    'numerical_columns': numerical_cols,
    'target_variables': potential_targets,
    'features_created': {
        'encoded_categorical': encoded_feature_total,
        'scaled_numerical': scaled_feature_total,
        'text_features': text_feature_total,
        'total_features': X_processed.shape[1]
    }
}

print("Preprocessing Summary:")
for key in preprocessing_summary:
    print(f"  {key}: {preprocessing_summary[key]}")

# Status report only: nothing is written to disk in this cell
n_samples, n_features = X_processed.shape[0], X_processed.shape[1]
print(f"\n✅ Data preprocessing completed!")
print(f"📊 Ready for modeling with {n_features} features")

if potential_targets:
    print(f"🎯 Classification targets available: {potential_targets}")
if text_columns:
    print(f"📝 Text data ready for summarization: {text_columns}")

print("\n" + "="*60)
print("🚀 NEXT STEPS")
print("="*60)
for step in (
    "1. Classification Analysis (Notebook 3)",
    "2. Text Summarization (Notebook 4)",
    "3. Model Evaluation (Notebook 5)",
    "4. Visualization Dashboard (Notebook 6)",
):
    print(step)

# Final feature/target overview
print(f"\n📋 Feature Matrix Ready: {n_samples} samples × {n_features} features")
if potential_targets:
    n_classes = len(np.unique(y_encoded[main_target]))
    print(f"🎯 Target Variable Ready: {main_target} ({n_classes} classes)")


💾 SAVE PROCESSED DATA
Preprocessing Summary:
  original_shape: (10000, 11)
  final_shape: (10000, 26)
  text_columns: []
  categorical_columns: ['SessionID', 'StudentLevel', 'Discipline', 'SessionDate', 'TaskType', 'FinalOutcome']
  numerical_columns: ['SessionLengthMin', 'TotalPrompts', 'AI_AssistanceLevel', 'SatisfactionRating']
  target_variables: ['TaskType']
  features_created: {'encoded_categorical': 22, 'scaled_numerical': 4, 'text_features': 0, 'total_features': 26}

✅ Data preprocessing completed!
📊 Ready for modeling with 26 features
🎯 Classification targets available: ['TaskType']

🚀 NEXT STEPS
1. Classification Analysis (Notebook 3)
2. Text Summarization (Notebook 4)
3. Model Evaluation (Notebook 5)
4. Visualization Dashboard (Notebook 6)

📋 Feature Matrix Ready: 10000 samples × 26 features
🎯 Target Variable Ready: TaskType (6 classes)
