In [1]:
# Data handling
import pandas as pd
import numpy as np
import re

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Save model
import joblib

In [2]:
# Load dataset
df = pd.read_csv("../data/labeled_thesis.csv", sep=";")

# Lihat info dasar
print(df.shape)
print(df.columns)
df.head() # Cek 5 data pertama dari awal

(193, 5)
Index(['File', 'Judul', 'Abstrak', 'Kata Kunci', 'topic_label'], dtype='object')


Unnamed: 0,File,Judul,Abstrak,Kata Kunci,topic_label
0,05311840000027-Undergraduate_Thesis - Fancista...,DESAIN ANTARMUKA APLIKASI UNTUK MEMBANTU PROSE...,Pindahan rumah merupakan aktivitas yang melela...,"aplikasi, desain antarmuka, design thinking, p...",UI/UX Design & HCI
1,05311840000033_Made Krisnanda Utama_BukuTA - K...,EVALUASI TATA KELOLA TEKNOLOGI INFORMASI DALAM...,EVALUASI TATA KELOLA TEKNOLOGI INFORMASI DALAM...,"Evaluasi, Keamanan Informasi, ITIL versi 4, Ru...",Data Science & Analytics
2,05311840000035-Undergraduate_Thesis - Agung Mu...,PERANCANGAN SISTEM PENGAMAN SEPEDA MOTOR BERBA...,Kasus pencurian kendaraan bermotor (curanmor) ...,"Sepeda Motor, RFID, e-KTP, GPS Neo 6M, NodeMCU...",IoT & Embedded Systems
3,05311840000040_Undergraduate_Thesis - Ghifari ...,PREDIKSI HARGA SAHAM BERDASARKAN ALGORITMA HID...,Saham adalah salah satu bagian instrumen terpe...,"financial market, bullish, bearish, hidden mar...",Machine Learning & AI
4,05311840000043_Justin Alfonsius_Sitanggang_Buk...,Pembuatan sistem asset tracking berbasis Globa...,Aset berharga karena dapat menghasilkan pendap...,"Aset, asset tracking, Global Positioning Syste...",Aplikasi Web & Mobile


In [3]:
# DATA PREPROCESS

In [4]:
# sudah ada beberapa import, tapi ulang di sini untuk memastikan
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder

# opsional: jika ingin pakai stopword/stemming Bahasa Indonesia
# install jika belum: !pip install Sastrawi nltk
import nltk
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [5]:
# gabungkan
df['text'] = df['Judul'].astype(str) + " " + df['Abstrak'].astype(str) + " " + df['Kata Kunci'].astype(str)

# ringkasan panjang teks
df['char_len'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().apply(len)

print("Panjang teks (chars) - summary:")
print(df['char_len'].describe())
print("\nWord count - summary:")
print(df['word_count'].describe())

# lihat beberapa contoh acak
df[['Judul','text']].sample(5, random_state=42)

Panjang teks (chars) - summary:
count     193.000000
mean     1879.062176
std       794.962136
min       144.000000
25%      1755.000000
50%      2033.000000
75%      2378.000000
max      4711.000000
Name: char_len, dtype: float64

Word count - summary:
count    193.000000
mean     246.362694
std      106.410139
min       14.000000
25%      230.000000
50%      268.000000
75%      312.000000
max      654.000000
Name: word_count, dtype: float64


Unnamed: 0,Judul,text
45,PERANCANGAN DAN IMPLEMENTASI “SECURED SMART HOME,PERANCANGAN DAN IMPLEMENTASI “SECURED SMART HO...
137,ANALISIS KINERJA LOAD BALANCING MENGGUNAKAN DR...,ANALISIS KINERJA LOAD BALANCING MENGGUNAKAN DR...
76,PROTOTIPE SISTEM IRIGASI CERDAS BERDASARKAN CR...,PROTOTIPE SISTEM IRIGASI CERDAS BERDASARKAN CR...
144,IMPLEMENTASI MODEL KOMPUTASI,IMPLEMENTASI MODEL KOMPUTASI Seiring dengan ke...
113,IMPLEMENTASI CHATBOT KESEHATAN,IMPLEMENTASI CHATBOT KESEHATAN IMPLEMENTASI CH...


In [6]:
# Cleaning Data
def clean_basic(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("\n"," ").replace("\r"," ")
    text = re.sub(r'http\S+', ' ', text)          # hapus URL
    text = re.sub(r'\S+@\S+', ' ', text)          # hapus email
    text = re.sub(r'[^0-9A-Za-z\s]', ' ', text)   # sisakan huruf + angka + spasi
    text = re.sub(r'\s+', ' ', text).strip()      # rapikan spasi
    text = text.lower()
    return text

df['text_clean_basic'] = df['text'].apply(clean_basic)

# contoh perbandingan sebelum & sesudah
display(df[['text']].head(3))
display(df[['text_clean_basic']].head(3))

Unnamed: 0,text
0,DESAIN ANTARMUKA APLIKASI UNTUK MEMBANTU PROSE...
1,EVALUASI TATA KELOLA TEKNOLOGI INFORMASI DALAM...
2,PERANCANGAN SISTEM PENGAMAN SEPEDA MOTOR BERBA...


Unnamed: 0,text_clean_basic
0,desain antarmuka aplikasi untuk membantu prose...
1,evaluasi tata kelola teknologi informasi dalam...
2,perancangan sistem pengaman sepeda motor berba...


In [7]:
# Stopword removal + Stemming Bahasa Indonesia
stop_words = set(stopwords.words('indonesian'))  # kumpulan stopword bahasa Indonesia
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def remove_stop_and_stem(text):
    if not isinstance(text, str) or text.strip()=="":
        return ""
    words = text.split()
    words = [w for w in words if w not in stop_words]
    joined = " ".join(words)
    stemmed = stemmer.stem(joined)
    return stemmed

# terapkan (memerlukan waktu tergantung jumlah data)
df['text_no_stop'] = df['text_clean_basic'].apply(lambda x: " ".join([w for w in x.split() if w not in stop_words]))
df['text_stemmed'] = df['text_no_stop'].apply(lambda x: stemmer.stem(x) if x.strip()!="" else "")

In [8]:
# Cek hasil cleaning data

# lihat contoh
display(df[['text','text_clean_basic','text_no_stop','text_stemmed']].sample(6, random_state=1))

# cek apakah ada teks yang kosong setelah preprocessing
for col in ['text_clean_basic','text_no_stop','text_stemmed']:
    n_empty = (df[col].astype(str).str.strip()=="").sum()
    print(f"{col} empty count:", n_empty)

Unnamed: 0,text,text_clean_basic,text_no_stop,text_stemmed
44,Pengembangan Aplikasi Serious Game untuk Melat...,pengembangan aplikasi serious game untuk melat...,pengembangan aplikasi serious game melatih pen...,kembang aplikasi serious game latih kenal eksp...
69,ANALISIS KINERJA LOAD BALANCING PADA CLOUD ENV...,analisis kinerja load balancing pada cloud env...,analisis kinerja load balancing cloud environt...,analisis kerja load balancing cloud environtme...
162,ANALISIS KOMPARATIF PERFORMA VIRTUALISASI PADA...,analisis komparatif performa virtualisasi pada...,analisis komparatif performa virtualisasi plat...,analisis komparatif performa virtual platform ...
35,PENGEMBANGAN ALAT BANTU Menurut data dari keme...,pengembangan alat bantu menurut data dari keme...,pengembangan alat bantu data kementerian keseh...,kembang alat bantu data menteri sehat ri 3 750...
183,Analisis Sentimen Komentar Sosial Media Pemilu...,analisis sentimen komentar sosial media pemilu...,analisis sentimen komentar sosial media pemilu...,analisis sentimen komentar sosial media milu m...
11,PENINGKATAN SISTEM TATA KELOLA KEAMANAN INFORM...,peningkatan sistem tata kelola keamanan inform...,peningkatan sistem tata kelola keamanan inform...,tingkat sistem tata kelola aman informasi tekn...


text_clean_basic empty count: 0
text_no_stop empty count: 0
text_stemmed empty count: 0


In [9]:
# Pilih kolom final untuk training TF-IDF
df["text_final"] = df["text_stemmed"]

# Simpan dataset hasil preprocessing
df.to_csv("../data/labeled_thesis_preprocessed.csv", index=False, encoding="utf-8")
print("Saved preprocessed CSV, shape:", df.shape)

# Preview
df[["Judul", "text_final"]].sample(5, random_state=42)

Saved preprocessed CSV, shape: (193, 12)


Unnamed: 0,Judul,text_final
45,PERANCANGAN DAN IMPLEMENTASI “SECURED SMART HOME,ancang implementasi secured smart home ancang ...
137,ANALISIS KINERJA LOAD BALANCING MENGGUNAKAN DR...,analisis kerja load balancing dragonfly algori...
76,PROTOTIPE SISTEM IRIGASI CERDAS BERDASARKAN CR...,prototipe sistem irigasi cerdas dasar crop wat...
144,IMPLEMENTASI MODEL KOMPUTASI,implementasi model komputasi iring maju teknol...
113,IMPLEMENTASI CHATBOT KESEHATAN,implementasi chatbot sehat implementasi chatbo...


In [10]:
# Encoding Label

from sklearn.preprocessing import LabelEncoder
import joblib

le = LabelEncoder()
df["label"] = le.fit_transform(df["topic_label"])

# Mapping label ke angka
print("Mapping Label:")
for i, cls in enumerate(le.classes_):
    print(i, ":", cls)

# Simpan encoder
joblib.dump(le, "../models/label_encoder.pkl")

Mapping Label:
0 : Aplikasi Web & Mobile
1 : Computer Vision & Image Processing
2 : Data Science & Analytics
3 : IoT & Embedded Systems
4 : Keamanan Informasi & Cybersecurity
5 : Machine Learning & AI
6 : Network & Systems
7 : UI/UX Design & HCI


['../models/label_encoder.pkl']

In [11]:
# TRAINING MODEL

In [12]:
from sklearn.model_selection import train_test_split

# Pisahkan data 90% train, 10% test dengan stratify
X_train, X_test, y_train, y_test = train_test_split(
    df["text_final"],
    df["label"],
    test_size=0.1,         # hanya 10% untuk test
    random_state=42,
    stratify=df["label"]   # pastikan distribusi label seimbang
)

print("Train size:", len(X_train))
print("Test size :", len(X_test))

Train size: 173
Test size : 20


In [13]:
# TF-IDF Vectorization
# Ubah teks menjadi representasi numerik.

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF dengan unigram + bigram dan fitur lebih banyak
tfidf = TfidfVectorizer(
    ngram_range=(1,2),      # unigram + bigram
    max_features=8000,      # lebih banyak fitur
    sublinear_tf=True
)

# Transform train dan test
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)

print("TF-IDF train shape:", X_train_tfidf.shape)
print("TF-IDF test shape :", X_test_tfidf.shape)

TF-IDF train shape: (173, 8000)
TF-IDF test shape : (20, 8000)


In [14]:
# Train Logistic Regression

from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression

# Oversampling
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train_tfidf, y_train)

print(f"Before ROS : {X_train_tfidf.shape} → After ROS : {X_train_res.shape}")

# Train Logistic Regression
model = LogisticRegression(
    max_iter=300,
    class_weight="balanced",
    solver="lbfgs",
    C=2.0
)
model.fit(X_train_res, y_train_res)

print("✅ Model training selesai dengan Random Oversampler dan Logistic Regression")

Before ROS : (173, 8000) → After ROS : (696, 8000)
✅ Model training selesai dengan Random Oversampler dan Logistic Regression


In [15]:
# Evaluasi

from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test_tfidf)

print("\n=== Classification Report ===\n")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("\n=== Confusion Matrix ===\n")
print(confusion_matrix(y_test, y_pred))


=== Classification Report ===

                                    precision    recall  f1-score   support

             Aplikasi Web & Mobile       1.00      0.33      0.50         3
Computer Vision & Image Processing       1.00      1.00      1.00         1
          Data Science & Analytics       0.00      0.00      0.00         2
            IoT & Embedded Systems       0.00      0.00      0.00         1
Keamanan Informasi & Cybersecurity       1.00      1.00      1.00         1
             Machine Learning & AI       0.67      1.00      0.80        10
                 Network & Systems       1.00      1.00      1.00         1
                UI/UX Design & HCI       1.00      1.00      1.00         1

                          accuracy                           0.75        20
                         macro avg       0.71      0.67      0.66        20
                      weighted avg       0.68      0.75      0.68        20


=== Confusion Matrix ===

[[ 1  0  0  0  0  2  0  0]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
import os

# Pastikan folder models ada
os.makedirs("../models", exist_ok=True)

# Simpan model, vectorizer (TF-IDF), dan label encoder
joblib.dump(model, "../models/tfidf_logreg_model.pkl")
joblib.dump(tfidf, "../models/tfidf_vectorizer.pkl")
joblib.dump(le, "../models/label_encoder.pkl")

print("✅ Model, TF-IDF vectorizer, dan LabelEncoder berhasil disimpan!")

✅ Model, TF-IDF vectorizer, dan LabelEncoder berhasil disimpan!
