In [1]:
# Data Reference Classifier for Make Data Count Competition

import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

In [2]:
# Download stopwords
nltk.download('stopwords')

# === Step 1: Load Dataset ===
data_dir = Path("E:/UMB/Semester 6/DML/UAS/Dataset")
train_labels = pd.read_csv(data_dir / "train_labels.csv")
sample_submission = pd.read_csv(data_dir / "sample_submission.csv")
train_xml_dir = data_dir / "train/XML"
test_xml_dir = data_dir / "test/XML"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Ekstrak teks dari XML
def extract_text_from_xml(xml_path):
    with open(xml_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'xml')
    body = soup.find('body')
    return body.get_text(separator=' ', strip=True) if body else ""

In [4]:
# Preprocessing teks dasar
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9 ]', ' ', text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)

In [5]:
# Bangun dataset berdasarkan article_id dan label type
samples = []
for _, row in train_labels.iterrows():
    article_id = row['article_id']
    label = row['type'].lower()
    if label not in ['primary', 'secondary']:
        continue  # abaikan Missing
    xml_file = train_xml_dir / f"{article_id}.xml"
    if not xml_file.exists():
        continue
    text = extract_text_from_xml(xml_file)
    text = preprocess(text)
    samples.append((text, label))

train_df = pd.DataFrame(samples, columns=['text', 'label'])

In [6]:
# Train-test split dan pipeline TFIDF + Logistic Regression
X = train_df['text']
y = train_df['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', LogisticRegression(max_iter=1000))
])

model.fit(X_train, y_train)
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))

joblib.dump(model, data_dir / "model.joblib")

              precision    recall  f1-score   support

     primary       0.84      0.93      0.88        44
   secondary       0.96      0.91      0.93        85

    accuracy                           0.91       129
   macro avg       0.90      0.92      0.91       129
weighted avg       0.92      0.91      0.92       129



['E:\\UMB\\Semester 6\\DML\\UAS\\Dataset\\model.joblib']

In [7]:
# === Step 6: Inference on Test Set ===
test_predictions = []
model = joblib.load("model.joblib")

for xml_file in test_xml_dir.glob("*.xml"):
    doc_id = xml_file.stem
    full_text = extract_text_from_xml(xml_file)
    # Dummy chunking: sliding window (can be improved)
    chunk_size = 30
    stride = 15
    predictions = []
    for i in range(0, len(full_text) - chunk_size, stride):
        chunk = full_text[i:i+chunk_size]
        preprocessed = preprocess(chunk)
        if not preprocessed.strip():
            continue
        pred = model.predict([preprocessed])[0]
        predictions.append(f"{pred} {i} {i+chunk_size}")
    prediction_string = "|".join(predictions)
    test_predictions.append((doc_id, prediction_string))

submission_df = pd.DataFrame(test_predictions, columns=['Id', 'PredictionString'])
submission_df.to_csv("submission.csv", index=False)
print("Submission saved as submission.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'model.joblib'