In [1]:
import pandas as pd
import re
import string
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [2]:
import catboost as ct

In [3]:
# Download stopwords and tokenizer
nltk.download('stopwords')
nltk.download('punkt')

# Load datasets
train_df = pd.read_csv("/kaggle/input/data-community-hmif-playground-ep-02/train.csv")
test_df = pd.read_csv("/kaggle/input/data-community-hmif-playground-ep-02/test.csv")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
train_df['label'].unique()

array(['Politik', 'Pertahanan dan Keamanan', 'Sosial Budaya',
       'Sumber Daya Alam', 'Ideologi', 'Ekonomi', 'Demografi', 'Geografi'],
      dtype=object)

In [5]:
train_df['label'].value_counts()

label
Politik                    2378
Sosial Budaya               470
Pertahanan dan Keamanan     320
Ideologi                    320
Ekonomi                     294
Sumber Daya Alam            153
Demografi                    49
Geografi                     16
Name: count, dtype: int64

In [6]:
# Text Cleaning Function
def clean_text(text):
    # Convert emojis to text
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+|www\S+', '', text)  # Remove URLs
    # Remove mentions and hashtags
    text = re.sub(r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    words = word_tokenize(text)  # Tokenize
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(words)


In [7]:
# Apply cleaning
train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)

In [8]:
# Split dataset
X_train, X_val, y_train, y_val = train_test_split(train_df['clean_text'], train_df['label'], test_size=0.2, random_state=42)

In [9]:
# Create pipeline
import lightgbm as lgb
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), max_features=5000)),
    ('clf', MultinomialNB())
])

In [10]:
# Train model
pipeline.fit(X_train, y_train)

In [11]:
# Validate model
y_pred = pipeline.predict(X_val)
f1 = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Score: {f1:.4f}')

Validation F1 Score: 0.3229


In [12]:
# Predict on test set
test_preds = pipeline.predict(test_df['clean_text']).flatten()

# Save submission
submission = pd.DataFrame({'ID': test_df['ID'], 'label': test_preds})
submission.to_csv('submission2.csv', index=False)
print("Submission file saved!")

Submission file saved!
