# 1. Problem Information
- **Name:** [**Text Classification into Thematic Categories**](https://platform.olimpiada-ai.ro/en/problems/29)
- **Date:** 12/02/2026
- **Type:** NLP

# 2. Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.naive_bayes import ComplementNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# 3. Data preparation

In [None]:
# Read the training and test splits from disk.
train = pd.read_csv("data/ignore/train.csv")
test = pd.read_csv("data/ignore/test.csv")

# Replace the label column with integer class ids. The fitted encoder `le`
# is kept so that predictions can be mapped back to the original labels.
le = LabelEncoder()
train['label'] = le.fit_transform(train['label'])

print(train.shape)
train.head(5)

(56727, 3)


Unnamed: 0,SampleID,text,label
0,139768,Take a Presence Power Break (The New Coffee Br...,2
1,297,Yolanda Hadid Returns To Social Media After 9-...,0
2,2274,"Democrats Want Paid Sick Days, Breaks For Dome...",1
3,106057,Taylor Swift Calls Out Sexist Critics Again,0
4,56920,Trump Surrogate Rudy Giuliani: 'Anything's Leg...,1


In [3]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56727 entries, 0 to 56726
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   SampleID  56727 non-null  int64 
 1   text      56727 non-null  object
 2   label     56727 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


# 4. Models

In [4]:
# Model inputs: the text column as features, the label column as target.
X = train['text']
Y = train['label']

# TF-IDF features feeding a Complement Naive Bayes classifier.
pipeline = make_pipeline(
    TfidfVectorizer(
        stop_words='english',
        lowercase=True,
        min_df=3,    # ignore terms found in fewer than 3 documents
        max_df=0.8,  # ignore terms found in more than 80% of documents
    ),
    ComplementNB(),
)

# 3-fold cross-validation; report the mean of the per-fold scores.
scores = cross_val_score(pipeline, X, Y, cv=3)
print(scores.mean())

0.9345461596770498


In [5]:
# Refit the pipeline on the full training set, then predict the test texts.
# `fit` returns the fitted pipeline, so the two steps can be chained.
prediction = pipeline.fit(X, Y).predict(test['text'])

# 5. Submission

In [6]:
# Map the predicted integer class ids back to the original label values.
decoded_labels = le.inverse_transform(prediction)

# One row per test sample: its id and the predicted label.
submission = pd.DataFrame({"SampleID": test['SampleID'], "label": decoded_labels})

submission.head()

Unnamed: 0,SampleID,label
0,103811,ENTERTAINMENT
1,207637,WELLNESS
2,151717,WELLNESS
3,27926,POLITICS
4,115640,POLITICS


In [7]:
# Write the submission to the working directory; index=False keeps the
# DataFrame's row index out of the CSV.
submission.to_csv("submission.csv", index=False)