In [1]:
!pip -q install scikit-learn pandas numpy


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline


In [3]:
# Folder that holds the three input CSV files.
DATA_DIR = "/content/init_data"

# Read the training messages, the test messages and the sample submission
# file, in that order, and bind them to `train`, `test` and `sample`.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{DATA_DIR}/train_chat.csv",
        f"{DATA_DIR}/test_chat.csv",
        f"{DATA_DIR}/sample_submission_chat.csv",
    )
)

# Preview the first rows of the training frame.
train.head()


Unnamed: 0,message_id,text,day,hour,topic
0,msg_000001,–¢–°–ñ –ø–æ–¥–Ω—è–ª–∏ —Ç–∞—Ä–∏—Ñ? –≤ –ø–ª–∞—Ç—ë–∂–∫–µ +300‚ÇΩ üòÅ,14,0,bills
1,msg_000002,"–°–æ—Å–µ–¥–∏ –ø–æ –¥–æ–º—É, –æ—Ç–¥–∞–º –¥–∞—Ä–æ–º —á–∞–π–Ω–∏–∫, –≤–¥—Ä—É–≥ –∫–æ–º—É...",11,19,market
2,msg_000003,"–ú–∏—à–∞, —Ç—ã –¥–µ–ª–∞–ª –±–ª–∏–Ω—ã? –∫–∞–∫ –ø–æ–ª—É—á–∏–ª–æ—Å—å?",9,20,cooking
3,msg_000004,"–Ω–∞—à–ª–∏ –æ—à–µ–π–Ω–∏–∫ —É –ø–æ—á—Ç–æ–≤—ã—Ö —è—â–∏–∫–æ–≤, –±–µ–∑ –∞–¥—Ä–µ—Å–Ω–∏–∫–∞...",8,14,pets
4,msg_000005,"–°–∞–ª—é—Ç, –≤ –ø–æ–¥—ä–µ–∑–¥–µ –º–∏–≥–∞–µ—Ç —Å–≤–µ—Ç —É –º—É—Å–æ—Ä–∫–∏, –∫–∞–∫ –Ω...",10,21,repairs


In [4]:
def build_text(df):
    """Combine the message text with day and hour marker tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``text``, ``day`` and ``hour``.

    Returns
    -------
    pandas.Series
        One string per row of the form ``"<text> <DAY_d> <HOUR_h>"``.
        A missing ``text`` value is replaced by an empty string before the
        markers are appended; ``day`` and ``hour`` are converted with
        ``astype(str)``.
    """
    message = df["text"].fillna("")
    day_marker = " <DAY_" + df["day"].astype(str) + ">"
    hour_marker = " <HOUR_" + df["hour"].astype(str) + ">"
    return message + day_marker + hour_marker

# Attach the combined text + day/hour marker string to both frames so the
# train and test inputs are produced by the same function.
for frame in (train, test):
    frame["full_text"] = build_text(frame)


In [5]:
# Word-level TF-IDF over unigrams and bigrams.
# min_df=3 ignores terms found in fewer than 3 documents, max_df=0.9 ignores
# terms found in more than 90% of documents, and sublinear_tf replaces the raw
# term frequency tf with 1 + log(tf).
tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
)

# Logistic regression on the TF-IDF features; class_weight="balanced" weights
# classes inversely to their frequency in the training labels.
clf = LogisticRegression(
    C=6,
    class_weight="balanced",
    max_iter=3000,
    n_jobs=-1,
)

# Vectorizer followed by classifier, fitted and applied as one estimator.
pipeline_steps = [("tfidf", tfidf), ("clf", clf)]
pipe = Pipeline(pipeline_steps)


In [6]:
# Features (combined text) and target labels for cross-validation.
X = train["full_text"]
y = train["topic"]

# 5 stratified, shuffled folds with a fixed seed so the splits are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Macro-F1 of each fold, in fold order.
scores = []

for fold, (fit_rows, holdout_rows) in enumerate(skf.split(X, y), start=1):
    # Refit the whole pipeline on this fold's training rows only, so the
    # TF-IDF vocabulary is learned without the held-out rows.
    pipe.fit(X.iloc[fit_rows], y.iloc[fit_rows])
    holdout_preds = pipe.predict(X.iloc[holdout_rows])

    f1 = f1_score(y.iloc[holdout_rows], holdout_preds, average="macro")
    scores.append(f1)

    print(f"Fold {fold}: Macro-F1 = {f1:.4f}")

print("Mean Macro-F1:", np.mean(scores))


Fold 1: Macro-F1 = 0.9999
Fold 2: Macro-F1 = 0.9998
Fold 3: Macro-F1 = 1.0000
Fold 4: Macro-F1 = 1.0000
Fold 5: Macro-F1 = 0.9999
Mean Macro-F1: 0.9999250849735504


In [7]:
# Refit the pipeline on the full training set before predicting on test.
pipe.fit(X=train["full_text"], y=train["topic"])


In [8]:
# Predict a topic for every test message with the fitted pipeline.
test_preds = pipe.predict(test["full_text"])

# Two-column frame: the test message ids plus the predicted topic.
submission = test[["message_id"]].assign(topic=test_preds)

# Write without the index so the file contains only the two columns.
submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,message_id,topic
0,msg_120001,congratulations
1,msg_120002,bills
2,msg_120003,cleaning
3,msg_120004,plans_guests
4,msg_120005,lost_found
