In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


# =========================
# 1) Load Data
# =========================
train_path = "/kaggle/input/comment-category-prediction-challenge/train.csv"
test_path  = "/kaggle/input/comment-category-prediction-challenge/test.csv"

train = pd.read_csv(train_path)
test  = pd.read_csv(test_path)

print("Train shape:", train.shape)
print("Test shape :", test.shape)
display(train.head())


# =========================
# 2) Basic Cleaning
# =========================
train = train.replace("none", np.nan)
test  = test.replace("none", np.nan)

for df in [train, test]:
    df["created_date"] = pd.to_datetime(df["created_date"], errors="coerce")

def add_datetime_features(df):
    df["hour"] = df["created_date"].dt.hour
    df["dayofweek"] = df["created_date"].dt.dayofweek
    df["month"] = df["created_date"].dt.month
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
    return df

train = add_datetime_features(train)
test  = add_datetime_features(test)

def add_vote_features(df):
    df["upvote"] = pd.to_numeric(df["upvote"], errors="coerce")
    df["downvote"] = pd.to_numeric(df["downvote"], errors="coerce")

    df["score"] = df["upvote"].fillna(0) - df["downvote"].fillna(0)
    df["total_votes"] = df["upvote"].fillna(0) + df["downvote"].fillna(0)
    df["upvote_ratio"] = df["upvote"].fillna(0) / (df["total_votes"] + 1)
    return df

train = add_vote_features(train)
test  = add_vote_features(test)

train["comment"] = train["comment"].fillna("")
test["comment"] = test["comment"].fillna("")


# =========================
# 3) Split X / y
# =========================
y = train["label"].astype(int)
X = train.drop(columns=["label"])

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# =========================
# 4) Columns Setup
# =========================
text_col = "comment"

# PURE numeric only
numeric_cols = [
    "emoticon_1", "emoticon_2", "emoticon_3",
    "upvote", "downvote",
    "if_1", "if_2",
    "hour", "dayofweek", "month", "is_weekend",
    "score", "total_votes", "upvote_ratio"
]

# Categorical columns (IMPORTANT FIX)
categorical_cols = ["post_id", "race", "religion", "gender", "disability"]

# keep only columns that exist
numeric_cols = [c for c in numeric_cols if c in X.columns]
categorical_cols = [c for c in categorical_cols if c in X.columns]


# =========================
# 5) Preprocessing
# =========================
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# text_transformer = Pipeline(steps=[
#     ("tfidf", TfidfVectorizer(
#         lowercase=True,
#         stop_words="english",
#         ngram_range=(1, 2),
#         min_df=2,
#         max_features=200000
#     ))
# ])
text_transformer = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1, 1),   # faster
        min_df=5,             # faster
        max_features=80000    # faster
    ))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("text", text_transformer, text_col),
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ],
    remainder="drop"
)


# =========================
# 6) Model
# =========================
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LogisticRegression(
        max_iter=3000,
        C=4.0,
        solver="lbfgs",
        n_jobs=-1
    ))
])


# =========================
# 7) Train + Validate
# =========================
clf.fit(X_train, y_train)

val_preds = clf.predict(X_val)

acc = accuracy_score(y_val, val_preds)
f1m = f1_score(y_val, val_preds, average="macro")

print("Validation Accuracy:", acc)
print("Validation Macro F1 :", f1m)
print("\nClassification Report:\n")
print(classification_report(y_val, val_preds))


# =========================
# 8) Train Full Data & Predict Test
# =========================
clf.fit(X, y)
test_preds = clf.predict(test)


# =========================
# 9) Submission (Direct)
# =========================
submission = pd.DataFrame({
    "ID": np.arange(1, len(test_preds) + 1),
    "label": test_preds
})

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")
display(submission.head(10))


Train shape: (198000, 15)
Test shape : (102000, 14)


Unnamed: 0,created_date,post_id,emoticon_1,emoticon_2,emoticon_3,upvote,downvote,if_1,if_2,race,religion,gender,disability,comment,label
0,2024-01-18 08:43:57.397508+00:00,73,0,0,0,0,1,0,10,,,,False,She might be a bright spot for a party keou on...,2
1,2024-03-24 21:43:11.490017+00:00,39,0,0,0,6,0,0,4,,,,False,"Under Alaska law, a non-tribal member is not b...",0
2,2024-04-24 20:32:17.014931+00:00,31,0,1,1,0,0,0,10,,,,False,in the future please spare me your strawman dr...,2
3,2023-05-28 22:00:14.214527+00:00,39,0,0,0,5,0,0,10,,,,False,"PS: That should have been ""rot"" instead of ""co...",2
4,2023-09-09 23:12:05.689498+00:00,39,0,0,0,0,0,0,10,,,,False,"Today, the confederate flag...tomorrow, the na...",2


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy: 0.906010101010101
Validation Macro F1 : 0.7880512389899961

Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.95      0.95     22835
           1       0.79      0.72      0.75      3183
           2       0.85      0.92      0.88     12488
           3       0.74      0.46      0.56      1094

    accuracy                           0.91     39600
   macro avg       0.83      0.76      0.79     39600
weighted avg       0.91      0.91      0.90     39600



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Saved submission.csv


Unnamed: 0,ID,label
0,1,2
1,2,2
2,3,0
3,4,0
4,5,2
5,6,0
6,7,1
7,8,0
8,9,0
9,10,2
