### utils

In [1]:
from transliterate import translit
import pandas as pd

# Maps each Kazakh-specific Cyrillic letter to a visually/phonetically close
# letter of the Russian alphabet. Both upper- and lower-case forms are listed
# so that the case of the input is preserved by the lookup.
KAZAKH_TO_RUS_REPLACEMENTS = {
    "Ә": "А",
    "ә": "а",
    "І": "И",
    "і": "и",
    "Ң": "Н",
    "ң": "н",
    "Ғ": "Г",
    "ғ": "г",
    "Ү": "У",
    "ү": "у",
    "Ұ": "У",
    "ұ": "у",
    "Ө": "О",
    "ө": "о",
    "Қ": "К",
    "қ": "к",
    # Һ/һ (U+04BA/U+04BB) was missing from the table, which left one
    # Kazakh-only letter untouched in otherwise "russified" text.
    "Һ": "Х",
    "һ": "х",
}


def kazakh_to_russian(text: str) -> str:
    """Replace Kazakh-specific letters in ``text`` with their Russian stand-ins.

    Each character is looked up in ``KAZAKH_TO_RUS_REPLACEMENTS``; characters
    that have no entry there are copied through unchanged.

    Args:
        text: The string to convert.

    Returns:
        A new string of the same length with the mapped characters substituted.
    """
    lookup = KAZAKH_TO_RUS_REPLACEMENTS
    converted = []
    for char in text:
        # Fall back to the character itself when no replacement is defined.
        converted.append(lookup.get(char, char))
    return "".join(converted)


def translit_text(text: str) -> str:
    """Transliterate ``text`` with the ``transliterate`` package's "ru" pack.

    ``reversed=True`` asks the library for the reverse direction of the
    language pack, i.e. from the Russian (Cyrillic) script to Latin.

    Args:
        text: The string to transliterate.

    Returns:
        Whatever ``translit`` produces for ``text`` (the romanized string).
    """
    romanized = translit(text, "ru", reversed=True)
    return romanized


### Data preparation and augmentation

In [2]:
train_df = pd.read_csv("train.csv")


def _split_and_transform(frame, transform, frac=0.5, seed=42):
    """Randomly pick ``frac`` of ``frame`` and rewrite the picked rows' text.

    Args:
        frame: DataFrame with a ``text`` column.
        transform: Callable applied to every ``text`` value of the picked rows.
        frac: Fraction of rows to pick.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        A ``(picked, remainder)`` tuple: the picked rows with transformed text,
        and the rows of ``frame`` that were not picked (unchanged).
    """
    picked = frame.sample(frac=frac, random_state=seed)
    # Rows are removed by index label, so the remainder is everything not sampled.
    remainder = frame.drop(picked.index)
    picked["text"] = picked["text"].apply(transform)
    return picked, remainder


# One sub-frame per language label.
kaz = train_df[train_df["label"] == "kaz"]
eng = train_df[train_df["label"] == "eng"]
ru = train_df[train_df["label"] == "ru"]

# Kazakh: half of the rows get their Kazakh-specific letters replaced with
# Russian ones; half of *those* rows are then additionally transliterated.
kaz_russified, kaz_untouched = _split_and_transform(kaz, kazakh_to_russian)
kaz_romanized, kaz_russified_only = _split_and_transform(kaz_russified, translit_text)
kaz = pd.concat([kaz_romanized, kaz_russified_only, kaz_untouched])

# Russian: half of the rows are transliterated, the rest stay as they are.
ru_romanized, ru_untouched = _split_and_transform(ru, translit_text)
ru = pd.concat([ru_romanized, ru_untouched])

# English rows are used unchanged.
train = pd.concat([kaz, eng, ru])



### Training and evaluation

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train on a copy of the augmented frame so `train` itself is left untouched.
train_df = train.copy()
test = pd.read_csv("test.csv")
ground_truth = pd.read_csv("ground_truth.csv")

X_train = train_df["text"]
y_train = train_df["label"]
# Test texts and their labels come from two different files and are paired
# purely by row position — assumes both files list samples in the same order.
X_test = test["text"]
y_test = ground_truth["label"]

# TF-IDF features; the vocabulary is fitted on the training texts only and
# then reused to transform the test texts.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_features=20_000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Hyperparameter search kept for reference (disabled). Presumably this is the
# search the settings below were taken from — not confirmed by this notebook.
# NOTE(review): the search used max_iter=1000 while the final model uses 200.
#
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     "C": [0.01, 0.1, 1, 10],
#     "penalty": ["l2", "l1"],
#     "solver": ["liblinear", "saga"],
#     "class_weight": [None, "balanced"]
# }

# grid = GridSearchCV(
#     LogisticRegression(max_iter=1000),
#     param_grid,
#     cv=3,
#     scoring="f1_macro",
#     n_jobs=-1
# )

# grid.fit(X_train_tfidf, y_train)
# print("Best params:", grid.best_params_)

lr_params = {
    "C": 10,
    "penalty": "l2",
    "solver": "saga",
    "max_iter": 200,
    "random_state": 42,
}
best_lr = LogisticRegression(**lr_params)

best_lr.fit(X_train_tfidf, y_train)
y_pred_best = best_lr.predict(X_test_tfidf)

# zero_division=0 reports 0 instead of warning when a metric is undefined.
report = classification_report(y_test, y_pred_best, zero_division=0)
print("\n=== Final Logistic Regression ===")
print(report)


=== Final Logistic Regression ===
              precision    recall  f1-score   support

         eng       0.91      0.97      0.94      1816
         kaz       0.99      0.98      0.98     11221
          ru       0.97      0.96      0.96      2278

    accuracy                           0.98     15315
   macro avg       0.96      0.97      0.96     15315
weighted avg       0.98      0.98      0.98     15315




## To Improve Score Further

- If you carefully analyze the dataset, you’ll notice a small subset of code-switched texts.
- For example, a text may be written in English (with English grammar) but contain words or even full phrases in Kazakh or Russian.
- The same pattern occurs in Russian and Kazakh texts.

#### Manually labeling these code-switched samples in the test set and adding them to the training set can improve the F1 score.

#### Additionally, you can try ensembling multiple models and feature types:

- Fine-tuned BERT variants (e.g., XLM-RoBERTa base/large, Kaz-RoBERTa-Conversational)
- SVM / Logistic Regression / CatBoost / LightGBM on TF-IDF features
- SVM / Logistic Regression / CatBoost / LightGBM on FastText features

