In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

In [3]:
#data loading
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
SampleSubmission=pd.read_csv('SampleSubmission.csv')

EDA

In [10]:
print(train.shape)
print(test.shape)
print(SampleSubmission.shape)

(5151, 3)
(1030, 2)
(1288, 6)


In [11]:
print(train.columns)

Index(['id', 'content', 'category'], dtype='object')


In [12]:
print(train.isnull().sum())

id          0
content     0
category    0
dtype: int64


In [17]:
#encoding the labels
le = LabelEncoder()
train["y"] = le.fit_transform(train["category"].astype(str))
label_names = list(le.classes_)
num_classes = len(label_names)

print("Classes found:", label_names)


Classes found: ['Biashara', 'Burudani', 'Kimataifa', 'Kitaifa', 'michezo']


In [19]:
import re
from html import unescape

#cleaning the test
def clean_text(text):
    if pd.isna(text):
        return ""
    s = str(text)
    s = unescape(s)
    s = s.lower()
    s = re.sub(r'https?://\S+|www\.\S+', ' ', s)   # remove URLs
    s = re.sub(r'<.*?>', ' ', s)                  # remove HTML
    s = re.sub(r'[^0-9A-Za-zÀ-ž\s\-\'’]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

train["clean"] = train["content"].map(clean_text)
test["clean"] = test["content"].map(clean_text)

In [20]:
#preparing data arrays
X = train["clean"].values
y = train["y"].values
X_test = test["clean"].values


In [21]:
#setting up the CV storage
import random, numpy as np
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
N_SPLITS = 5
oof_proba = np.zeros((len(train), num_classes))
test_proba = np.zeros((len(test), num_classes))


In [40]:
#training the model
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n=== Fold {fold} ===")

    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    # TF-IDF vectorizer
    vect = TfidfVectorizer(
        max_features=25000,
        ngram_range=(1,2),
        min_df=3,
        sublinear_tf=True
    )

    X_tr_tfidf = vect.fit_transform(X_tr)
    X_val_tfidf = vect.transform(X_val)
    X_test_tfidf = vect.transform(X_test)

    # Logistic Regression
    clf = LogisticRegression(
        multi_class='multinomial',
        solver='saga',
        max_iter=2000,
        class_weight='balanced',
        random_state=SEED
    )

    clf.fit(X_tr_tfidf, y_tr)

    # Predictions
    val_pred = clf.predict_proba(X_val_tfidf)
    test_pred = clf.predict_proba(X_test_tfidf)

    oof_proba[val_idx] = val_pred
    test_proba += test_pred / N_SPLITS

    fold_loss = log_loss(y_val, val_pred)
    print("Fold log loss:", fold_loss)




=== Fold 1 ===




Fold log loss: 9.702879725853691

=== Fold 2 ===




Fold log loss: 0.6411795807765583

=== Fold 3 ===




Fold log loss: 1.243644783621783

=== Fold 4 ===




Fold log loss: 0.7147276294939462

=== Fold 5 ===




Fold log loss: 0.6330912481457995




In [41]:
#calculating the OOF log loss
overall_loss = log_loss(y, oof_proba)
print("\nOverall OOF log loss:", overall_loss)



Overall OOF log loss: 2.588486029247599


In [42]:
submission = pd.DataFrame(test_proba, columns=label_names)
submission.insert(0, "id", test["swahili_id"])
submission.to_csv("submission.csv", index=False)

print("\nSaved submission.csv")
print(submission.head())


Saved submission.csv
                                         id  Biashara  Burudani  Kimataifa  \
0  ae3baa6c34aa523fd2aa4de3c89448efff922311  0.348461  0.867905   0.769175   
1  c4ee26a3ade8064a2ec494996e836900fd32dd8e  0.047647  0.708072   0.731491   
2  58aee3aa1d94554ff57e6a053dbd60658e4890ff  0.041941  0.663886   0.201260   
3  00579c2307b5c11003d21c40c3ecff5e922c3fd8  1.074578  0.828993   0.486262   
4  c83e9738ae5d1790ee85b99863deb734e7614c52  0.187928  0.786093   0.558815   

    Kitaifa   michezo  
0  1.082427  0.532032  
1  1.753993  0.358798  
2  0.150594  2.542319  
3  0.653441  0.556726  
4  1.779754  0.287410  


In [43]:
#computing the accuracy of the model
from sklearn.metrics import accuracy_score

# Converting probabilities to predicted class
oof_pred_labels = np.argmax(oof_proba, axis=1)

# Compute=ing accuracy
accuracy = accuracy_score(y, oof_pred_labels)

print("Accuracy:", accuracy)


Accuracy: 0.6557949912638322


In [44]:
from google.colab import files
files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>