In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/amharic-news-corpus-merged/Amharic_corpus_merged_2023-04-16.csv
/kaggle/input/amharic-news-corpus-merged/consolidated_stop_words_2023.csv


In [2]:
import pandas as pd

# Path to the main corpus
corpus_path = '/kaggle/input/amharic-news-corpus-merged/Amharic_corpus_merged_2023-04-16.csv'
df = pd.read_csv(corpus_path)#reads to panda dataframe

# Preview
print(df.head())
print(df.columns)


                                             article       category
0  አባስ ሁሴን በዋሺንግተን ዲሲ እና አካባቢዎ ከ20 ዓመት በላይ የኖረ ትው...     Local News
1  አዲስ አበባ፣ ጥቅምት 13፣ 2013 (ኤፍ.ቢ.ሲ) በተያዘው በጀት ዓመት ...     Local News
2  የተወለደው አዲስ አበባ ካሳንችስ አካባቢ ነው። ከጅማ ዩኒቨርስቲ በኢኮኖሚ...  Entertainment
3  ነባሩን የሕወሓት አመራር አባል አቶ ብርሃነ ኪዳነ ማርያምን ከሁለት ዓመት...       Politics
4  ድሬዳዋ ከተማ ሶስት ተጫዋቾችን በማሰናበቱ ፌድሬሽኑ ያለ አግባብ ነው ውሳ...         Sports
Index(['article', 'category'], dtype='object')


Stopwords are high-frequency, low-meaning words like "the," "is," and "and" that are filtered out in natural language processing (NLP) to improve the efficiency and accuracy of analysis.

These appear in every sentence but do not help identify the category (Politics, Sports, Local News, etc.).

In [3]:
stopwords_path = '/kaggle/input/amharic-news-corpus-merged/consolidated_stop_words_2023.csv'
stopwords_df = pd.read_csv(stopwords_path)

# Check the column names
print(stopwords_df.columns)

# Preview first few rows
print(stopwords_df.head())


Index(['original_text'], dtype='object')
  original_text
0         ይኖረዋል
1        እላችኋለሁ
2        አጠናቀዋል
3        አልቀረበም
4            ሆይ


**Step 1: Preprocess the text**

Remove punctuation, lowercase, and optionally remove stopwords

In [4]:
import pandas as pd
import re

# Load stopwords
stopwords_path = '/kaggle/input/amharic-news-corpus-merged/consolidated_stop_words_2023.csv'
stopwords_df = pd.read_csv(stopwords_path)

stopwords = set(stopwords_df['original_text'].tolist())
print(f"Loaded {len(stopwords)} stopwords")

# Preprocessing function
def preprocess(text):
    text = str(text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stopwords])#Removes stopwords
    return text

# Apply preprocessing
df['text_clean'] = df['article'].apply(preprocess)
print(df[['article', 'text_clean']].head())


Loaded 714 stopwords
                                             article  \
0  አባስ ሁሴን በዋሺንግተን ዲሲ እና አካባቢዎ ከ20 ዓመት በላይ የኖረ ትው...   
1  አዲስ አበባ፣ ጥቅምት 13፣ 2013 (ኤፍ.ቢ.ሲ) በተያዘው በጀት ዓመት ...   
2  የተወለደው አዲስ አበባ ካሳንችስ አካባቢ ነው። ከጅማ ዩኒቨርስቲ በኢኮኖሚ...   
3  ነባሩን የሕወሓት አመራር አባል አቶ ብርሃነ ኪዳነ ማርያምን ከሁለት ዓመት...   
4  ድሬዳዋ ከተማ ሶስት ተጫዋቾችን በማሰናበቱ ፌድሬሽኑ ያለ አግባብ ነው ውሳ...   

                                          text_clean  
0  አባስ ሁሴን በዋሺንግተን ዲሲ አካባቢዎ ከ20 ዓመት የኖረ ትውልደ ኢትዮጵ...  
1  አዲስ አበባ ጥቅምት 13 2013 ኤፍቢሲ በተያዘው በጀት ዓመት የመጀመሪያ...  
2  የተወለደው አዲስ አበባ ካሳንችስ አካባቢ ከጅማ ዩኒቨርስቲ በኢኮኖሚክስ የ...  
3  ነባሩን የሕወሓት አመራር አባል ብርሃነ ኪዳነ ማርያምን ከሁለት ዓመት በመ...  
4  ድሬዳዋ ከተማ ሶስት ተጫዋቾችን በማሰናበቱ ፌድሬሽኑ አግባብ ውሳኔው በሚል...  


**Step 2 : Label Encoding**


converts text categories into numbers so the machine learning model can understand and learn from them.
Because ML models cannot process text labels like:

"Sports" must in numeric form like 0

"Politics" must in numeric form like 1

"Local News" must in numeric form like 2

In [5]:
from sklearn.preprocessing import LabelEncoder

# Create label encoder object
le = LabelEncoder() #converts text labels into numbers


df['label'] = le.fit_transform(df['category'])
    #does 2 things
    #1)fit() Reads all categories and learns unique classes
    #2)transform() Converts each category to a number



# Preview category → Shows how each category is converted into a number
print(df[['category', 'label']].head())

# Show complete mapping
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:", label_mapping)


        category  label
0     Local News      3
1     Local News      3
2  Entertainment      1
3       Politics      5
4         Sports      6
Label Mapping: {'Business': 0, 'Entertainment': 1, 'International News': 2, 'Local News': 3, 'Others': 4, 'Politics': 5, 'Sports': 6}


**Step 3 — Spliting Data**

In [6]:
from sklearn.model_selection import train_test_split

# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text_clean'],         # passing the existing columns to the function so it can split them
    df['label'],           
    test_size=0.2,     
    random_state=42,          # fixed seed for reproducible splits
)


print("Train size:", len(train_texts), "Validation size:", len(val_texts))


Train size: 49532 Validation size: 12383


**Step 4: Tokenize for TF-IDF + Logistic Regression**


Breaking text into smaller pieces called tokens

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit on training text and transform both train & val
X_train_tfidf = tfidf.fit_transform(train_texts)
X_val_tfidf   = tfidf.transform(val_texts)

print("Train TF-IDF shape:", X_train_tfidf.shape)
print("Val TF-IDF shape:", X_val_tfidf.shape)



Train TF-IDF shape: (49532, 5000)
Val TF-IDF shape: (12383, 5000)


**Step 5: Train the Classifier**

Train the Logistic Regression model using the TF-IDF features extracted from the training texts. This step teaches the model how to associate words with labels.

In [8]:
from sklearn.linear_model import LogisticRegression

# Initialize the classifier
clf = LogisticRegression(max_iter=2000)

# Train on TF-IDF vectors
clf.fit(X_train_tfidf, train_labels)


**Step 6: Make Predictions on Validation Set**


Use the trained model to predict labels for the validation set. This helps check how well the model generalizes to unseen data.

In [9]:
preds = clf.predict(X_val_tfidf)


**Step 7: Evaluate Performance**

Measure the model’s performance using metrics like accuracy, precision, recall, and F1-score. This quantifies how well the classifier is performing on each class.

In [10]:
from sklearn.metrics import classification_report

print(classification_report(val_labels, preds, zero_division=0))


              precision    recall  f1-score   support

           0       0.83      0.80      0.82      1845
           1       0.88      0.50      0.63       119
           2       0.84      0.79      0.81      1278
           3       0.81      0.84      0.82      4226
           4       0.00      0.00      0.00         1
           5       0.80      0.81      0.81      2837
           6       0.98      0.97      0.97      2077

    accuracy                           0.84     12383
   macro avg       0.73      0.67      0.70     12383
weighted avg       0.84      0.84      0.84     12383



**Step 8: Testing the Model**

Apply the trained model to new or external test data to assess real-world performance. Transform the test texts with the same TF-IDF vectorizer before predicting labels.

In [11]:
# Transform validation or new texts
X_test_tfidf = tfidf.transform(val_texts)  # or new_texts if you have a new dataset

# Predict
test_preds = clf.predict(X_test_tfidf)

# Evaluate
print(classification_report(val_labels, test_preds, zero_division=0))


              precision    recall  f1-score   support

           0       0.83      0.80      0.82      1845
           1       0.88      0.50      0.63       119
           2       0.84      0.79      0.81      1278
           3       0.81      0.84      0.82      4226
           4       0.00      0.00      0.00         1
           5       0.80      0.81      0.81      2837
           6       0.98      0.97      0.97      2077

    accuracy                           0.84     12383
   macro avg       0.73      0.67      0.70     12383
weighted avg       0.84      0.84      0.84     12383

