 TF-IDF Vectorization for Amharic Sentiment Analysis and logical regression with train_test_split

In [2]:
# -------------------------------
# 1. Imports
# -------------------------------
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import os

# -------------------------------
# 2. Load Cleaned Dataset
# -------------------------------
df = pd.read_csv("../data/processed/amharic_sentiment_cleaned.csv")
df.dropna(subset=["cleaned_tweet", "label"], inplace=True)

texts = df["cleaned_tweet"].astype(str).tolist()
labels = df["label"].tolist()

# -------------------------------
# 3. TF-IDF Vectorization
# -------------------------------
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)

X = vectorizer.fit_transform(texts)

# Save the TF-IDF features and vectorizer
os.makedirs("../data/processed", exist_ok=True)
with open("../data/processed/tfidf_features.pkl", "wb") as f:
    pickle.dump(X, f)

with open("../data/processed/labels.pkl", "wb") as f:
    pickle.dump(labels, f)

os.makedirs("../models", exist_ok=True)
with open("../models/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("TF-IDF vectorization complete. Features and vectorizer saved.")

# -------------------------------
# 4. Train/Test Split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

# -------------------------------
# 5. Train SGDClassifier Model
# -------------------------------
clf = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3)

# Convert labels to array if needed
import numpy as np
y_train = np.array(y_train)
y_test = np.array(y_test)

# Use partial_fit for initial training
clf.partial_fit(X_train, y_train, classes=np.unique(labels))

# Save model
with open("../models/sgd_model.pkl", "wb") as f:
    pickle.dump(clf, f)

# -------------------------------
# 6. Evaluate Model
# -------------------------------
y_pred = clf.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


TF-IDF vectorization complete. Features and vectorizer saved.
Classification Report:

              precision    recall  f1-score   support

           0       0.77      0.74      0.75       238
           1       0.71      0.75      0.73       206

    accuracy                           0.74       444
   macro avg       0.74      0.74      0.74       444
weighted avg       0.74      0.74      0.74       444

Confusion Matrix:

[[175  63]
 [ 51 155]]


In [3]:
def convert_to_fasttext_format(df, text_col, label_col, out_path):
    with open(out_path, 'w', encoding='utf-8') as f:
        for text, label in zip(df[text_col], df[label_col]):
            label_str = "__label__positive" if label == 1 else "__label__negative"
            f.write(f"{label_str} {text.strip()}\n")

convert_to_fasttext_format(df, "cleaned_tweet", "label", "fasttext_train.txt")


In [7]:
%pip install fasttext

import fasttext

model = fasttext.train_supervised(input="fasttext_train.txt", epoch=25, lr=1.0, wordNgrams=2, verbose=2)
model.save_model("../models/amharic_fasttext_model.ftz")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Read 0M words
Number of words:  15763
Number of labels: 2
Progress: 100.0% words/sec/thread:  316896 lr:  0.000000 avg.loss:  0.039308 ETA:   0h 0m 0s 0m 0s


In [10]:
result = model.test("fasttext_train.txt")
nexamples, precision, recall = result
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")


Precision: 1.00, Recall: 1.00
