 TF-IDF Vectorization for Amharic Sentiment Analysis and logical regression with train_test_split

In [11]:
# 02_modeling.ipynb

# -------------------------------
# 1. Imports
# -------------------------------
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import os

# -------------------------------
# 2. Load Cleaned Dataset
# -------------------------------
df = pd.read_csv("../data/processed/amharic_sentiment_cleaned.csv")

# Drop rows with missing values
df.dropna(subset=["cleaned_tweet", "label"], inplace=True)

texts = df["cleaned_tweet"].astype(str).tolist()
labels = df["label"].tolist()

# -------------------------------
# 3. TF-IDF Vectorization
# -------------------------------
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)

X = vectorizer.fit_transform(texts)

# Save the TF-IDF matrix and labels
os.makedirs("../data/processed", exist_ok=True)
with open("../data/processed/tfidf_features.pkl", "wb") as f:
    pickle.dump(X, f)

with open("../data/processed/labels.pkl", "wb") as f:
    pickle.dump(labels, f)

# Save the vectorizer itself for use in Streamlit
os.makedirs("../models", exist_ok=True)
with open("../models/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("TF-IDF vectorization complete. Features and vectorizer saved.")

# -------------------------------
# 4. Train/Test Split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

# -------------------------------
# 5. Train Logistic Regression Model
# -------------------------------
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Save trained model
with open("../models/logistic_model.pkl", "wb") as f:
    pickle.dump(clf, f)

# -------------------------------
# 6. Evaluate Model
# -------------------------------
y_pred = clf.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


TF-IDF vectorization complete. Features and vectorizer saved.
Classification Report:

              precision    recall  f1-score   support

           0       0.72      0.85      0.78       239
           1       0.78      0.61      0.68       206

    accuracy                           0.74       445
   macro avg       0.75      0.73      0.73       445
weighted avg       0.75      0.74      0.73       445

Confusion Matrix:

[[203  36]
 [ 80 126]]
