 TF-IDF Vectorization for Amharic Sentiment Analysis and logical regression with train_test_split

In [2]:
# 02_modeling.ipynb

# -------------------------------
# 1. Imports
# -------------------------------
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import os

# -------------------------------
# 2. Load Cleaned Dataset
# -------------------------------
df = pd.read_csv("../data/processed/cleaned_amharic_sentiment.csv")

# Drop rows with missing values
df.dropna(subset=["cleaned_tweets", "sentiment"], inplace=True)

texts = df["cleaned_tweets"].astype(str).tolist()
labels = df["sentiment"].tolist()

# -------------------------------
# 3. TF-IDF Vectorization
# -------------------------------
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)

X = vectorizer.fit_transform(texts)

# Save the TF-IDF matrix and labels
os.makedirs("../data/processed", exist_ok=True)
with open("../data/processed/tfidf_features.pkl", "wb") as f:
    pickle.dump(X, f)

with open("../data/processed/labels.pkl", "wb") as f:
    pickle.dump(labels, f)

# Save the vectorizer itself for use in Streamlit
os.makedirs("../models", exist_ok=True)
with open("../models/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("TF-IDF vectorization complete. Features and vectorizer saved.")

# -------------------------------
# 4. Train/Test Split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

# -------------------------------
# 5. Train Logistic Regression Model
# -------------------------------
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Save trained model
with open("../models/logistic_model.pkl", "wb") as f:
    pickle.dump(clf, f)

# -------------------------------
# 6. Evaluate Model
# -------------------------------
y_pred = clf.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))


TF-IDF vectorization complete. Features and vectorizer saved.
Classification Report:

              precision    recall  f1-score   support

       mixed       0.00      0.00      0.00       102
    negative       0.43      0.22      0.29       299
     neutral       0.53      0.86      0.65       598
    positive       0.58      0.29      0.39       258

    accuracy                           0.52      1257
   macro avg       0.38      0.34      0.33      1257
weighted avg       0.47      0.52      0.46      1257

Confusion Matrix:

[[  0  14  79   9]
 [  2  67 215  15]
 [  0  54 512  32]
 [  0  21 161  76]]
