 TF-IDF Vectorization for Amharic Sentiment Analysis

In [5]:


# Import required libraries
import pickle
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [8]:
# Load the cleaned dataset and pull out the text and label columns.
df = pd.read_csv("../data/processed/cleaned_amharic_sentiment.csv")

# Drop only rows that are missing the text or the label. A bare dropna()
# would also discard rows with NaN in columns this notebook never reads.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
df = df.dropna(subset=["cleaned_tweets", "sentiment"])

# astype(str) guards against non-string cells (e.g. purely numeric tweets
# parsed as numbers by read_csv) before they reach the vectorizer.
texts = df["cleaned_tweets"].astype(str).tolist()
labels = df["sentiment"].tolist()


In [9]:
# TF-IDF configuration: cap the vocabulary at 5000 features built from
# unigrams and bigrams. No stop-word filtering is applied for now; a custom
# Amharic stop-word list can be plugged in via "stop_words" later.
tfidf_params = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "stop_words": None,
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary and IDF weights from the corpus and vectorize it
# in a single pass.
X = vectorizer.fit_transform(texts)


In [10]:
# Quick sanity check: matrix dimensions and the first ten vocabulary entries.
matrix_shape = X.shape
sample_features = vectorizer.get_feature_names_out()[:10]
print("TF-IDF Matrix Shape:", matrix_shape)
print("Example features:", sample_features)


TF-IDF Matrix Shape: (6282, 5000)
Example features: ['aalexonline' 'abaaboraa' 'abaaboraa abiyahmedali' 'abakoran'
 'abbaacabsa' 'abbasheger' 'abbasheger nigigebi' 'abbasheger tseday'
 'abbasshash' 'abebeabebayehu']


In [11]:
# Persist the TF-IDF matrix (X), the labels, and the fitted vectorizer as
# pickles. The vectorizer is saved too so the same vocabulary and IDF
# weights can be reused at prediction time.
# NOTE: pickle files execute code on load; only unpickle these artifacts
# from locations you trust.
artifacts = {
    "../data/processed/tfidf_features.pkl": X,
    "../data/processed/labels.pkl": labels,
    "../models/tfidf_vectorizer.pkl": vectorizer,
}

for target, payload in artifacts.items():
    target_path = Path(target)
    # Create the output directory on demand; otherwise a checkout that
    # lacks e.g. ../models/ fails here with FileNotFoundError.
    target_path.parent.mkdir(parents=True, exist_ok=True)
    with target_path.open("wb") as f:
        pickle.dump(payload, f)

print("TF-IDF features and labels saved successfully.")


TF-IDF features and labels saved successfully.
