In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [9]:
# Step 1: Create the Sample Dataset
# Each sentence is kept next to its label so the two cannot drift out of
# alignment; the pairs are then split into the parallel 'text' / 'label'
# lists that `data` exposes.
_labelled_sentences = [
    ("Mnatukana watu", 'hate'),
    ("Tuko pamoja katika amani", 'non-hate'),
    ("Ujinga wa mtu", 'hate'),
    ("Jambo la kawaida", 'non-hate'),
    ("Nyinyi ni wapumbavu", 'hate'),
    ("Hatuwezi kuishi pamoja", 'hate'),
    ("Ninapenda nchi yangu", 'non-hate'),
    ("Tuungane katika amani", 'non-hate'),
    ("Watu mnafanya vitu vibaya", 'hate'),
    ("Kila mtu anahitaji haki", 'non-hate'),
]
data = {
    'text': [pair[0] for pair in _labelled_sentences],
    'label': [pair[1] for pair in _labelled_sentences],
}

In [10]:
# Turn the sample dictionary into a DataFrame: one column per key
# ('text', 'label'), one row per sentence.
df = pd.DataFrame(data=data)

# Optional sanity check: print the full frame as plain text
print(df)

                        text     label
0             Mnatukana watu      hate
1   Tuko pamoja katika amani  non-hate
2              Ujinga wa mtu      hate
3           Jambo la kawaida  non-hate
4        Nyinyi ni wapumbavu      hate
5     Hatuwezi kuishi pamoja      hate
6       Ninapenda nchi yangu  non-hate
7      Tuungane katika amani  non-hate
8  Watu mnafanya vitu vibaya      hate
9    Kila mtu anahitaji haki  non-hate


In [11]:
# Step 2: Preprocess the Data
# Normalise the 'text' column in one chained pass; the steps run in the
# order written and the result overwrites the column in place.
df['text'] = (
    df['text']
    .str.lower()                               # case-fold everything
    .str.replace(r'[^\w\s]', '', regex=True)   # drop chars that are neither word chars nor whitespace
    .str.replace(r'\d+', '', regex=True)       # drop runs of digits
)

In [12]:
# Step 3: Train-Test Split
# X holds the cleaned sentences, y the matching 'hate' / 'non-hate' labels.
X = df['text']
y = df['label']

# Hold out 20% of the rows for evaluation. The split is stratified on the
# label so both classes are guaranteed to appear in the held-out set; with
# so few rows an unstratified split can leave a class out of the test data
# and make the per-class metrics meaningless. random_state pins the shuffle
# so a fresh "Restart & Run All" reproduces the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [13]:
# Step 4: Feature Extraction (TF-IDF)
# The vectorizer keeps at most 5000 features.
tfidf = TfidfVectorizer(max_features=5000)
# Fit on the training sentences only, then reuse that fitted vectorizer to
# encode the held-out sentences.
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf.transform(raw_documents=X_test)


In [14]:
# Step 5: Train the Model (Logistic Regression)
# The classifier is created with its default settings and fitted on the
# TF-IDF training matrix and the training labels.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)


In [15]:
# Step 6: Evaluate the Model
# Predict a label for every held-out sentence using its TF-IDF encoding.
y_pred = model.predict(X=X_test_tfidf)


In [16]:
# Report how the predictions compare with the held-out labels:
# overall accuracy first, then the per-class precision/recall/F1 table.
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
report = classification_report(y_test, y_pred)
print('Classification Report:\n', report)

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

        hate       1.00      1.00      1.00         1
    non-hate       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [17]:
import joblib

# Write the fitted classifier and the fitted TF-IDF vectorizer to separate
# pickle files (relative paths, so they land in the working directory).
joblib.dump(value=model, filename='hate_speech_model.pkl')
# The vectorizer is dumped last, so its return value is what the cell displays.
joblib.dump(value=tfidf, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']