<a href="https://colab.research.google.com/github/Anantika0410/Hate-Text-Classification/blob/main/Hate_Speech_Classifier_Final_Assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing & Importing Required Libraries

joblib is used to save and load trained models.

Other imports like pandas, nltk, sklearn, etc., are required for:

Reading data

Cleaning text

NLP operations

Building and evaluating ML models

In [None]:
!pip install -q joblib

import html
import re

import joblib
import nltk
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


# Uploading the Dataset
It contains tweets labeled as:

0 = Hate Speech

1 = Offensive Language

2 = Neither

In [None]:
# Interactive, Colab-only step: the user picks the dataset file by hand.
# The recorded cell output shows it being saved as hate_speech.csv, which is
# the relative path the next cell reads with pd.read_csv.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; nothing later in the
# notebook reads it.
uploaded = files.upload()


Saving hate_speech.csv to hate_speech.csv


In [None]:
import pandas as pd

# Location of the CSV placed in the working directory by the upload cell.
DATA_PATH = 'hate_speech.csv'

# Load the labelled tweets and preview the first five rows.
df = pd.read_csv(DATA_PATH)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Downloading NLTK Resources
Stopwords = Common words (like "the", "is") that are removed.

WordNet Lemmatizer = Reduces words to their base form (e.g., "running" → "run")




In [None]:
# Fetch the NLTK resources the preprocessing cell relies on:
#   'stopwords' -> read via stopwords.words('english')
#   'wordnet'   -> data for WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Text Preprocessing Function and Cleaning the Dataset
Converts text to lowercase

Removes URLs, mentions, hashtags, special characters

Removes stopwords

Lemmatizes words

*This step cleans up the tweet so the model can understand its meaning better*

- Applies the `clean_text` function to every tweet in the dataset.

- Stores the cleaned version in a new column called `cleaned_text`.

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalise a raw tweet into a space-separated string of lemmatised tokens.

    Processing steps, in order:

    1. Decode HTML entities (``&amp;`` -> ``&``, ``&#8230;`` -> ``…``) so that
       entity names such as "amp" do not survive as bogus tokens.
    2. Lower-case the text.
    3. Replace URLs, @mentions and #hashtags with a space.
    4. Replace every remaining character that is not ``a-z`` or whitespace
       with a space. Using a space (not the empty string) keeps words that
       were separated only by punctuation apart ("cold...tyga" stays two
       tokens) and splits contractions at the apostrophe ("shouldn't" ->
       "shouldn", "t") so that the stopword filter can drop the pieces.
    5. Drop tokens found in ``stop_words`` and lemmatise the rest.

    Parameters
    ----------
    text : Any
        The raw tweet. It is coerced with ``str()``, so non-string values
        are cleaned via their string representation.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    text = html.unescape(str(text)).lower()
    # Strip whole URL / mention / hashtag tokens first, while their marker
    # characters (':', '@', '#') are still present.
    text = re.sub(r"http\S+|@\S+|#\S+", ' ', text)
    # Everything else that is not a lowercase letter becomes a word boundary.
    text = re.sub(r"[^a-z\s]", ' ', text)
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return " ".join(words)


# Add the cleaned text as a new column next to the raw tweet.
df['cleaned_text'] = df['tweet'].apply(clean_text)
df.head()


Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,cleaned_text
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,rt woman shouldnt complain cleaning house amp ...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,rt boy dat coldtyga dwn bad cuffin dat hoe st ...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,rt dawg rt ever fuck bitch start cry confused ...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,rt look like tranny
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,rt shit hear might true might faker bitch told ya


# Feature Extraction with TF-IDF

TF-IDF (Term Frequency–Inverse Document Frequency) converts text into numerical values.

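It weights a word more heavily when it appears often within a tweet but rarely across the other tweets, so distinctive words count for more than words that appear everywhere.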

`max_features=5000`: keep only the 5,000 terms that occur most frequently across the whole corpus.

In [None]:
# Target labels: Hate speech (0), Offensive (1), or Neither (2).
y = df['class']

# Turn the cleaned tweets into a TF-IDF matrix with a vocabulary capped at
# 5000 terms; the fitted vectorizer is kept so it can be saved and reused.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(raw_documents=df['cleaned_text'])


# Train-Test Split

In [None]:

# `X` is the TF-IDF matrix built in the feature-extraction cell. It must NOT
# be re-assigned to df["tweet"] here: the classifiers trained below need
# numeric features and cannot be fit on raw strings.
y = df["class"]

# Perform 80/20 train-test split.
# The raw tweets go through the same call, so they are shuffled identically
# to the feature rows; they are only used to print readable samples.
# NOTE: the vectorizer was fit on the full corpus before this split, so its
# IDF weights also reflect test tweets. TODO: consider fitting it on the
# training portion only for a stricter evaluation.
X_train, X_test, y_train, y_test, tweets_train, tweets_test = train_test_split(
    X, y, df["tweet"], test_size=0.2, random_state=42
)

# Sparse matrices do not support len(); use the row count instead.
total = X.shape[0]
train_len = X_train.shape[0]
test_len = X_test.shape[0]

print("X_train sample:", tweets_train.head().tolist())
print("X_test sample:", tweets_test.head().tolist())
print("y_train sample:", y_train.head().tolist())
print("y_test sample:", y_test.head().tolist())

print(f"\nTotal samples: {total}")
print(f"Training samples: {train_len} ({(train_len / total) * 100:.2f}%)")
print(f"Testing samples: {test_len} ({(test_len / total) * 100:.2f}%)")


X_train sample: ['RT @FunSizedYogi: @TheBlackVoice well how else will white ppl get us to forget our horrific past other than to paint a pretty picture of ho&#8230;', "Funny thing is....it's not just the people doing it. It's the people who seeing these pics and judging the birds. Just as wrong.", 'RT @winkSOSA: "@AintShitSweet__: "@Rakwon_OGOD: Nigga messed with the wrong bitch &#128557;&#128514;https://t.co/5mNXKVAYot" &#128557;&#128557;&#128557;&#128557;&#128514;&#128514;&#128557;&#128557;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;&#128514;"@Th_Real_Esco', '@Jbrendaro30 @ZGabrail @ramsin1995 @GabeEli8 @Jacob2times bitch ass nigggaaa', 'S/o that real bitch']
X_test sample: ['934 8616\ni got a missed call from yo bitch', 'RT @KINGTUNCHI_: Fucking with a bad bitch you gone need some money lil homie!', "RT @eanahS__: @1inkkofrosess lol my credit ain't no where near good , but I know the right man for the job .. that ho nice though!", "RT @Maxin_Betha Wipe the cum out of them 

# Training 3 Machine Learning Models

In [None]:
# Candidate classifiers, keyed by the display name used in the report.
# The dict is read again when the chosen model is saved.
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000)
models['Naive Bayes'] = MultinomialNB()
models['Support Vector Machine'] = LinearSVC()

# Fit each classifier on the training split and print its per-class
# precision / recall / F1 on the held-out split.
for name in models:
    model = models[name]
    print(f"\n {name}")
    preds = model.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test, preds))



 Logistic Regression
              precision    recall  f1-score   support

           0       0.52      0.16      0.24       290
           1       0.91      0.96      0.94      3832
           2       0.83      0.83      0.83       835

    accuracy                           0.89      4957
   macro avg       0.75      0.65      0.67      4957
weighted avg       0.88      0.89      0.88      4957


 Naive Bayes
              precision    recall  f1-score   support

           0       1.00      0.00      0.01       290
           1       0.83      0.99      0.90      3832
           2       0.89      0.38      0.53       835

    accuracy                           0.83      4957
   macro avg       0.91      0.46      0.48      4957
weighted avg       0.85      0.83      0.79      4957


 Support Vector Machine
              precision    recall  f1-score   support

           0       0.44      0.22      0.29       290
           1       0.92      0.95      0.93      3832
           2  

# Saving the Best Model & Vectorizer

In [None]:
# Persist the trained logistic-regression classifier together with the fitted
# vectorizer, so predictions can be made without retraining.
best_model = models['Logistic Regression']
joblib.dump(value=best_model, filename='model.pkl')
joblib.dump(value=vectorizer, filename='vectorizer.pkl')


['vectorizer.pkl']

# Prediction Function

In [None]:
def predict_hate(text, model_path='model.pkl', vectorizer_path='vectorizer.pkl'):
    """Classify a single piece of text with the saved model.

    The text goes through the same ``clean_text`` preprocessing used for the
    training data, is vectorised with the saved vectorizer, and is then
    passed to the saved classifier.

    Parameters
    ----------
    text : str
        Raw input text (e.g. a tweet).
    model_path : str, optional
        Path of the joblib file holding the trained classifier.
        Defaults to ``'model.pkl'``, the file written by the saving cell.
    vectorizer_path : str, optional
        Path of the joblib file holding the fitted vectorizer.
        Defaults to ``'vectorizer.pkl'``.

    Returns
    -------
    The predicted class label for ``text``; in this notebook the labels are
    0 = hate speech, 1 = offensive language, 2 = neither.

    Notes
    -----
    ``joblib.load`` unpickles the files, which can execute arbitrary code.
    Only point the path arguments at artifacts you created or trust.
    Both artifacts are re-read on every call, so a call always reflects the
    files currently on disk.
    """
    model = joblib.load(model_path)
    vec = joblib.load(vectorizer_path)
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vec.transform([clean_text(text)])
    return model.predict(features)[0]


predict_hate("I hate you and your kind  !")



np.int64(1)