In [None]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Fetch the NLTK corpora needed below: the stop-word lists and the WordNet
# data (plus the Open Multilingual WordNet package) for lemmatization.
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Shared helpers consumed by clean_text(): a lemmatizer instance and a set of
# English stop words (a set gives constant-time membership tests).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise raw text into a space-separated string of lemmatized tokens.

    The text is lowercased, runs of digits are removed, every character in
    ``string.punctuation`` is stripped, and the result is split on
    whitespace. Tokens present in the module-level ``stop_words`` set are
    discarded; the remaining ones are passed through
    ``lemmatizer.lemmatize``.

    Returns:
        ``""`` when ``text`` is not a ``str``; the sentinel ``"empty"`` when
        no token survives the filtering; otherwise the surviving lemmas
        joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = without_digits.translate(punctuation_table)

    kept = []
    for token in stripped.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    if not kept:
        # Sentinel value: callers use it to recognise text with no content.
        return "empty"
    return " ".join(kept)

# Load the labelled tweets and keep only the text and target columns,
# discarding rows where either is missing.
dataset = pd.read_csv("Hate_Speech_Detection_Dataset.csv")
dataset = dataset[['tweet', 'class']]
dataset = dataset.dropna()

# Normalise each tweet. clean_text() returns the sentinel "empty" when no
# token survives cleaning; such rows carry no features, so drop them.
dataset['clean_text'] = dataset['tweet'].apply(clean_text)
dataset = dataset[dataset['clean_text'] != "empty"]

# Stratified 80/20 split so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['clean_text'],
    dataset['class'],
    test_size=0.2,
    random_state=42,
    stratify=dataset['class'],
)

# Fit the TF-IDF vocabulary on the training split only, then reuse it for the
# test split so no test-set information leaks into the features.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Oversample the training set only. The seed is pinned like every other
# stochastic step here; without it the synthetic samples (and therefore the
# reported metrics) change from run to run.
smote = SMOTE(random_state=42)
X_train_tfidf, y_train = smote.fit_resample(X_train_tfidf, y_train)

model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train_tfidf, y_train)

# Evaluate on the untouched (non-resampled) test split.
y_pred = model.predict(X_test_tfidf)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

def detect_hate_speech(text):
    """Classify ``text`` with the trained model and return a readable label.

    The input is normalised with ``clean_text``, turned into TF-IDF features
    by the fitted module-level ``vectorizer``, and passed to the module-level
    ``model``.

    Returns:
        One of ``"Hate Speech"``, ``"Offensive Language"`` or
        ``"Not Offensive"``, looked up from the predicted class id.

    Raises:
        KeyError: if the model predicts a class id other than 0, 1 or 2.
    """
    class_names = {0: "Hate Speech", 1: "Offensive Language", 2: "Not Offensive"}
    features = vectorizer.transform([clean_text(text)])
    predicted_class = model.predict(features)[0]
    return class_names[predicted_class]

# Interactive loop: classify each line the user types until they enter
# 'exit' (any case, surrounding whitespace ignored) or input is no longer
# available.
while True:
    try:
        user_input = input("Enter a text (or type 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the user interrupted the prompt: stop cleanly
        # instead of ending the session with a traceback.
        break
    command = user_input.strip()
    if command.lower() == 'exit':
        break
    if not command:
        # clean_text() turns blank input into the placeholder "empty", which
        # would then be classified as if it were a real word; ask again.
        continue
    print("Prediction:", detect_hate_speech(user_input))


In [None]:
# Count the missing values in each column of `dataset`.
dataset.isnull().sum()

In [None]:
# Print a concise structural summary of `dataset` (columns, non-null counts,
# dtypes).
dataset.info()

In [None]:
# Show descriptive statistics for `dataset`.
# NOTE(review): with pandas defaults this presumably covers only the numeric
# columns (e.g. `class`), not the text columns; confirm against the output.
dataset.describe()
