In [182]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [183]:
data = pd.read_csv("hate_Data.csv")

In [184]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove non-alphanumeric
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [185]:
data['tweet'] = data['tweet'].apply(clean_text)
data['class'] = data['class'].astype(int)


In [186]:
print(data.head())

   count  hate_speech  offensive_language  neither  class  \
0      3            0                   0        3      2   
1      3            0                   3        0      1   
2      3            0                   3        0      1   
3      3            0                   2        1      1   
4      6            0                   6        0      1   

                                               tweet  
0  rt as a woman you shouldnt complain about clea...  
1  rt boy dats coldtyga dwn bad for cuffin dat ho...  
2  rt dawg rt you ever fuck a bitch and she start...  
3                          rt she look like a tranny  
4  rt the shit you hear about me might be true or...  


In [187]:
vectorizer = TfidfVectorizer(max_features=5000,ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(data['tweet'])
y = data['class']

In [188]:

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = LogisticRegression(class_weight='balanced',max_iter=1000)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred)*100)

print(classification_report(y_test, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Neither']))


Accuracy: 84.14363526326407
                    precision    recall  f1-score   support

       Hate Speech       0.30      0.60      0.39       290
Offensive Language       0.97      0.84      0.90      3832
           Neither       0.74      0.91      0.82       835

          accuracy                           0.84      4957
         macro avg       0.67      0.78      0.71      4957
      weighted avg       0.89      0.84      0.86      4957



In [189]:
pred_val = {0:"Hate Speech",1:"Offensive Language",2:"Neither"}

In [190]:
def predict_hate_speech(tweet):
    cleaned_tweet = clean_text(tweet)
    vectorized_tweet = vectorizer.transform([cleaned_tweet])
    prediction = model.predict(vectorized_tweet)[0]
    # return class_mapping[prediction]
    return pred_val[prediction]

In [191]:
# while True:
#     user_input = input("Enter text to check (or 'end' to exit): ")
#     if user_input.lower() == 'end':
#         break
#     result = predict_hate_speech(user_input)
#     print(f"Prediction: {result}\n")

Prediction: Neither



In [192]:
# # # List of test comments
# test_comments = [
#     # Hate Speech Examples
#     "I can't stand people with black skin! They should all disappear.",
#     "You're worthless because of your race. Go back to where you came from.",
#     "All muslims are terrorists and should be banned.",
    
#     # Offensive Language Examples
#     "You're such an idiot, you can't even do anything right!",
#     "Shut up, you moron. Nobody cares about your opinion.",
#     "This project is shit and you are dumb!",
    
#     # Neutral Comments
#     "My name is Aditya, and I love programming.",
#     "This is a great day to go for a walk!",
#     "I think this topic is very interesting and deserves more attention."
# ]

# # Check each comment and print the prediction
# print("Testing Hate Speech Detection:")
# for comment in test_comments:
#     prediction = predict_hate_speech(comment)
#     print(f"Comment: {comment}")
#     print(f"Prediction: {prediction}")
#     print("-" * 50)


Testing Hate Speech Detection:
Comment: I can't stand people with black skin! They should all disappear.
Prediction: Hate Speech
--------------------------------------------------
Comment: You're worthless because of your race. Go back to where you came from.
Prediction: Hate Speech
--------------------------------------------------
Comment: All muslims are terrorists and should be banned.
Prediction: Hate Speech
--------------------------------------------------
Comment: You're such an idiot, you can't even do anything right!
Prediction: Hate Speech
--------------------------------------------------
Comment: Shut up, you moron. Nobody cares about your opinion.
Prediction: Neither
--------------------------------------------------
Comment: This project is shit and you are dumb!
Prediction: Offensive Language
--------------------------------------------------
Comment: My name is Aditya, and I love programming.
Prediction: Neither
--------------------------------------------------
Commen

In [193]:
import pickle
with open("hate_speech_model.pkl","wb") as model_file:
    pickle.dump(model,model_file)
    
with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)