In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the dataset from a CSV file located relative to the notebook's working
# directory. Later cells read its 'tweet' and 'class' columns.
DATASET_PATH = "hate_Data.csv"
data = pd.read_csv(DATASET_PATH)

In [3]:
def clean_text(text):
    """Normalise a raw tweet into lowercase alphanumeric tokens.

    Steps, in order: lowercase, strip URLs, strip @mentions, drop every
    character that is not a-z, 0-9 or whitespace, then collapse runs of
    whitespace and trim the ends.

    Args:
        text: The raw text. ``None`` and float ``NaN`` (what pandas yields for
            an empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise crash on `.lower()`; `x != x` is the
    # dependency-free NaN check.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove mentions
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove non-alphanumeric
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [4]:
# Normalise every tweet with clean_text and make sure the label column holds
# plain integers before the columns are written back onto the frame.
cleaned_tweets = data['tweet'].apply(clean_text)
integer_labels = data['class'].astype(int)
data['tweet'] = cleaned_tweets
data['class'] = integer_labels


In [5]:
vectorizer = TfidfVectorizer(max_features=5000,ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(data['tweet'])
y = data['class']



In [None]:

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
model = LogisticRegression(class_weight='balanced',max_iter=1000)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred)*100)

print(classification_report(y_test, y_pred, target_names=['Hate Speech', 'Offensive Language', 'Neither']))


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Fix the class order explicitly: without `labels`, confusion_matrix only
# includes the labels that actually occur, so the three hard-coded tick names
# could end up attached to the wrong rows/columns (or to a smaller matrix).
class_ids = [0, 1, 2]
class_names = ['Hate Speech', 'Offensive Language', 'Neither']
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

# Plot the heatmap (rows = true class, columns = predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix Heatmap')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

In [7]:
# Maps the integer class id predicted by the model to its display name.
pred_val = {
    0: "Hate Speech",
    1: "Offensive Language",
    2: "Neither",
}

In [8]:
def predict_hate_speech(tweet):
    """Classify a single piece of text and return its class name.

    The text is cleaned with ``clean_text``, vectorised with the already
    fitted ``vectorizer``, scored by ``model``, and the predicted class id is
    looked up in ``pred_val``.

    Args:
        tweet: The raw text to classify.

    Returns:
        The ``pred_val`` entry for the predicted class id.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([clean_text(tweet)])
    label_id = model.predict(features)[0]
    return pred_val[label_id]

In [9]:
# Interactive prompt: classify whatever the user types until they enter 'end'.
while True:
    try:
        # Strip so that "end " / " END" still count as the exit keyword.
        user_input = input("Enter text to check (or 'end' to exit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the prompt was interrupted: leave the loop
        # instead of surfacing a traceback.
        break
    if user_input.lower() == 'end':
        break
    if not user_input:
        # Nothing to classify; prompt again rather than scoring empty text.
        continue
    result = predict_hate_speech(user_input)
    print(f"Prediction: {result}\n")

In [None]:
import matplotlib.pyplot as plt

# Map class numbers to class names, and give each class a fixed colour
class_labels = {0: "Hate Speech", 1: "Offensive Language", 2: "Neither"}
class_colors = ['red', 'orange', 'blue']

# Count the occurrences of each class. value_counts() orders by frequency, so
# reindex into class-id order; otherwise the positional colour list (and the
# bar order) would follow whichever class happens to be most common.
class_counts = data['class'].value_counts().reindex(list(class_labels), fill_value=0)
class_counts.index = class_counts.index.map(class_labels)

# Plotting the bar chart
fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind='bar', color=class_colors, rot=0, fontsize=10, ax=ax)
ax.set_title("Distribution of Classes in the Dataset", fontsize=14)
ax.set_xlabel("Class", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()

# Save or show the plot
# plt.savefig("class_distribution.png")  # Save as an image file (optional)
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Take both the label ids and their display names from pred_val so the matrix
# rows/columns and the tick labels are guaranteed to be in the same order,
# even if some class is missing from y_test / y_pred.
label_ids = list(pred_val.keys())
label_names = list(pred_val.values())

cm = confusion_matrix(y_test, y_pred, labels=label_ids)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.show()


In [11]:
# Uncomment the lines below to save the trained model and the fitted TF-IDF vectorizer to disk.

# import pickle
# with open("hate_speech_model.pkl","wb") as model_file:
#     pickle.dump(model,model_file)
    
# with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
#     pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Sample comments for a quick manual check of the classifier.
# NOTE(review): the three groups look intended to cover hateful, offensive and
# neutral text respectively, but no expected label is asserted — confirm intent.
hateful_examples = [
    "I can't stand people with black skin! They should all disappear.",
    "You're worthless because of your race. Go back to where you came from.",
    "All muslims are terrorists and should be banned.",
]
offensive_examples = [
    "You're such an idiot, you can't even do anything right!",
    "Shut up, you moron. Nobody cares about your opinion.",
    "This project is shit and you are dumb!",
]
neutral_examples = [
    "My name is Aditya, and I love programming.",
    "This is a great day to go for a walk!",
    "I think this topic is very interesting and deserves more attention.",
]
test_comments = hateful_examples + offensive_examples + neutral_examples

# Run every sample through the classifier and print the comment, the predicted
# class name and a separator line.
separator = "-" * 50
print("Testing Hate Speech Detection:")
for comment in test_comments:
    prediction = predict_hate_speech(comment)
    print(f"Comment: {comment}", f"Prediction: {prediction}", separator, sep="\n")
