In [20]:
#Model 1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load dataset
df = pd.read_csv("/kaggle/input/dataset-hate-speech/labeled_data.csv")
df = df.rename(columns={"class": "label", "tweet": "text"})
df = df[["text", "label"]]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

# Vectorize text
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

# Predict & evaluate
y_pred = model.predict(X_test_vec)

# Predict on new text
def predict(text):
    vec = vectorizer.transform([text])
    pred = model.predict(vec)[0]
    label_map = {0: "hate", 1: "offensive", 2: "neutral"}
    return label_map.get(pred, "unknown")

# Test sample
acc = accuracy_score(y_test, y_pred)
print(acc)
sample_text = "I hate those people!"
print("Prediction:", predict(sample_text))


0.893483962073835
Prediction: hate


In [21]:
#Model 2
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

# Load dataset
df = pd.read_csv("/kaggle/input/dataset-hate-speech/labeled_data.csv")
df = df.rename(columns={"class": "label", "tweet": "text"})
df = df[["text", "label"]]
df["label"] = df["label"].astype(int)

# Split data
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

# Load Sentence Transformer model
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim vectors, fast

# Encode text
X_train_vec = embedder.encode(X_train.tolist(), show_progress_bar=True, convert_to_numpy=True)
X_test_vec = embedder.encode(X_test.tolist(), show_progress_bar=True, convert_to_numpy=True)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_vec, y_train)

# Evaluate
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred, target_names=["hate", "offensive", "neutral"]))

# Predict function
def predict(text):
    vec = embedder.encode([text])
    pred = clf.predict(vec)[0]
    label_map = {0: "hate", 1: "offensive", 2: "neutral"}
    return label_map[pred]

# Example prediction
print("Prediction:", predict("I hate those people."))



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/620 [00:00<?, ?it/s]

Batches:   0%|          | 0/155 [00:00<?, ?it/s]

              precision    recall  f1-score   support

        hate       0.46      0.16      0.24       290
   offensive       0.91      0.96      0.93      3832
     neutral       0.81      0.79      0.80       835

    accuracy                           0.88      4957
   macro avg       0.73      0.64      0.66      4957
weighted avg       0.87      0.88      0.87      4957



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Prediction: hate


In [22]:
#Model 3
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.utils.class_weight import compute_class_weight

# Load dataset
df = pd.read_csv("/kaggle/input/dataset-hate-speech/labeled_data.csv")
df = df.rename(columns={"class": "label", "tweet": "text"})
df = df[["text", "label"]]
df["label"] = df["label"].astype(int)

# Split data
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42)

# Load Sentence Transformer model
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim vectors, fast

# Encode text
X_train_vec = embedder.encode(X_train.tolist(), show_progress_bar=True, convert_to_numpy=True)
X_test_vec = embedder.encode(X_test.tolist(), show_progress_bar=True, convert_to_numpy=True)

# Train classifier

weights = compute_class_weight(class_weight='balanced', classes=[0, 1, 2], y=y_train)
class_weights = {i: w for i, w in enumerate(weights)}

clf = LogisticRegression(class_weight=class_weights, max_iter=1000)
clf.fit(X_train_vec, y_train)

# Evaluate
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred, target_names=["hate", "offensive", "neutral"]))

# Predict function
def predict(text):
    vec = embedder.encode([text])
    pred = clf.predict(vec)[0]
    label_map = {0: "hate", 1: "offensive", 2: "neutral"}
    return label_map[pred]

# Example prediction
print("Prediction:", predict("I hate those people."))

Batches:   0%|          | 0/620 [00:00<?, ?it/s]

Batches:   0%|          | 0/155 [00:00<?, ?it/s]

              precision    recall  f1-score   support

        hate       0.24      0.69      0.36       290
   offensive       0.97      0.79      0.87      3832
     neutral       0.71      0.87      0.78       835

    accuracy                           0.79      4957
   macro avg       0.64      0.78      0.67      4957
weighted avg       0.88      0.79      0.82      4957



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Prediction: hate
