<a href="https://colab.research.google.com/github/Afiya-0825/cyberbullying-tweet-classifier/blob/main/cyberbullying_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import re

# --- Hand-written training samples ---
# Messages for the "bullying" class: insults, threats, harassment and
# self-harm encouragement. Every entry here is labelled "bullying" when the
# dataset is assembled further down.
# NOTE: straight (') and curly (’) apostrophes are mixed on purpose or by
# accident -- any text cleaning has to cope with both.
bully_samples = [
    "You're so stupid and ugly", "Nobody wants you around", "I hate you",
    "Why don't you just disappear?", "Everyone hates you", "You are a total failure",
    "You’re so fat it’s disgusting", "Go kill yourself", "You're such a loser",
    "Stop pretending to be important", "You're annoying and useless",
    "Nobody cares about your opinion", "You're a joke", "Shut up already",
    "No one likes you anyway", "You look like trash", "You're a pathetic excuse",
    "Get lost freak", "I hope you fall off a cliff", "This is why nobody loves you",
    "You're the dumbest person I've ever met", "Why are you even alive?",
    "You're a pathetic joke", "You ruin everything", "Go cry somewhere else",
    "You're nothing but a failure", "I hate everything about you", "You deserve all the hate you get",
    "Shut up, no one asked you", "Stop embarrassing yourself", "Your voice is annoying",
    "Just stop talking forever", "You're a complete idiot", "You're not even funny, just sad",
    "You're worse than useless", "That outfit makes you look dumb", "You're trash at this game",
    "You’ll never be good enough", "No one wants to be your friend", "Just quit already",
    "You're the reason everything fails", "Disgusting person", "I want to fuck you",
    "Shut up and die", "You're a dumb bitch", "You’re a pathetic loser",
    "No one gives a damn about you", "You're annoying af", "Go fuck yourself",
    "You're ugly and disgusting", "Stop acting like a slut"
]

# Messages for the "not bullying" class: compliments, thanks, encouragement
# and neutral friendly chat. Every entry here is labelled "not bullying" when
# the dataset is assembled further down. Some entries contain emoji and curly
# apostrophes (’), which the text cleaning step has to handle.
non_bully_samples = [
    "You're doing amazing, keep going 💪", "Let’s catch up soon!", "Congratulations on your win!",
    "That’s such a beautiful photo!", "I'm here if you need anything ❤️", "You are really talented!",
    "Thanks for helping me today 😊", "Wishing you a fantastic day!", "You inspire me every day",
    "What a wonderful idea!", "You're so thoughtful", "Great effort on the project",
    "That’s impressive work!", "You made my day better", "I appreciate your support",
    "Thanks for being kind", "You’re very intelligent", "That’s really generous of you",
    "You’re a wonderful friend", "That was really brave of you", "Always love your positivity",
    "I admire your honesty", "I’m glad you’re in my life", "Keep smiling, you're amazing 😄",
    "Such a kind message!", "You're the best!", "Well done on your progress!",
    "Proud of what you've achieved", "This is very inspiring", "Nice game, well played",
    "You are amazing!", "You're doing great", "Have a nice day",
    "That was really helpful, thank you!", "Keep up the good work",
    "I love your creativity", "What a beautiful message",
    "This made my day ❤️", "You’re such a kind soul",
    "Awesome job!", "Well done on your success",
    "Let’s meet for coffee soon!", "Thanks for your support",
    "You are so sweet", "Wishing you the best always",
    "Proud of you!", "That was thoughtful", "You inspire me",
    "Thanks again for helping me", "You’re truly a good friend","I like you", "You are nice", "I respect you", "You make me happy", "You are beautiful"

]

# Custom stopwords
# Contractions are stored WITH a straight apostrophe ("you're", "don't"), so
# clean_text must compare tokens against this set before it removes
# apostrophes from them -- otherwise those entries can never match.
stop_words = set("""
a about above after again against all am an and any are aren't as at be because been
before being below between both but by can't cannot could couldn't did didn't do does
doesn't doing don't down during each few for from further had hadn't has hasn't have
haven't having he he'd he'll he's her here here's hers herself him himself his how
how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most
mustn't my myself no nor not of off on once only or other ought our ours ourselves out
over own same shan't she she'd she'll she's should shouldn't so some such than that
that's the their theirs them themselves then there there's these they they'd they'll
they're they've this those through to too under until up very was wasn't we we'd we'll
we're we've were weren't what what's when when's where where's which while who who's
whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves
""".split())

# Compiled once at import time: clean_text runs on every sample and every query.
_URL_RE = re.compile(r"http\S+|www\S+")
_MENTION_OR_HASHTAG_RE = re.compile(r"@\w+|#\w+")
# Everything that is not a word character, whitespace or a straight apostrophe.
_PUNCT_EXCEPT_APOSTROPHE_RE = re.compile(r"[^\w\s']")
# Typographic apostrophes (‘ ’) mapped to the ASCII one used in stop_words.
_CURLY_APOSTROPHES = str.maketrans({"\u2018": "'", "\u2019": "'"})


# Text cleaning
def clean_text(text: str) -> str:
    """Normalise a message into a space-separated string of content words.

    Steps, in order: lowercase; map curly apostrophes to straight ones; drop
    URLs, @mentions and #hashtags; drop punctuation/emoji except apostrophes;
    remove stop words; finally strip the remaining apostrophes from the
    surviving tokens.

    Apostrophes are deliberately kept until *after* the stop-word check.
    Removing them first turns "you're" into "youre", which is not in
    ``stop_words``, so contractions would leak through as features.

    Args:
        text: Raw message text.

    Returns:
        The cleaned text; an empty string if nothing survives.
    """
    text = text.lower().translate(_CURLY_APOSTROPHES)
    text = _URL_RE.sub("", text)
    text = _MENTION_OR_HASHTAG_RE.sub("", text)
    text = _PUNCT_EXCEPT_APOSTROPHE_RE.sub("", text)

    kept = []
    for token in text.split():
        # Apostrophes at the edges are quoting, not part of a contraction.
        token = token.strip("'")
        if not token or token in stop_words:
            continue
        # Inner apostrophes go last, matching the old output for non-stop
        # words (e.g. "rock'n'roll" -> "rocknroll").
        kept.append(token.replace("'", ""))
    return " ".join(kept)

# Dataset
# Raw texts and their labels are built in the same order (all bullying
# samples first, then all non-bullying ones), so X[i] pairs with y[i].
X = bully_samples + non_bully_samples
y = ["bullying"] * len(bully_samples) + ["not bullying"] * len(non_bully_samples)
X_clean = [clean_text(text) for text in X]

# Train/Test split
# stratify=y keeps the class ratio the same in both splits. With only about a
# hundred samples, an unstratified 20% hold-out can end up noticeably skewed
# towards one class, which makes the evaluation report unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
# TF-IDF features (vocabulary capped at 5000 terms) feeding a logistic
# regression classifier; the pipeline is fitted on the training split only.
model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("clf", LogisticRegression())
])
model.fit(X_train, y_train)

# Evaluation
# Per-class precision/recall/F1 on the held-out split, kept as text so it can
# be printed later.
y_pred = model.predict(X_test)
eval_report = classification_report(y_test, y_pred)

# Predict user input
def predict_tweet(tweet):
    """Classify one message and report the model's confidence in that label.

    The message is cleaned with ``clean_text`` and scored by the fitted
    ``model`` pipeline. The label is "bullying" when the predicted
    probability of the "bullying" class is at least 0.5.

    Args:
        tweet: Raw message text.

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is "bullying" or
        "not bullying". ``confidence`` is the probability of the returned
        label as a percentage rounded to two decimals, so it is always
        >= 50 and never contradicts the label.
    """
    cleaned = clean_text(tweet)
    proba = model.predict_proba([cleaned])[0]
    # Look the column up by name rather than assuming class order.
    classes = list(model.named_steps["clf"].classes_)
    bully_proba = float(proba[classes.index("bullying")])

    if bully_proba >= 0.5:
        return "bullying", round(bully_proba * 100, 2)
    # The model is trained on exactly two labels, so the probability of
    # "not bullying" is the complement. Returning P(bullying) here would
    # show e.g. "NOT BULLYING, 12%", which reads as low confidence when the
    # model is in fact 88% sure.
    return "not bullying", round((1.0 - bully_proba) * 100, 2)

# ----------- 🔎 RUN INTERACTIVELY -----------
# Read one message from the user, classify it, then show the prediction
# followed by the evaluation report computed on the held-out test data.
user_input = input("Enter a tweet/message: ")
label, confidence = predict_tweet(user_input)

# One print call per section; sep="\n" puts each piece on its own line.
print(
    "\n--- Prediction ---",
    f"Label      : {label.upper()}",
    f"Confidence : {confidence}%",
    sep="\n",
)
print("\n--- Evaluation Report on Test Data ---", eval_report, sep="\n")


Enter a tweet/message: nobody likes you

--- Prediction ---
Label      : BULLYING
Confidence : 58.78%

--- Evaluation Report on Test Data ---
              precision    recall  f1-score   support

    bullying       0.89      0.73      0.80        11
not bullying       0.77      0.91      0.83        11

    accuracy                           0.82        22
   macro avg       0.83      0.82      0.82        22
weighted avg       0.83      0.82      0.82        22

