In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib

# Step 1: Load the dataset from an Excel workbook.
# The rest of the script reads two columns from it: 'comment' (raw text)
# and 'label' (the class to predict).
data = pd.read_excel(
    io='P:/Aswath/final/RDF_Project/unique_cyberbullying_comments.xlsx',
)

# Step 2: Preprocess the text (lowercase, remove non-alphabetic characters)
def preprocess_text(text):
    """Normalize a raw comment before it is vectorized.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is removed, and runs of whitespace are collapsed to a single
    space with leading/trailing whitespace stripped.

    Args:
        text: The raw comment. Normally a ``str``; non-string values are
            tolerated so a single bad cell does not abort the whole
            ``apply``: missing values (``None``/NaN) become ``''`` and any
            other value is converted with ``str()``.

    Returns:
        The cleaned string, which may be empty.
    """
    if not isinstance(text, str):
        # pandas represents empty cells as NaN (a float), which has no
        # .lower(); numeric cells would fail the same way.
        text = '' if pd.isna(text) else str(text)
    text = text.lower()  # Case-fold so "Hate" and "hate" share one token
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Drop digits, punctuation, non-ASCII
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse whitespace runs
    return text

# Clean every comment in place using preprocess_text
data['comment'] = data['comment'].map(preprocess_text)

# Step 3: Hold out 20% of the rows for evaluation
# X holds the cleaned comment text; y holds the class label for each comment
# (the recorded report lists the classes as 'bullying' / 'non-bullying').
X, y = data['comment'], data['label']

# random_state is fixed so the same rows land in each partition on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Turn the text into TF-IDF feature vectors
# The vectorizer is fitted on the training comments only; the test comments
# are merely transformed with that fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)  # Tune max_features to the dataset size
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Fit a Multinomial Naive Bayes classifier (other classifiers can be swapped in)
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Step 6: Score the held-out comments and print per-class metrics
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Step 7: Persist the fitted classifier and vectorizer to the working directory
# Both are needed later: new text must be transformed by this same vectorizer
# before the model can score it.
joblib.dump(value=model, filename='bullying_model.pkl')
joblib.dump(value=vectorizer, filename='vectorizer.pkl')

print("Model and vectorizer saved successfully.")


              precision    recall  f1-score   support

    bullying       1.00      1.00      1.00       203
non-bullying       1.00      1.00      1.00       197

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

Model and vectorizer saved successfully.
