In [1]:
import pandas as pd
import os

# Build correct path to the cleaned dataset
dataset_path = os.path.join("..", "Data", "cleaned_dataset.csv")

# Check if file exists
print("Looking for:", dataset_path)
print("Exists?", os.path.exists(dataset_path))

# Load dataset
df = pd.read_csv(dataset_path)

# Preview
df.head()


Looking for: ..\Data\cleaned_dataset.csv
Exists? True


Unnamed: 0,ID,comment,label
0,1,aroob jahil said i didnt knew lol i didnt know...,CA
1,2,always randi rona iska but no one take it seri...,CA
2,3,achha dikhana aur english bolna koi kaabliyat ...,CA
3,4,i hate jahil e azam rajab ghatia,CA
4,5,besharam banda he to,CA


In [2]:
df.columns


Index(['ID', 'comment', 'label'], dtype='object')

In [4]:
# Step 3.1: Check for missing comments
print("Missing comments:", df['comment'].isnull().sum())

# Step 3.2: Drop rows with missing comments
df = df.dropna(subset=['comment'])

# Step 3.3: Reset index
df = df.reset_index(drop=True)

print("After removing missing comments, dataset shape:", df.shape)


Missing comments: 1
After removing missing comments, dataset shape: (3040, 3)


In [5]:
# Step 4: Vectorize text
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_vectors = vectorizer.fit_transform(df['comment'])

y = df['label']

print("Vectorization Done! Shape:", X_vectors.shape)


Vectorization Done! Shape: (3040, 5000)


In [8]:
import pickle
import os

# Folder path
app_folder = os.path.join(os.path.expanduser("~"), "Desktop", "FinalSubmission", "FinalApplication")

# Create folder if not exists
if not os.path.exists(app_folder):
    os.makedirs(app_folder)

# Save model and vectorizer
pickle.dump(model, open(os.path.join(app_folder, "model.pkl"), "wb"))
pickle.dump(vectorizer, open(os.path.join(app_folder, "vectorizer.pkl"), "wb"))

print("Model and Vectorizer saved successfully in FinalApplication!")


Model and Vectorizer saved successfully in FinalApplication!


In [10]:
import pickle
import os

app_folder = os.path.join(os.path.expanduser("~"), "Desktop", "FinalSubmission", "FinalApplication")

model = pickle.load(open(os.path.join(app_folder, "model.pkl"), "rb"))
vectorizer = pickle.load(open(os.path.join(app_folder, "vectorizer.pkl"), "rb"))

print("Model and Vectorizer loaded successfully!")


Model and Vectorizer loaded successfully!


In [12]:
# Drop rows where comment is NaN
df = df.dropna(subset=['comment']).reset_index(drop=True)


In [14]:
# Remove any rows where 'comment' is missing, so vectorizer doesn't fail
df = df.dropna(subset=['comment']).reset_index(drop=True)


In [15]:
# ===============================
# Model Training for Cyber Abuse Detection
# ===============================

# Step 1: Import Libraries
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Step 2: Load Cleaned Dataset
dataset_path = os.path.join(
    os.path.expanduser("~"), 
    "Desktop", "FinalSubmission", "Data", "cleaned_dataset.csv"
)
df = pd.read_csv(dataset_path)

# Step 3: Handle missing comments
# Remove any rows where 'comment' is missing, so vectorizer doesn't fail
df = df.dropna(subset=['comment']).reset_index(drop=True)

# Step 4: Features and Labels
X = df['comment']
y = df['label']

# Step 5: Vectorization
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_vectors = vectorizer.fit_transform(X)
print("Vectorization Done! Shape:", X_vectors.shape)

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors, y, test_size=0.2, random_state=42
)

# Step 7: Train Logistic Regression Model
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Step 8: Make Predictions & Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Step 9: Save Model and Vectorizer
app_folder = os.path.join(
    os.path.expanduser("~"), 
    "Desktop", "FinalSubmission", "FinalApplication"
)
if not os.path.exists(app_folder):
    os.makedirs(app_folder)

model_path = os.path.join(app_folder, "model.pkl")
vectorizer_path = os.path.join(app_folder, "vectorizer.pkl")

pickle.dump(model, open(model_path, "wb"))
pickle.dump(vectorizer, open(vectorizer_path, "wb"))

print("✅ Model and Vectorizer saved successfully in FinalApplication folder!")


Vectorization Done! Shape: (3040, 5000)
Accuracy: 0.9013157894736842
              precision    recall  f1-score   support

          CA       0.99      0.64      0.78       164
         NCA       0.88      1.00      0.94       444

    accuracy                           0.90       608
   macro avg       0.94      0.82      0.86       608
weighted avg       0.91      0.90      0.89       608

✅ Model and Vectorizer saved successfully in FinalApplication folder!
