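# Logistic Regression and SVM Models

Multi-label toxic-comment classification: one classifier per label (`toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, `identity_hate`), trained on features from `ToxicFeatureExtractor` and evaluated on a held-out validation split.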

In [2]:
# Imports and Setup
import pandas as pd
import numpy as np
import sys
from pathlib import Path

from sklearn.discriminant_analysis import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline

# Put the project root on sys.path so that project-local modules can be imported
# (presumably this is where `feature_engineering` lives - confirm against the repo layout).
# When run as a script, __file__ locates the root; in a notebook there is no __file__,
# so the parent of the current working directory is used instead.
project_root = Path(__file__).parent.parent if '__file__' in globals() else Path.cwd().parent
# Guard the append: without it, every re-run of this cell adds a duplicate sys.path entry.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import the custom feature extractor
from feature_engineering import ToxicFeatureExtractor

## Data Loading and Preprocessing

In [3]:
# Names of the six binary target columns that are predicted jointly.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# The extractor handles data access; point it at the data folder next to this notebook's directory.
feature_extractor = ToxicFeatureExtractor(data_dir='../data')

# Read the training set and stop immediately if the loader hands back nothing.
df = feature_extractor.load_data(dataset='train', verbose=True)
if df is None:
    raise FileNotFoundError('Training data not found. Please check your data directory.')

# Hold out 20% of the rows for validation, with a fixed random_state.
train_df, val_df = feature_extractor.split_data(test_size=0.2, random_state=42, verbose=True)

# Comment texts as plain Python lists, targets as arrays whose columns follow label_cols.
train_comments, val_comments = (frame['comment_text'].tolist() for frame in (train_df, val_df))
train_labels, val_labels = (frame[label_cols].values for frame in (train_df, val_df))

Loading train data from ..\data\train.csv...
Loaded 159571 records with columns: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
Splitting data with test_size=0.2, random_state=42...
Loading train data from ..\data\train.csv...
Loaded 159571 records with columns: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
Split data into train (127656 records) and validation (31915 records)


## Feature Extraction

In [4]:
# Build the training feature matrix from the raw training comments.
# The recorded output of this cell shows TF-IDF, Word2Vec and Doc2Vec features being produced.
# NOTE(review): presumably this call also fits the underlying vectorizers/embedding models
# on the training texts - confirm in feature_engineering.
print('Extracting training features...')
x_train = feature_extractor.extract_all_features(train_comments, verbose=True)

# Build the validation feature matrix with the same extractor instance.
# Keep this call after the training extraction above: it presumably reuses what was
# fitted there rather than refitting on validation data - confirm in feature_engineering.
print('Transforming validation features...')
x_val = feature_extractor.transform_new_data(val_comments, verbose=True)

Extracting training features...
Beginning feature extraction for 127656 texts...
Preprocessing texts...
Preprocessing complete.
Vectorizing 127656 texts using TfidfVectorizer...
Vectorizer parameters: {'max_features': 5000, 'min_df': 2, 'max_df': 0.95, 'ngram_range': (1, 2)}
Vectorization complete. Matrix shape: (127656, 5000)
Top features: ['10' '100' '1000' '11' '12']...
Sample values (first row): [0. 0. 0. 0. 0.]
✓ Extracted tfidf features: (127656, 5000)
Creating Word2Vec embeddings for 127656 texts...
Model parameters: {'vector_size': 100, 'window': 5, 'min_count': 1, 'workers': 4}
Embedding complete. Matrix shape: (127656, 100)
Sample embeddings (first row, first 5 values): [ 0.12738855  0.61534131 -0.5805735  -0.19541949 -0.68308651]
✓ Extracted word2vec features: (127656, 100)
Creating Doc2Vec embeddings for 127656 texts...
Model parameters: {'vector_size': 100, 'min_count': 2, 'epochs': 40}
Embedding complete. Matrix shape: (127656, 100)
Sample embeddings (first row, first 5 v

## Logistic Regression Model

In [None]:
# Per-label estimator: standardize the features, then fit a class-weighted
# logistic regression (lbfgs solver, at most 5000 iterations).
# NOTE(review): StandardScaler is imported from sklearn.discriminant_analysis at the top
# of this notebook; its public location is sklearn.preprocessing.
logistic_step = LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=5000)
base_log = make_pipeline(StandardScaler(), logistic_step)

# Wrap the pipeline so that one copy of it is fitted for each target column.
model = MultiOutputClassifier(estimator=base_log)

# Train on the extracted training features and the multi-label targets.
print('Training the model...')
model.fit(x_train, train_labels)

## SVM Model

In [None]:
from sklearn.svm import SVC

# Per-label estimator: standardize the features, then fit a linear-kernel,
# class-weighted SVM.
# probability=True makes SVC calibrate probabilities with an internal, randomly
# shuffled cross-validation, so a random_state is needed for reproducible results.
# The value matches the random_state already used for the train/validation split.
# NOTE(review): max_iter=10000 is a hard cap on the solver; if it is hit, the fit stops
# before convergence (scikit-learn emits a ConvergenceWarning) and the model can be poor.
# NOTE(review): SVC fit time grows at least quadratically with the number of samples;
# with the ~127k training rows reported by the split, LinearSVC or SGDClassifier may be
# a better fit if probability estimates are not required - confirm before changing.
base_svm = make_pipeline(
    StandardScaler(),
    SVC(
        kernel='linear',
        class_weight='balanced',
        probability=True,
        max_iter=10000,
        random_state=42,
    )
)

# This rebinds `model`: after this cell runs, `model` refers to the SVM, not to any
# previously trained estimator stored under the same name.
model = MultiOutputClassifier(base_svm)

print('Training the model...')
model.fit(x_train, train_labels)

Training the model...




## Model Evaluation

In [12]:
# Score whichever estimator is currently bound to `model` on the validation features.
print('Making predictions on validation data...')
predictions = model.predict(x_val)

# One report per target: column `idx` of both arrays belongs to label `col`.
for idx, col in enumerate(label_cols):
    y_true, y_pred = val_labels[:, idx], predictions[:, idx]
    print(f"\nClassification Report for '{col}':")
    print(classification_report(y_true, y_pred))
    acc = accuracy_score(y_true, y_pred)
    print(f"Accuracy for '{col}': {acc:.4f}")

Making predictions on validation data...

Classification Report for 'toxic':
              precision    recall  f1-score   support

           0       1.00      0.00      0.00     28869
           1       0.10      1.00      0.17      3046

    accuracy                           0.10     31915
   macro avg       0.55      0.50      0.09     31915
weighted avg       0.91      0.10      0.02     31915

Accuracy for 'toxic': 0.0957

Classification Report for 'severe_toxic':
              precision    recall  f1-score   support

           0       0.98      0.05      0.09     31610
           1       0.01      0.88      0.02       305

    accuracy                           0.06     31915
   macro avg       0.49      0.46      0.05     31915
weighted avg       0.97      0.06      0.09     31915

Accuracy for 'severe_toxic': 0.0553

Classification Report for 'obscene':
              precision    recall  f1-score   support

           0       0.89      0.01      0.02     30253
           1  