# Toxic Comment Classification
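This notebook trains and evaluates a multi-label classifier that predicts six types of toxicity (`toxic`, `severe_toxic`, `obscene`, `threat`, `insult`, `identity_hate`) from comment text, using TF-IDF features and one-vs-rest Logistic Regression.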

## Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

from sklearn.multiclass import OneVsRestClassifier
import re


## Step 2: Load and Clean the Data

In [None]:
# Load data
# NOTE: this is a machine-specific absolute path; point DATA_PATH at your own
# copy of comments.csv before running.
DATA_PATH = "/Users/anugrahrastogi21gmail.com/Downloads/comments.csv"
df = pd.read_csv(DATA_PATH)

# Fill missing comments
# Assign the result back rather than calling fillna(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object, emits a
# FutureWarning in pandas 2.x, and leaves df unchanged under Copy-on-Write,
# which would let NaN values reach clean_text() and crash on .lower().
df["comment_text"] = df["comment_text"].fillna("unknown")

# Basic text cleaning
def clean_text(text):
    """Return a normalized copy of a comment string.

    The text is lowercased, every character that is not an ASCII letter,
    digit, or whitespace is deleted (not replaced by a space), runs of
    whitespace are collapsed to a single space, and leading/trailing
    whitespace is trimmed.

    Args:
        text: The raw comment; must be a ``str``.

    Returns:
        The cleaned string, which may be empty.
    """
    lowered = text.lower()
    # Delete punctuation/symbols outright, so "don't" becomes "dont".
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    # Collapse tabs, newlines, and repeated spaces before trimming the ends.
    return re.sub(r"\s+", " ", alnum_only).strip()

# Store a normalized copy of every comment next to the raw text.
df["clean_text"] = df["comment_text"].apply(clean_text)

# Display class distribution
# Column sums per label; these are positive-example counts if labels are 0/1.
label_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
label_totals = df[label_cols].sum()
label_totals.plot(kind="bar", title="Class distribution")
plt.show()


## Step 3: Split the Data

In [None]:
# Features are the cleaned comment strings; targets are the label columns.
X, y = df["clean_text"], df[label_cols]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Step 4: TF-IDF Vectorization

In [None]:
# Unigram + bigram TF-IDF features, capped at 10,000 terms, with scikit-learn's
# built-in English stop-word list removed.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=10000,
)

# Fit the vocabulary/IDF weights on the training split only, then reuse the
# fitted vectorizer for the test split so no test data influences the features.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


## Step 5: Train One-vs-Rest Logistic Regression

In [None]:
# One-vs-rest wrapper: a separate binary logistic regression is fitted for
# each column of y_train.
base_classifier = LogisticRegression(C=1.0, solver='liblinear')
model = OneVsRestClassifier(base_classifier)
model.fit(X_train_tfidf, y_train)


## Step 6: Evaluate the Model

In [None]:
# Predict every label for the held-out comments in a single call.
y_pred = model.predict(X_test_tfidf)

# Column i of y_pred lines up with label_cols[i] (the column order of y_test),
# so walk the labels and the prediction columns together.
for label, predicted in zip(label_cols, y_pred.T):
    print(f"\nClassification report for {label}:")
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    report = classification_report(y_test[label], predicted, zero_division=0)
    print(report)
