In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
df = pd.read_csv('train.csv')

# Display dataset info
print("Dataset Sample:\n", df.head())
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own line; passing it to print() would emit the report
# before the heading and then print "None" after it.
print("\nDataset Info:")
df.info()

# Combine toxic categories into a single binary target column:
# 1 when at least one of the label columns is set for the row, otherwise 0.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Vectorized comparison instead of a per-row Python lambda; same values.
df['toxic_target'] = (df[label_columns].sum(axis=1) > 0).astype(int)

# check for missing values
print("\nMissing Values:\n", df.isnull().sum())

# Fill missing comments with an empty string so the text cleaning step
# always receives a str
df['comment_text'] = df['comment_text'].fillna("")

# Data preprocessing
# Patterns are compiled once at import time instead of on every call.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_DIGITS_RE = re.compile(r"\d+")
_WHITESPACE_RE = re.compile(r"\s+")


def preprocess_text(text):
    """Normalize a raw comment for vectorization.

    Steps, in order: lowercase, drop ASCII punctuation, drop digit runs,
    collapse every run of whitespace (including newlines and tabs) into a
    single space, and trim leading/trailing whitespace.

    Args:
        text: The raw comment as a ``str``.

    Returns:
        The cleaned comment as a ``str`` (possibly empty).
    """
    text = text.lower()
    text = _PUNCTUATION_RE.sub("", text)
    text = _DIGITS_RE.sub("", text)
    # Removing punctuation/digits can leave runs of blanks behind, so the
    # whitespace pass runs last and collapses whole runs rather than
    # swapping characters one-for-one.
    text = _WHITESPACE_RE.sub(" ", text)
    return text.strip()

df['cleaned_text'] = df['comment_text'].apply(preprocess_text)

# Splitting data into train and test sets
X = df['cleaned_text']
y = df['toxic_target']
# stratify=y keeps the toxic / non-toxic proportion the same in the train
# and test portions, so the evaluation is not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature extraction using TF-IDF (unigrams + bigrams, capped at 10000 terms).
# The vectorizer is fitted on the training portion only and then reused,
# unchanged, to transform the test portion.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit a logistic-regression classifier on the TF-IDF training features
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict labels for the held-out test features
y_pred = model.predict(X_test_tfidf)

# Score the predictions against the true test labels and report
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("\nClassification Report:\n", report)
print("Accuracy Score:", accuracy)

# Testing the model with custom input
def predict_toxicity(comment):
    """Classify a single raw comment with the fitted vectorizer and model.

    The comment goes through ``preprocess_text`` and the module-level
    ``vectorizer`` before being passed to ``model.predict``.

    Returns:
        "Toxic" when the predicted label equals 1, otherwise "Non-Toxic".
    """
    features = vectorizer.transform([preprocess_text(comment)])
    label = model.predict(features)[0]
    if label == 1:
        return "Toxic"
    return "Non-Toxic"

# Hand-written sample comments for a quick check of the classifier
test_comments = [
    "You are a horrible person!",
    "I really enjoyed working with you.",
    "What an idiot.",
    "Have a great day!",
]

print("\nCustom Predictions:")
for comment in test_comments:
    verdict = predict_toxicity(comment)
    print(f"Comment: {comment} | Prediction: {verdict}")



Dataset Sample:
                  id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   D