In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
import re
import string
from nltk.corpus import stopwords
import numpy as np
from sklearn.utils import class_weight
import pickle
from scipy.sparse import hstack, csr_matrix
import nltk
from sklearn.preprocessing import MaxAbsScaler

# Data loading


In [2]:
try:
    df = pd.read_csv("train.csv", usecols=['comment_text', 'target'], nrows=10000) #Test load this file first! And change this parameters!
except FileNotFoundError:
    print("Error: 'final_train.csv' not found.  Make sure the file is in the same directory as your script or specify the correct path.")
    exit()

# 1. Data Cleaning and Preprocessing

In [3]:
df.dropna(subset=['comment_text', 'target'], inplace=True)
df['target'] = pd.to_numeric(df['target'], errors='coerce')
df.dropna(subset=['target'], inplace=True)
df['target'] = (df['target'] > 0.5).astype(int)

# 2. Text Cleaning
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

df['comment_text'] = df['comment_text'].apply(clean_text)

# 3. Feature Engineering (TF-IDF)

In [5]:
profane_words = ["shit", "fuck", "ass", "bitch", "cunt", "dick", "piss", "cock", "bastard", "faggot", "idiot", "stupid", "moron", "trash", "garbage", 'HELL', "hell"]
toxicity_terms = {
    "insult": ["idiot", "stupid", "moron", "fool", "jerk", "loser"],
    "hate_speech": ["racist", "sexist", "bigot", "nazi","rapist"],
    "threat": ["kill", "hit", "hurt", "destroy", "attack"],
    "harassment": ["stalk", "bully", "abuse", "mock", "tease"],
    "offensive": ["trash", "garbage", "worthless", "pathetic"]
}

# Create features for individual profane words
for word in profane_words:
    df[f'profane_{word}'] = df['comment_text'].apply(lambda x: x.count(word))

# Create features for individual toxicity terms
for category, terms in toxicity_terms.items():
    for word in terms:
        df[f'{category}_{word}'] = df['comment_text'].apply(lambda x: x.count(word))

# 4. Split data into training and testing sets

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_features=1500,ngram_range=(1,2))  # Increased max_features and add a ngram features
tfidf_matrix = tfidf_vectorizer.fit_transform(df['comment_text'])

# 5. Combine Features - Scaled and then Combined

In [7]:
X_additional = df[[col for col in df.columns if col.startswith('profane_') or col.startswith('insult_') or col.startswith('hate_speech_') or col.startswith('threat_') or col.startswith('harassment_') or col.startswith('offensive_')]].values #Fixed
scaler = MaxAbsScaler()
X_additional_scaled = scaler.fit_transform(X_additional)
X_additional_sparse = csr_matrix(X_additional_scaled) #Convert to sparse
X = hstack([tfidf_matrix, X_additional_sparse])

# 6. Split Data - Stratified for better class representation

In [8]:
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

del df, tfidf_matrix, X_additional, X_additional_scaled # Clean up memory

# 7. Class Weights - Important for imbalanced datasets

In [9]:
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
train_weights = np.take(class_weights, y_train)

# 8. XGBoost Model - Tuned Parameters

In [10]:
xgboost_classifier = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=0.9,  # weight the toxic
    learning_rate=0.05,
    max_depth=6,
    n_estimators=600,
    subsample=0.7,
    colsample_bytree=0.7,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1
)

# 9. Training and  Evaluation

In [11]:
# Training
xgboost_classifier.fit(X_train, y_train, sample_weight=train_weights)

# Evaluation
y_pred = xgboost_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.942

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      1915
           1       0.35      0.41      0.38        85

    accuracy                           0.94      2000
   macro avg       0.66      0.69      0.67      2000
weighted avg       0.95      0.94      0.94      2000



# 10. Save Model

In [12]:
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

with open('xgboost_model.pkl', 'wb') as f:
    pickle.dump(xgboost_classifier, f)

print("Models saved successfully!")

Models saved successfully!
