In [None]:
import os
import sys

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

sys.path.append("../")

In [None]:
# Display names used throughout the notebook.
columns_base = ["ID", "Comment_Text"]
columns_type = [
    "Is_Toxic",
    "Is_Severe_Toxic",
    "Is_Obscene",
    "Is_Threat",
    "Is_Insult",
    "Is_Identity_Hate",
]
columns_all = columns_base + columns_type

# Raw CSV column names, listed in the same order as their display names above.
raw_columns = [
    "id",
    "comment_text",
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
raw_columns_base = raw_columns[: len(columns_base)]

# Rename by explicit name mapping rather than positional assignment:
# `usecols` does not control the order of the returned columns, so assigning
# `df.columns = [...]` would silently mislabel data if the file order changed.
train_df = pd.read_csv("../input/train.csv", usecols=raw_columns)[raw_columns].rename(
    columns=dict(zip(raw_columns, columns_all))
)
test_df = pd.read_csv("../input/test.csv", usecols=raw_columns_base)[
    raw_columns_base
].rename(columns=dict(zip(raw_columns_base, columns_base)))

In [None]:
# CONSTANTS
# Identifiers for the two supported classifiers; MODEL is compared against
# them to decide which training / evaluation / submission cells execute.
LOGISTIC_REGRESSION = "LogisticRegression"
RANDOM_FORREST = "RandomForrest"
MODEL = LOGISTIC_REGRESSION
# NOTE(review): the four settings below are not referenced by any other cell
# visible in this notebook — confirm whether they are still needed.
NUM_OF_ROWS = 10_000
RANDOM_SAMPLE = False
USE_TEST_DATASET = False
RUN_FULL_PCA = False

# EDA

In [None]:
# Preview the first five training rows (5 is the default for head()).
train_df.head()

In [None]:
# Per-label positive counts and their share of the whole training set.
total_samples = len(train_df)
type_count = train_df[columns_type].sum()
type_percentage = type_count / total_samples * 100
print("Size of train dataset:")
print(train_df.shape)

# Rows where every label column equals 0.
rows_with_all_zeros = train_df[train_df[columns_type].eq(0).all(axis=1)]
print("\nCount of rows with all 0 types:", len(rows_with_all_zeros))

# Stored as a fraction in [0, 1]; the "%" format spec converts it for display.
percentage_nonzero_types = 1 - (len(rows_with_all_zeros) / len(train_df))
print(
    f"\nPercentage of rows with at least one non-zero type: {percentage_nonzero_types:.2%}"
)

class_summary = pd.DataFrame(
    {
        "Count": type_count,
        "Percentage": type_percentage.map("{:.2f}%".format),
    }
)
print("\nSum for each type with added value, percentage and labels:")
print(class_summary)

In [None]:
# Pie chart: comments with no label ("Good") versus at least one label ("Bad").
good_comment_count = len(rows_with_all_zeros)
bad_comment_count = len(train_df) - good_comment_count
comments_category = pd.DataFrame(
    {
        "Category": ["Good Comments", "Bad Comments"],
        "Count": [good_comment_count, bad_comment_count],
    }
)

plt.figure(figsize=(8, 8))
plt.pie(
    comments_category["Count"],
    labels=comments_category["Category"],
    autopct="%1.2f%%",
    startangle=140,
)
plt.title("Distribution of Good and Bad Comments")
plt.show()

In [None]:
# For each label, count the comments that carry ONLY that label and keep the
# first such comment as an example.
# The "exactly one label set" test does not depend on the loop variable, so it
# is computed once; the example rows are collected in a list and concatenated
# in a single call instead of growing a DataFrame inside the loop.
single_label_mask = train_df[columns_type].sum(axis=1) == 1
type_counts = {}
first_appearances = []
for text_type in columns_type:
    mask = (train_df[text_type] == 1) & single_label_mask
    type_counts[text_type] = mask.sum()
    first_appearances.append(train_df[mask].head(1))
selected_rows_df = pd.concat(first_appearances, ignore_index=True)

print("Count of comments where only a specific type has 1 and others are 0:")
for text_type, count in type_counts.items():
    print(f"{text_type}: {count}")

In [None]:
# One example comment per label, restricted to comments where that label is
# the only one set. The rows are gathered first and concatenated once rather
# than appended to a DataFrame inside a loop.
single_label_mask = train_df[columns_type].sum(axis=1) == 1
first_appearances = [
    train_df[(train_df[text_type] == 1) & single_label_mask].head(1)
    for text_type in columns_type
]
selected_rows_df = pd.concat(first_appearances, ignore_index=True)

In [None]:
# Temporarily lift the column-width limit so the full comment text is shown.
with pd.option_context("display.max_colwidth", None):
    display(selected_rows_df)

In [None]:
sample_stats = train_df.copy()

# Per-comment statistics of the original text: number of words, characters,
# symbols (neither alphanumeric nor whitespace) and capital letters.
columns_stats = ["num_words", "num_chars", "num_symbols", "num_capital_letters"]

# Convert to str once so that all four statistics handle every value the same
# way; a bare len() on a non-string value (e.g. a missing comment) would raise.
comment_text = sample_stats["Comment_Text"].astype(str)

sample_stats["num_words"] = comment_text.apply(lambda text: len(text.split()))
sample_stats["num_chars"] = comment_text.apply(len)
sample_stats["num_symbols"] = comment_text.apply(
    lambda text: sum(
        1 for char in text if not char.isalnum() and not char.isspace()
    )
)
sample_stats["num_capital_letters"] = comment_text.apply(
    lambda text: sum(1 for char in text if char.isupper())
)

In [None]:
def show_stats(columns_name, data=None):
    """Print summary statistics for one numeric column.

    Parameters
    ----------
    columns_name : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame that holds the column. When omitted, the notebook-level
        ``sample_stats`` frame is used, so existing ``show_stats("col")``
        calls keep working unchanged.

    Returns
    -------
    None
        The mean, median, standard deviation, minimum and maximum are printed,
        one per line. The standard deviation is NumPy's default population
        form (``ddof=0``).
    """
    frame = sample_stats if data is None else data
    values = frame[columns_name]

    summary = (
        ("Mean:", np.mean(values)),
        ("Median:", np.median(values)),
        ("Standard Deviation:", np.std(values)),
        ("Minimum Value:", np.min(values)),
        ("Maximum Value:", np.max(values)),
    )
    for label, value in summary:
        print(label, value)

In [None]:
# Histogram of words per comment, with one bin per integer word count.
word_counts = sample_stats["num_words"]
word_bins = range(word_counts.min(), word_counts.max() + 1)

plt.figure(figsize=(8, 6))
plt.hist(word_counts, bins=word_bins, edgecolor="black", color="skyblue", lw=0)
plt.title("Histogram of Number of Words")
plt.xlabel("Number of Words")
plt.ylabel("Frequency")

# The textual summary is printed before the figure is rendered.
print("Number of Words stats:\n")
show_stats("num_words")

plt.show()

In [None]:
# Histogram of characters per comment, with one bin per integer length.
char_counts = sample_stats["num_chars"]
char_bins = range(char_counts.min(), char_counts.max() + 1)

plt.figure(figsize=(8, 6))
plt.hist(char_counts, bins=char_bins, edgecolor="black", color="skyblue", lw=0)
plt.title("Histogram of Number of Characters")
plt.xlabel("Number of Characters")
plt.ylabel("Frequency")

# The textual summary is printed before the figure is rendered.
print("Number of Characters stats:\n")
show_stats("num_chars")

plt.show()

In [None]:
# Histogram of symbols per comment, drawn twice: over the full range and with
# the x-axis limited to 0-250.
symbol_counts = sample_stats["num_symbols"]
symbol_bins = range(symbol_counts.min(), symbol_counts.max() + 1)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 12))

for axis, view_name in zip(axes, ("Normal View", "Zoomed View")):
    axis.hist(symbol_counts, bins=symbol_bins, color="purple", edgecolor="black")
    axis.set_title(f"Histogram of Number of Symbols ({view_name})")
    axis.set_xlabel("Number of Symbols")
    axis.set_ylabel("Frequency")

# Only the second panel is zoomed in.
axes[1].set_xlim(left=0, right=250)

print("Number of Symbols stats:\n")
show_stats("num_symbols")

plt.tight_layout()
plt.show()

In [None]:
# Histogram of capital letters per comment, drawn twice: over the full range
# and with the x-axis limited to 0-250.
capital_counts = sample_stats["num_capital_letters"]
# The bin edges must come from the plotted column itself; edges derived from a
# different column would drop every value outside that column's range.
capital_bins = range(capital_counts.min(), capital_counts.max() + 1)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 12))

# Plot 1: Normal view
axes[0].hist(
    capital_counts,
    bins=capital_bins,
    color="purple",
    edgecolor="black",
)
axes[0].set_title("Histogram of Number of Capital Letters (Normal View)")
axes[0].set_xlabel("Number of Capital Letters")
axes[0].set_ylabel("Frequency")

# Plot 2: Zoomed-in x-axis
axes[1].hist(
    capital_counts,
    bins=capital_bins,
    color="purple",
    edgecolor="black",
)
axes[1].set_title("Histogram of Number of Capital Letters (Zoomed View)")
axes[1].set_xlabel("Number of Capital Letters")
axes[1].set_ylabel("Frequency")
axes[1].set_xlim(left=0, right=250)

print("Number of Capital Letters stats:\n")
show_stats("num_capital_letters")

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the label columns and the text statistics.
correlation_matrix = sample_stats.loc[:, columns_type + columns_stats].corr()
correlation_matrix

In [None]:
# The ten comments with the most capital letters.
sample_stats.sort_values("num_capital_letters", ascending=False).iloc[:10]

In [None]:
# The ten comments with the most symbols.
sample_stats.sort_values("num_symbols", ascending=False).iloc[:10]

In [None]:
import seaborn as sns

# Annotated heatmap of the correlation matrix computed in an earlier cell.
heatmap_options = {"annot": True, "cmap": "coolwarm", "fmt": ".2f"}

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title("Correlation Matrix")
plt.show()

# Preprocessing

In [None]:
# Keep every comment that carries at least one toxicity label.
hate_comments_df = (
    train_df[train_df[columns_type].any(axis=1)].copy().reset_index(drop=True)
)
print(f"Hate comments size: {len(hate_comments_df)}")

# Down-sample the unlabelled ("good") comments to GOOD_TO_HATE_RATIO times the
# number of hate comments. The request is capped at the number of good comments
# available, because sampling more rows than exist without replacement raises.
GOOD_TO_HATE_RATIO = 3
all_good_comments_df = train_df[train_df[columns_type].eq(0).all(axis=1)]
good_sample_size = min(
    GOOD_TO_HATE_RATIO * len(hate_comments_df), len(all_good_comments_df)
)
good_comments_df = all_good_comments_df.sample(
    n=good_sample_size, random_state=42
).reset_index(drop=True)
print(f"Good comments size: {len(good_comments_df)}")

# Combine both groups (1 part hate : GOOD_TO_HATE_RATIO parts good) and shuffle.
train_df_copy = (
    pd.concat([hate_comments_df, good_comments_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(f"Final dataset size: {train_df_copy.shape}")

In [None]:
from src.preprocessing import do_preprocessing

# do_preprocessing is applied to each comment and its result is joined with
# single spaces, giving one preprocessed string per row.
train_df_copy["Comment_Text_Preprocessed"] = [
    " ".join(do_preprocessing(comment)) for comment in train_df_copy["Comment_Text"]
]

In [None]:
# Model inputs (preprocessed text) and the multi-label targets.
X = train_df_copy["Comment_Text_Preprocessed"]
y = train_df_copy[columns_type]

# Fit TF-IDF on all preprocessed comments.
tfidf_options = {
    "max_features": 5_000,
    "max_df": 0.9,
    "smooth_idf": True,
    "use_idf": True,
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
tfidf_matrix = tfidf_vectorizer.fit_transform(X)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense copy of the TF-IDF matrix with one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [None]:
# Save tfidf_vectorizer
# Create the target directory first so the dump does not fail when ./models
# does not exist yet (e.g. on a fresh checkout).
os.makedirs("./models", exist_ok=True)
joblib.dump(tfidf_vectorizer, "./models/tfidf_vectorizer.joblib")

# Load tfidf_vectorizer
# tfidf_vectorizer = joblib.load('./models/tfidf_vectorizer.joblib')

In [None]:
# Append the TF-IDF feature columns to the sampled training frame and compare
# the shapes before and after.
train_df_copy_tfidf = pd.concat((train_df_copy, tfidf_df), axis="columns")
for frame in (train_df_copy, train_df_copy_tfidf):
    print(frame.shape)
print(f"Unique words count: {len(feature_names)}")

In [None]:
# The 100 terms with the highest summed TF-IDF weight across all comments.
term_weight_totals = tfidf_df.sum()
top_100_words = term_weight_totals.sort_values(ascending=False).iloc[:100]
print(top_100_words.to_string())

In [None]:
# Sanity check: coerce every feature value to a number and count the entries
# that become NaN (i.e. could not be parsed as numeric).
tfidf_features = train_df_copy_tfidf[feature_names]
numeric_df = tfidf_features.apply(pd.to_numeric, errors="coerce")
nan_values = numeric_df.isna().sum().sum()

if nan_values != 0:
    print(f"There are {nan_values} non-numeric values in the DataFrame.")
else:
    print("All values in the DataFrame are numeric.")

# PCA

In [None]:
from sklearn.decomposition import PCA

# Project the TF-IDF features onto two principal components.
# random_state is fixed so the projection (and the clustering built on it) is
# reproducible if PCA selects a randomized SVD solver for this input.
n_components = 2
pca_2 = PCA(n_components=n_components, random_state=42)
pca_result_2 = pca_2.fit_transform(tfidf_features)
pca_result_df_2 = pd.DataFrame(
    data=pca_result_2, columns=[f"PCA_{i + 1}" for i in range(n_components)]
)

In [None]:
# Scatter plot of the two principal components.
first_component = pca_result_2[:, 0]
second_component = pca_result_2[:, 1]

plt.figure(figsize=(10, 6))
plt.scatter(first_component, second_component, alpha=0.5)
plt.title("2D Scatter Plot of PCA Components")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()

In [None]:
# Fraction of the total variance captured by each of the two components.
print(pca_2.explained_variance_ratio_)

In [None]:
# Project the TF-IDF features onto three principal components.
# random_state is fixed so the projection is reproducible if PCA selects a
# randomized SVD solver for this input.
n_components = 3
pca_3 = PCA(n_components=n_components, random_state=42)
pca_result_3 = pca_3.fit_transform(tfidf_features)

In [None]:
# 3D scatter of the three principal components (one array column per axis).
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d")

marker_style = {"c": "blue", "marker": "o", "edgecolors": "k"}
ax.scatter(pca_result_3[:, 0], pca_result_3[:, 1], pca_result_3[:, 2], **marker_style)

ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_zlabel("Principal Component 3")
ax.set_title("3D PCA Plot")
plt.show()

In [None]:
# Fraction of the total variance captured by each of the three components.
print(pca_3.explained_variance_ratio_)

In [None]:
# Calculate PCA with 0.95 explained variance
# pca = PCA(0.95)
# pca_result = pca.fit_transform(tfidf_features)
# exp_var_pca = pca.explained_variance_ratio_
# cum_sum_eigenvalues = np.cumsum(exp_var_pca)

In [None]:
# plt.bar(range(0, len(exp_var_pca)), exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
# plt.step(range(0, len(cum_sum_eigenvalues)), cum_sum_eigenvalues, where='mid', label='Cumulative explained variance')
# plt.ylabel('Explained variance ratio')
# plt.xlabel('Principal component index')
# plt.legend(loc='best')
# plt.tight_layout()
#
# print(f"Number of components for 0.95 explained variance: {len(cum_sum_eigenvalues)}")
# plt.show()

# Clustering

In [None]:
from sklearn.cluster import KMeans

n_clusters = 7

# Cluster the 2-component PCA projection with KMeans.
kmeans_pca = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans_pca_labels = kmeans_pca.fit_predict(pca_result_2)
# Generic alias read by the cluster-summary cell.
labels = kmeans_pca_labels

In [None]:
# Visualize the clustering results: points coloured by cluster, centroids as
# red crosses.
scatter = plt.scatter(
    pca_result_2[:, 0],
    pca_result_2[:, 1],
    c=kmeans_pca_labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)
plt.scatter(
    kmeans_pca.cluster_centers_[:, 0],
    kmeans_pca.cluster_centers_[:, 1],
    s=200,
    c="red",
    marker="X",
    label="Centroids",
)
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("K-Means Clustering")
plt.legend()

# Count the members of every cluster in one vectorised pass; np.unique returns
# the labels sorted, so the summary is printed in a stable order.
unique_labels, label_counts = np.unique(labels, return_counts=True)
# A label of -1 is reported as noise and is not counted as a cluster.
# NOTE(review): presumably kept for density-based clusterers — confirm.
has_noise_label = -1 in unique_labels
print("Number of clusters:", len(unique_labels) - (1 if has_noise_label else 0))
for cluster_label, cluster_size in zip(unique_labels, label_counts):
    if cluster_label == -1:
        print(f"Noise points: {cluster_size}")
    else:
        print(f"Cluster {cluster_label}: {cluster_size} points")

plt.colorbar(scatter)
plt.show()

# Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report, hamming_loss

In [None]:
# Preview the first five preprocessed comments.
X.head(n=5)

In [None]:
# Preview the first five rows of the label matrix.
y.head(n=5)

In [None]:
# Output column names for the predicted probabilities, title-cased from the
# raw label names (e.g. "severe_toxic" -> "Severe_Toxic").
class_labels = [
    raw_label.title()
    for raw_label in (
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    )
]

In [None]:
# Split Dataset
from sklearn.model_selection import train_test_split

# 80/20 hold-out split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=101)
X_train, X_test, y_train, y_test = split_parts

# NOTE: tfidf_vectorizer was fitted on the full X in an earlier cell, so its
# vocabulary and IDF weights were computed with the hold-out rows included.
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(text_part) for text_part in (X_train, X_test)
)

In [None]:
# Shape of the vectorized hold-out set.
X_test_tfidf.shape

In [None]:
# Train one logistic-regression classifier per label.
if MODEL == LOGISTIC_REGRESSION:
    lr = LogisticRegression(max_iter=1000)
    clf = MultiOutputClassifier(lr).fit(X_train_tfidf, y_train)

In [None]:
if MODEL == LOGISTIC_REGRESSION:
    # Save model
    # Create the target directory first so the dump does not fail when
    # ./models does not exist yet.
    os.makedirs("./models", exist_ok=True)
    joblib.dump(clf, "./models/logistic_regression_classifier_model.joblib")
    # Load model
    # clf = joblib.load('./models/logistic_regression_classifier_model.joblib')

In [None]:
# Get the class labels for each classifier
# for i, estimator in enumerate(clf.estimators_):
#     print(f"Classifier {i + 1} Class Labels:", estimator.classes_)

In [None]:
# Experiment log.
# NOTE(review): presumably "accuracy - hamming loss", the hate/good sampling
# ratio and the TF-IDF max_features of each run — confirm.
# 0.663 - 0.078 x/x  10000 features
# 0.762 - 0.056 x/2x 10000 features
# 0.764 - 0.055 x/2x 5000 features
# 0.815 - 0.043 x/3x 5000 features
# 0.872 - 0.03  x/5x 5000 features
# Evaluate the model
if MODEL == LOGISTIC_REGRESSION:
    y_pred = clf.predict(X_test_tfidf)
    # Both scores are rounded to three decimals.
    accuracy, hamming = (
        round(metric(y_test, y_pred), 3) for metric in (accuracy_score, hamming_loss)
    )
    classification_report_str = classification_report(y_test, y_pred, zero_division=1)
    for metric_label, metric_value in (
        ("Accuracy Score: ", accuracy),
        ("Hamming Loss: ", hamming),
    ):
        print(metric_label, metric_value)
    print("Classification Report:")
    print(classification_report_str)

In [None]:
if MODEL == LOGISTIC_REGRESSION:
    sample_text = ["some not very toxic toxic toxic text"]
    # The vectorizer was fitted on preprocessed comments, so the sample goes
    # through the same preprocessing before it is transformed.
    sample_text_preprocessed = [
        " ".join(do_preprocessing(text)) for text in sample_text
    ]
    sample_text_tfidf = tfidf_vectorizer.transform(sample_text_preprocessed)
    # predict_proba returns one array per label; column 1 is read as the
    # probability of the positive class.
    sample_text_pred_prob = clf.predict_proba(sample_text_tfidf)
    prediction_df = pd.DataFrame(
        {
            output_name: sample_text_pred_prob[i][:, 1]
            for i, output_name in enumerate(class_labels)
        }
    )

In [None]:
# Show the per-label probabilities currently held in prediction_df.
if MODEL == LOGISTIC_REGRESSION:
    print(prediction_df)

In [None]:
# Train one random-forest classifier per label.
if MODEL == RANDOM_FORREST:
    base_classifier = RandomForestClassifier(random_state=42)
    multi_output_classifier = MultiOutputClassifier(base_classifier).fit(
        X_train_tfidf, y_train
    )

In [None]:
if MODEL == RANDOM_FORREST:
    # Save model
    # Create the target directory first so the dump does not fail when
    # ./models does not exist yet.
    os.makedirs("./models", exist_ok=True)
    joblib.dump(
        multi_output_classifier, "./models/random_forrest_classifier_model.joblib"
    )
    # Load model
    # multi_output_classifier = joblib.load('./models/random_forrest_classifier_model.joblib')

In [None]:
# Evaluate the model
if MODEL == RANDOM_FORREST:
    y_pred = multi_output_classifier.predict(X_test_tfidf)
    # Both scores are rounded to three decimals.
    accuracy, hamming = (
        round(metric(y_test, y_pred), 3) for metric in (accuracy_score, hamming_loss)
    )
    classification_report_str = classification_report(y_test, y_pred, zero_division=1)
    for metric_label, metric_value in (
        ("Accuracy Score: ", accuracy),
        ("Hamming Loss: ", hamming),
    ):
        print(metric_label, metric_value)
    print("Classification Report:")
    print(classification_report_str)

In [None]:
# Per-label probabilities for the hold-out set, one column per label.
if MODEL == RANDOM_FORREST:
    prediction_probabilities = multi_output_classifier.predict_proba(X_test_tfidf)
    prediction_df = pd.DataFrame(
        {
            output_name: prediction_probabilities[i][:, 1]
            for i, output_name in enumerate(class_labels)
        }
    )

In [None]:
# Show the first rows of the per-label probabilities held in prediction_df.
if MODEL == RANDOM_FORREST:
    print(prediction_df.head())

In [None]:
if MODEL == RANDOM_FORREST:
    sample_text = ["some toxic text"]
    # The vectorizer was fitted on preprocessed comments, so the sample goes
    # through the same preprocessing before it is transformed.
    sample_text_preprocessed = [
        " ".join(do_preprocessing(text)) for text in sample_text
    ]
    sample_text_tfidf = tfidf_vectorizer.transform(sample_text_preprocessed)
    # predict_proba returns one array per label; column 1 is read as the
    # probability of the positive class.
    sample_text_pred_prob = multi_output_classifier.predict_proba(sample_text_tfidf)
    prediction_df = pd.DataFrame(
        {
            output_name: sample_text_pred_prob[i][:, 1]
            for i, output_name in enumerate(class_labels)
        }
    )

In [None]:
# Show the per-label probabilities currently held in prediction_df.
if MODEL == RANDOM_FORREST:
    print(prediction_df)

# Prediction

In [None]:
# Header of the submission file: the comment id followed by one probability
# column per label.
submission_label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
columns_submission = ["id"] + submission_label_columns

In [None]:
# Preview the first five rows of the test set.
test_df.head(n=5)

In [None]:
# Good comment
test_df[test_df["ID"].eq("00177176f33f587e")]

In [None]:
# Bad comment
test_df[test_df["ID"].eq("0013fed3aeae76b7")]

In [None]:
# The vectorizer was fitted on preprocessed comments, so the test comments are
# put through the same preprocessing before they are vectorized; feeding raw
# text would give the model features that do not match its training input.
test_text_preprocessed = test_df["Comment_Text"].apply(
    lambda d: " ".join(do_preprocessing(d))
)
test_text_tfidf = tfidf_vectorizer.transform(test_text_preprocessed)

In [None]:
# Per-label probabilities for the test set, one column per label.
if MODEL == LOGISTIC_REGRESSION:
    test_text_tfidf_prob = clf.predict_proba(test_text_tfidf)
    prediction_df = pd.DataFrame(
        {
            output_name: test_text_tfidf_prob[i][:, 1]
            for i, output_name in enumerate(class_labels)
        }
    )

In [None]:
# Assemble the submission (id + per-label probabilities) and write it to disk.
if MODEL == LOGISTIC_REGRESSION:
    result_df = pd.concat([test_df["ID"], prediction_df], axis=1)
    result_df.columns = columns_submission
    # Create the output directory first so to_csv does not fail when it is
    # missing.
    os.makedirs("../output", exist_ok=True)
    result_df.to_csv("../output/submission.csv", index=False)
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the preview is displayed explicitly.
    display(result_df.head())

In [None]:
# Per-label probabilities for the test set, one column per label.
if MODEL == RANDOM_FORREST:
    test_text_tfidf_prob = multi_output_classifier.predict_proba(test_text_tfidf)
    prediction_df = pd.DataFrame(
        {
            output_name: test_text_tfidf_prob[i][:, 1]
            for i, output_name in enumerate(class_labels)
        }
    )

In [None]:
# Assemble the submission (id + per-label probabilities) and write it to disk.
if MODEL == RANDOM_FORREST:
    result_df = pd.concat([test_df["ID"], prediction_df], axis=1)
    result_df.columns = columns_submission
    # Create the output directory first so to_csv does not fail when it is
    # missing.
    os.makedirs("../output", exist_ok=True)
    result_df.to_csv("../output/submission_random_forrest.csv", index=False)
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the preview is displayed explicitly.
    display(result_df.head())