In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import sys
from pandarallel import pandarallel

sys.path.append("../")
pandarallel.initialize(progress_bar=True)

In [None]:
train_df = pd.read_csv("../input/train.csv", usecols=["id", "comment_text"])
test_df = pd.read_csv("../input/test.csv", usecols=["id", "comment_text"])

In [None]:
train_df.head(5)

In [None]:
test_df.head(5)

In [None]:
# CONSTANTS
# Number of comments kept for preprocessing, TF-IDF and every downstream step.
NUM_OF_ROWS = 10_000
# True: draw NUM_OF_ROWS rows at random (seeded); False: take the first NUM_OF_ROWS rows.
RANDOM_SAMPLE = False
# True: run the notebook on the test set by rebinding train_df to test_df.
USE_TEST_DATASET = False
# True: also run the full-spectrum PCA / explained-variance cells.
RUN_FULL_PCA = False

### EDA

In [None]:
# EDA for train dataset
print(f"Size of train dataset: {train_df.shape}")
print(f"Size of test dataset: {test_df.shape}")

In [None]:
if USE_TEST_DATASET:
    train_df = test_df

In [None]:
train_df.head(5)

In [None]:
train_df.head(5)

In [None]:
# Function for tokenization
def tokenize_and_preprocess(text):
    """Tokenize `text` with NLTK and return the purely alphabetic tokens, lower-cased.

    Tokens containing digits, punctuation or any other non-letter character
    are discarded.
    """
    alphabetic_tokens = []
    for token in nltk.word_tokenize(text):
        if token.isalpha():
            alphabetic_tokens.append(token.lower())
    return alphabetic_tokens

In [None]:
comments = train_df["comment_text"].tolist()
length_of_comments = [len(comment) for comment in comments]

plt.hist(
    length_of_comments,
    bins=range(min(length_of_comments), max(length_of_comments) + 1),
    edgecolor="black",
    color="skyblue",
    lw=0,
)
plt.xlabel("Length of Comments")
plt.ylabel("Number of Comments")
plt.title("Histogram of Length of Comments")
plt.show()

In [None]:
sample_stats = train_df.copy()

# Per-comment size statistics computed on the original (unpreprocessed) text.
# Every column coerces the value with str() first, so a missing comment is
# handled the same way in all of them instead of raising only in num_chars.
sample_stats["num_words"] = sample_stats["comment_text"].apply(
    lambda x: len(str(x).split())
)
sample_stats["num_tokens"] = sample_stats["comment_text"].apply(
    lambda x: len(tokenize_and_preprocess(str(x)))
)
sample_stats["num_chars"] = sample_stats["comment_text"].apply(
    lambda x: len(str(x))
)
# Symbols: characters that are neither alphanumeric nor whitespace.
sample_stats["num_symbols"] = sample_stats["comment_text"].apply(
    lambda x: len(
        [char for char in str(x) if not char.isalnum() and not char.isspace()]
    )
)
sample_stats["num_capital_letters"] = sample_stats["comment_text"].apply(
    lambda x: sum(1 for char in str(x) if char.isupper())
)

In [None]:
sample_stats.shape

In [None]:
# Create and display a histogram for the number of words
plt.figure(figsize=(8, 6))
plt.hist(
    sample_stats["num_words"],
    bins=range(min(sample_stats["num_words"]), max(sample_stats["num_words"]) + 1),
    edgecolor="black",
    color="skyblue",
    lw=0,
)
plt.title("Histogram of Number of Words")
plt.xlabel("Number of Words")
plt.ylabel("Frequency")

print("Number of Words stats:\n")

mean = np.mean(sample_stats["num_words"])
median = np.median(sample_stats["num_words"])
std_dev = np.std(sample_stats["num_words"])
min_value = np.min(sample_stats["num_words"])
max_value = np.max(sample_stats["num_words"])

print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std_dev)
print("Minimum Value:", min_value)
print("Maximum Value:", max_value)

plt.show()

In [None]:
# Create and display a histogram for the number of tokens
# (alphabetic NLTK tokens counted by tokenize_and_preprocess), one bin per integer count
plt.figure(figsize=(8, 6))
plt.hist(
    sample_stats["num_tokens"],
    bins=range(min(sample_stats["num_tokens"]), max(sample_stats["num_tokens"]) + 1),
    edgecolor="black",
    color="skyblue",
    lw=0,
)
plt.title("Histogram of Number of Tokens")
plt.xlabel("Number of Tokens")
plt.ylabel("Frequency")

print("Number of Tokens stats:\n")

# Summary statistics of the token counts (np.std is the population standard deviation)
mean = np.mean(sample_stats["num_tokens"])
median = np.median(sample_stats["num_tokens"])
std_dev = np.std(sample_stats["num_tokens"])
min_value = np.min(sample_stats["num_tokens"])
max_value = np.max(sample_stats["num_tokens"])

print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std_dev)
print("Minimum Value:", min_value)
print("Maximum Value:", max_value)

plt.show()

In [None]:
# Create and display a histogram for the number of characters
plt.figure(figsize=(8, 6))
plt.hist(
    sample_stats["num_chars"],
    bins=range(min(sample_stats["num_chars"]), max(sample_stats["num_chars"]) + 1),
    edgecolor="black",
    color="skyblue",
    lw=0,
)
plt.title("Histogram of Number of Characters")
plt.xlabel("Number of Characters")
plt.ylabel("Frequency")

print("Number of Characters stats:\n")

mean = np.mean(sample_stats["num_chars"])
median = np.median(sample_stats["num_chars"])
std_dev = np.std(sample_stats["num_chars"])
min_value = np.min(sample_stats["num_chars"])
max_value = np.max(sample_stats["num_chars"])

print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std_dev)
print("Minimum Value:", min_value)
print("Maximum Value:", max_value)

plt.show()

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 12))

# Plot 1: Normal view
axes[0].hist(
    sample_stats["num_symbols"],
    bins=range(min(sample_stats["num_symbols"]), max(sample_stats["num_symbols"]) + 1),
    color="purple",
    edgecolor="black",
)
axes[0].set_title("Histogram of Number of Symbols (Normal View)")
axes[0].set_xlabel("Number of Symbols")
axes[0].set_ylabel("Frequency")

# Plot 2: Zoomed-in x-axis
axes[1].hist(
    sample_stats["num_symbols"],
    bins=range(min(sample_stats["num_symbols"]), max(sample_stats["num_symbols"]) + 1),
    color="purple",
    edgecolor="black",
)
axes[1].set_title("Histogram of Number of Symbols (Zoomed View)")
axes[1].set_xlabel("Number of Symbols")
axes[1].set_ylabel("Frequency")
axes[1].set_xlim(left=0, right=250)

plt.tight_layout()

print("Number of Symbols stats:\n")

mean = np.mean(sample_stats["num_symbols"])
median = np.median(sample_stats["num_symbols"])
std_dev = np.std(sample_stats["num_symbols"])
min_value = np.min(sample_stats["num_symbols"])
max_value = np.max(sample_stats["num_symbols"])

print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std_dev)
print("Minimum Value:", min_value)
print("Maximum Value:", max_value)

plt.show()

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 12))

# One bin per integer count, spanning the capital-letter counts themselves.
# (The bins were previously derived from num_symbols, which silently dropped
# every comment whose capital-letter count fell outside the symbol range.)
capital_letter_bins = range(
    min(sample_stats["num_capital_letters"]),
    max(sample_stats["num_capital_letters"]) + 1,
)

# Plot 1: Normal view
axes[0].hist(
    sample_stats["num_capital_letters"],
    bins=capital_letter_bins,
    color="purple",
    edgecolor="black",
)
axes[0].set_title("Histogram of Number of Capital Letters (Normal View)")
axes[0].set_xlabel("Number of Capital Letters")
axes[0].set_ylabel("Frequency")

# Plot 2: Zoomed-in x-axis
axes[1].hist(
    sample_stats["num_capital_letters"],
    bins=capital_letter_bins,
    color="purple",
    edgecolor="black",
)
axes[1].set_title("Histogram of Number of Capital Letters (Zoomed View)")
axes[1].set_xlabel("Number of Capital Letters")
axes[1].set_ylabel("Frequency")
axes[1].set_xlim(left=0, right=250)

plt.tight_layout()

print("Number of Capital Letters stats:\n")

# Summary statistics (np.std is the population standard deviation)
mean_cap_letters = np.mean(sample_stats["num_capital_letters"])
median_cap_letters = np.median(sample_stats["num_capital_letters"])
std_dev_cap_letters = np.std(sample_stats["num_capital_letters"])
min_value_cap_letters = np.min(sample_stats["num_capital_letters"])
max_value_cap_letters = np.max(sample_stats["num_capital_letters"])

print("Mean:", mean_cap_letters)
print("Median:", median_cap_letters)
print("Standard Deviation:", std_dev_cap_letters)
print("Minimum Value:", min_value_cap_letters)
print("Maximum Value:", max_value_cap_letters)

plt.show()

In [None]:
eda_df = pd.read_csv("../input/train.csv")
eda_test_df = pd.read_csv("../input/test.csv")

In [None]:
import matplotlib.pyplot as plt

# Count the number of samples that belong to each category
category_counts = eda_df[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].sum()
category_counts["none"] = (
    eda_df[
        ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    ].sum(axis=1)
    == 0
).sum()

# Plotting the histogram
category_counts.plot(kind="bar", figsize=(10, 6), logy=True)

plt.title("Number of samples per category")
plt.xlabel("Category")
plt.ylabel("Number of samples")

# Show the histogram
plt.show()

In [None]:
def calculate_stats(text):
    """Compute size and composition statistics for one comment.

    Args:
        text: the comment as a string.

    Returns:
        pd.Series indexed by "num_chars", "num_words", "num_capitals",
        "num_symbols", "percent_capitals" and "percent_symbols". Percentages
        are relative to the total character count and are 0 for empty text.
    """
    num_chars = len(text)
    num_words = len(text.split())
    num_capitals = sum(1 for c in text if c.isupper())
    # A symbol is neither alphanumeric nor whitespace. Whitespace is excluded
    # so this agrees with the num_symbols column computed for sample_stats;
    # counting spaces would inflate percent_symbols for every comment.
    num_symbols = sum(1 for c in text if not c.isalnum() and not c.isspace())

    # Calculate percentages (guarding against division by zero on empty text)
    percent_capitals = (num_capitals / num_chars * 100) if num_chars > 0 else 0
    percent_symbols = (num_symbols / num_chars * 100) if num_chars > 0 else 0

    return pd.Series(
        [
            num_chars,
            num_words,
            num_capitals,
            num_symbols,
            percent_capitals,
            percent_symbols,
        ],
        index=[
            "num_chars",
            "num_words",
            "num_capitals",
            "num_symbols",
            "percent_capitals",
            "percent_symbols",
        ],
    )


# Compute the per-comment statistics for the train and test sets.
# calculate_stats returns a Series, so apply() yields one column per statistic.
eda_stats_df = pd.DataFrame(eda_df["comment_text"].apply(calculate_stats))
eda_test_stats_df = pd.DataFrame(eda_test_df["comment_text"].apply(calculate_stats))

In [None]:
# Plot the percentage of capital letters per sample, train vs. test overlaid
plt.figure(figsize=(10, 6))
plt.hist(eda_stats_df["percent_capitals"], bins=150, alpha=0.5, label="Train")
plt.hist(eda_test_stats_df["percent_capitals"], bins=150, alpha=0.5, label="Test")
plt.xlabel("Percentage of Capital Letters")
plt.ylabel("Frequency")
plt.title("Percentage of Capital Letters per Sample")
plt.legend()
plt.show()

In [None]:
# Plot the percentage of symbols per sample, train vs. test overlaid
plt.figure(figsize=(10, 6))
plt.hist(eda_stats_df["percent_symbols"], bins=150, alpha=0.5, label="Train")
plt.hist(eda_test_stats_df["percent_symbols"], bins=150, alpha=0.5, label="Test")
plt.xlabel("Percentage of Symbols")
plt.ylabel("Frequency")
plt.title("Percentage of Symbols per Sample")
plt.legend()
plt.show()

### Dataset Preprocessing

In [None]:
import string
import re
from num2words import num2words

nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")

stop_punctuation = string.punctuation
stop_words = set(nltk.corpus.stopwords.words("english"))


def to_lower_case(text):
    """Return `text` with every character lower-cased individually."""
    return "".join(map(str.lower, text))


def remove_punctuation(text):
    """Return `text` without any character listed in `stop_punctuation`."""
    kept_chars = []
    for char in text:
        if char not in stop_punctuation:
            kept_chars.append(char)
    return "".join(kept_chars)


def remove_long_dash(text):
    """Replace every em dash (—) in `text` with a single space."""
    em_dash = re.compile(r"—")
    return em_dash.sub(" ", text)


def remove_urls(text):
    """Delete every run of non-whitespace characters that starts with "http"."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)


def remove_one_letter_words(tokens):
    """Return the tokens longer than one character, in their original order."""
    return [token for token in tokens if len(token) > 1]


def tokenize_text(text):
    """Split `text` into tokens using NLTK's `word_tokenize`."""
    return nltk.tokenize.word_tokenize(text)


def remove_stop_words(tokens):
    """Return the tokens that are not in the module-level `stop_words` set.

    The comparison is exact and case-sensitive, so tokens should already be
    lower-cased. An earlier experiment (left commented out in the original)
    kept negations such as "not", "n't" and "no"; it is not active.
    """
    return list(filter(lambda token: token not in stop_words, tokens))


def do_stemming(tokens):
    """Return the Porter stem of every token, in order."""
    stemmer = nltk.PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems


def do_lemmatization(tokens):
    """Return the WordNet lemma of every token (lemmatizer defaults), in order."""
    lemmatizer = nltk.WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


def remove_numeric_words(text):
    """Delete every whitespace-delimited chunk that contains at least one digit.

    Surrounding whitespace is left in place, so removed chunks leave gaps.
    """
    digit_chunk = re.compile(r"\S*\d+\S*")
    return digit_chunk.sub("", text)


def convert_nums_to_words(data):
    """Spell out purely numeric tokens and re-tokenize every token.

    Args:
        data: iterable of string tokens.

    Returns:
        list[str]: for each input token, the tokens produced by
        `tokenize_text`. An all-digit token below 1_000_000_000 is first
        replaced by its `num2words` spelling, with hyphens, commas and
        whitespace turned into separators; larger all-digit tokens are
        dropped. Non-numeric tokens are re-tokenized as they are.
    """
    new_text = []
    for word in data:
        if word.isdigit():
            # Very large numbers are discarded rather than spelled out.
            word = num2words(word) if int(word) < 1000000000 else ""
        # Raw string: "\s" inside a plain literal is an invalid escape
        # sequence (a warning today, an error in future Python versions).
        new_text.extend(tokenize_text(re.sub(r"(-|,\s?)|\s+", " ", word)))
    return new_text


def do_preprocessing(data):
    """Run the full cleaning pipeline on one comment and return its tokens.

    String stage, in order: strip URLs, strip punctuation, replace em dashes,
    lower-case, drop digit-containing chunks. Token stage, in order:
    tokenize, drop one-letter tokens, drop stop words, lemmatize, convert
    numeric tokens to words.

    NOTE(review): digit-containing chunks are removed before tokenization, so
    the final number-to-words step looks like it never receives a numeric
    token. Punctuation is also stripped before stop-word removal, so
    contractions lose their apostrophe first. Confirm this ordering is intended.
    """
    cleaned = data
    for text_step in (
        remove_urls,
        remove_punctuation,
        remove_long_dash,
        to_lower_case,
        remove_numeric_words,
    ):
        cleaned = text_step(cleaned)

    tokens = tokenize_text(cleaned)
    for token_step in (
        remove_one_letter_words,
        remove_stop_words,
        do_lemmatization,
        convert_nums_to_words,
    ):
        tokens = token_step(tokens)
    return tokens

In [None]:
# train_df['comment_text_preprocessed'] = train_df["comment_text"].apply(lambda d: " ".join(do_preprocessing(d)))

In [None]:
if RANDOM_SAMPLE:
    train_df_copy = (
        train_df.sample(NUM_OF_ROWS, random_state=42).copy().reset_index(drop=True)
    )
else:
    train_df_copy = train_df.head(NUM_OF_ROWS).copy()

In [None]:
train_df_copy["comment_text_preprocessed"] = train_df_copy["comment_text"].apply(
    lambda d: " ".join(do_preprocessing(d))
)

In [None]:
train_df_copy.shape

In [None]:
train_df_copy.head(5)

In [None]:
pd.set_option("display.max_colwidth", None)
for i in range(10):
    print(f"Original Comment Text {i + 1}:\n", train_df_copy["comment_text"].iloc[i])
    print(
        f"\nPreprocessed Comment Text {i + 1}:\n",
        train_df_copy["comment_text_preprocessed"].iloc[i],
    )
    print("-" * 50)  # Separator

In [None]:
tfidf_vectorizer = TfidfVectorizer(smooth_idf=True, use_idf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(
    train_df_copy["comment_text_preprocessed"]
)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [None]:
tfidf_df.shape

In [None]:
train_df_copy.shape

In [None]:
# train_df_copy.reset_index(drop=True)
train_df_copy_tfidf = pd.concat([train_df_copy, tfidf_df], axis=1)
print(train_df_copy.shape)
print(train_df_copy_tfidf.shape)
print(f"Unique words count: {len(feature_names)}")

In [None]:
# The concatenated frame already has an "id" column from the dataset, so a
# TF-IDF term "id" would make train_df_copy_tfidf[feature_names] ambiguous.
# Drop that term from the list of feature columns.
# NOTE(review): after this, positions in feature_names no longer line up with
# the columns of tfidf_matrix when "id" was present.
if "id" in feature_names:
    feature_names = np.setdiff1d(feature_names, ["id"])

In [None]:
# Display the top 100 most popular words
top_100_words = tfidf_df.sum().sort_values(ascending=False).head(100)
print(top_100_words.to_string())

In [None]:
# Find the most weighted word for each text
# Column positions in tfidf_matrix follow the vectorizer's vocabulary order,
# so look the word up there rather than in `feature_names`, which an earlier
# cell may have shortened (dropping "id") and thereby shifted.
vocabulary = tfidf_vectorizer.get_feature_names_out()

best_words = []
for i in range(tfidf_matrix.shape[0]):  # Iterate through each text
    row = tfidf_matrix[i]
    if row.nnz == 0:
        # Comment is empty after preprocessing: argmax would return 0 and
        # wrongly report the first vocabulary word, so keep it blank.
        best_words.append("")
    else:
        best_words.append(vocabulary[row.argmax()])

# Positional assignment: one entry per row, independent of the index labels.
train_df_copy["best_word"] = best_words

print(train_df_copy[["comment_text_preprocessed", "best_word"]].head(10))

In [None]:
# Check for any non numeric values in the features dataframe
tfidf_features = train_df_copy_tfidf[feature_names]
numeric_df = tfidf_features.apply(pd.to_numeric, errors="coerce")
nan_values = numeric_df.isna().sum().sum()

if nan_values == 0:
    print("All values in the DataFrame are numeric.")
else:
    print(f"There are {nan_values} non-numeric values in the DataFrame.")

### PCA

In [None]:
from sklearn.decomposition import PCA

if RUN_FULL_PCA:
    pca = PCA()
    pca.fit(tfidf_features)

    explained_variance_ratios = pca.explained_variance_ratio_
    cumulative_explained_variance = np.cumsum(explained_variance_ratios)

    components_for_08 = np.argmax(cumulative_explained_variance >= 0.8) + 1
    components_for_09 = np.argmax(cumulative_explained_variance >= 0.9) + 1
    components_for_095 = np.argmax(cumulative_explained_variance >= 0.95) + 1

    plt.figure(figsize=(10, 6))
    plt.plot(cumulative_explained_variance, marker="o", linestyle="-", color="b")
    plt.title("Cumulative Explained Variance vs. Number of Components")
    plt.xlabel("Number of Components")
    plt.ylabel("Cumulative Explained Variance")
    plt.axvline(
        components_for_08, color="r", linestyle="--", label="0.8 Explained Variance"
    )
    plt.axvline(
        components_for_09, color="g", linestyle="--", label="0.9 Explained Variance"
    )
    plt.axvline(
        components_for_095,
        color="purple",
        linestyle="--",
        label="0.95 Explained Variance",
    )
    plt.legend()
    plt.show()

    print(f"Number of components for 0.8 explained variance: {components_for_08}")
    print(f"Number of components for 0.9 explained variance: {components_for_09}")
    print(f"Number of components for 0.95 explained variance: {components_for_095}")

In [None]:
if RUN_FULL_PCA:
    plt.plot(pca.explained_variance_ratio_ / np.sum(pca.explained_variance_ratio_))

In [None]:
if RUN_FULL_PCA:
    n_components = 10
    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(tfidf_features)

    exp_var_pca = pca.explained_variance_ratio_

    cum_sum_eigenvalues = np.cumsum(exp_var_pca)

    plt.bar(
        range(0, len(exp_var_pca)),
        exp_var_pca,
        alpha=0.5,
        align="center",
        label="Individual explained variance",
    )
    plt.step(
        range(0, len(cum_sum_eigenvalues)),
        cum_sum_eigenvalues,
        where="mid",
        label="Cumulative explained variance",
    )
    plt.ylabel("Explained variance ratio")
    plt.xlabel("Principal component index")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

In [None]:
if RUN_FULL_PCA:
    plt.xticks(np.arange(1, n_components))
    plt.plot(pca.explained_variance_ratio_ / np.sum(pca.explained_variance_ratio_))

In [None]:
# pca_80 = PCA(n_components=0.8)
# data_pca_80 = pca_80.fit_transform(tfidf_features)
# print(data_pca_80.shape)

In [None]:
# pca_90 = PCA(n_components=0.9)
# data_pca_90 = pca_90.fit_transform(tfidf_features)
# print(data_pca_90.shape)

In [None]:
# pca_95 = PCA(n_components=0.95)
# data_pca_95 = pca_95.fit_transform(tfidf_features)
# print(data_pca_95.shape)

In [None]:
n_components = 2
# Fixed seed: for a wide matrix with few components scikit-learn may pick a
# randomized SVD solver, which would make the projection vary between runs.
pca_2 = PCA(n_components=n_components, random_state=42)
pca_result_2 = pca_2.fit_transform(tfidf_features)
pca_result_df_2 = pd.DataFrame(
    data=pca_result_2, columns=[f"PCA_{i + 1}" for i in range(n_components)]
)
# Add PCA results to the original DataFrame. assign() overwrites existing
# PCA_* columns, so re-running this cell does not create duplicates, and the
# values are attached by position rather than by index label.
train_df_copy = train_df_copy.assign(
    **{name: pca_result_df_2[name].to_numpy() for name in pca_result_df_2.columns}
)

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(pca_result_2[:, 0], pca_result_2[:, 1], alpha=0.5)
plt.title("2D Scatter Plot of PCA Components")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.show()

In [None]:
print(pca_2.explained_variance_ratio_)

In [None]:
n_components = 3
# Fixed seed so the 3D projection is reproducible when a randomized solver is used.
pca_3 = PCA(n_components=n_components, random_state=42)
pca_result_3 = pca_3.fit_transform(tfidf_features)

In [None]:
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(
    pca_result_3[:, 0],
    pca_result_3[:, 1],
    pca_result_3[:, 2],
    c="blue",
    marker="o",
    edgecolors="k",
)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_zlabel("Principal Component 3")
ax.set_title("3D PCA Plot")
plt.show()

In [None]:
print(pca_3.explained_variance_ratio_)

### UMAP

In [None]:
import umap.umap_ as umap

n_components = 2
# UMAP is stochastic; a fixed seed keeps the embedding, and every clustering
# built on it, reproducible across runs.
umap_model = umap.UMAP(n_components=n_components, random_state=42)
umap_result_2 = umap_model.fit_transform(tfidf_features)
umap_result_df_2 = pd.DataFrame(
    data=umap_result_2, columns=[f"UMAP_{i + 1}" for i in range(n_components)]
)
# Add UMAP results to the original DataFrame. assign() overwrites existing
# UMAP_* columns, so re-running this cell does not create duplicates, and the
# values are attached by position rather than by index label.
train_df_copy = train_df_copy.assign(
    **{name: umap_result_df_2[name].to_numpy() for name in umap_result_df_2.columns}
)

In [None]:
plt.scatter(
    umap_result_2[:, 0], umap_result_2[:, 1], c="blue", marker="o", edgecolors="k"
)
plt.xlabel(f"UMAP Component 1")
plt.ylabel(f"UMAP Component 2")
plt.title(f"UMAP Plot ({n_components} Components)")
plt.show()

In [None]:
n_components = 3
# Fixed seed so the 3D UMAP embedding is reproducible across runs.
umap_model = umap.UMAP(n_components=n_components, random_state=42)
umap_result_3 = umap_model.fit_transform(tfidf_features)

In [None]:
# Create a scatter plot
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d")
scatter = ax.scatter(
    umap_result_3[:, 0],
    umap_result_3[:, 1],
    umap_result_3[:, 2],
    c="blue",
    marker="o",
    edgecolors="k",
)
ax.set_xlabel("UMAP Component 1")
ax.set_ylabel("UMAP Component 2")
ax.set_zlabel("UMAP Component 3")
plt.title(f"UMAP Plot ({n_components} Components)")
plt.show()

### Clustering

#### KMeans

In [None]:
from sklearn.cluster import KMeans

n_clusters = 6

kmeans_pca = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans_pca_labels = kmeans_pca.fit_predict(pca_result_2)
labels = kmeans_pca_labels

# Add KMeans cluster labels to the original DataFrame
train_df_copy["KMeans_PCA_Cluster_Labels"] = kmeans_pca_labels

In [None]:
# Visualize the clustering results
plt.scatter(
    pca_result_2[:, 0],
    pca_result_2[:, 1],
    c=kmeans_pca_labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)
plt.scatter(
    kmeans_pca.cluster_centers_[:, 0],
    kmeans_pca.cluster_centers_[:, 1],
    s=200,
    c="red",
    marker="X",
    label="Centroids",
)
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.title("K-Means Clustering")
plt.legend()

unique_labels = set(labels)
print("Number of clusters:", len(unique_labels) - (1 if -1 in unique_labels else 0))
for cluster_label in unique_labels:
    if cluster_label == -1:
        print(f"Noise points: {sum(labels == cluster_label)}")
    else:
        print(f"Cluster {cluster_label}: {sum(labels == cluster_label)} points")

plt.show()

In [None]:
# Set the range of cluster numbers to try
k_values = range(1, 15)

# Calculate the sum of squared distances for each k
inertia_values = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(pca_result_2)
    inertia_values.append(kmeans.inertia_)

# Plot the Elbow Method
plt.plot(k_values, inertia_values, marker="o")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Sum of Squared Distances")
plt.title("Elbow Method for Optimal K")
plt.show()

In [None]:
n_clusters = 6

# Apply KMeans on UMAP data
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans_umap_labels = kmeans.fit_predict(umap_result_2)
labels = kmeans_umap_labels

# Add KMeans cluster labels to the original DataFrame
train_df_copy["KMeans_UMAP_Cluster_Labels"] = labels

In [None]:
# Scatter of the 2D UMAP embedding coloured by KMeans cluster, with centroids.
scatter = plt.scatter(
    umap_result_2[:, 0],
    umap_result_2[:, 1],
    c=labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)
plt.scatter(
    kmeans.cluster_centers_[:, 0],
    kmeans.cluster_centers_[:, 1],
    s=200,
    c="red",
    marker="X",
    label="Centroids",
)
# The axes are the UMAP embedding (the data clustered here), not PCA.
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")
plt.title("K-Means Clustering")
plt.legend()
plt.colorbar(scatter)

# Cluster sizes; -1 would denote noise (not produced by KMeans, kept for
# symmetry with the DBSCAN cells).
unique_labels = set(labels)
print("Number of clusters:", len(unique_labels) - (1 if -1 in unique_labels else 0))
for cluster_label in unique_labels:
    if cluster_label == -1:
        print(f"Noise points: {sum(labels == cluster_label)}")
    else:
        print(f"Cluster {cluster_label}: {sum(labels == cluster_label)} points")

plt.show()

In [None]:
k_values = range(1, 11)

inertia_values = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(umap_result_2)
    inertia_values.append(kmeans.inertia_)

plt.plot(k_values, inertia_values, marker="o")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Sum of Squared Distances")
plt.title("Elbow Method for Optimal K")
plt.show()

#### DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Adjust DBSCAN
# Experiment with different values for eps and min_samples
# Grid search over (eps, min_samples) on the 2D PCA projection. The selection
# criterion is the number of clusters found (noise label -1 excluded); the
# first combination reaching the maximum is kept because of the strict ">".
eps_values = [0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01]
min_samples_values = [5, 10, 25, 50, 100, 250]

best_num_clusters = None
best_eps = None
best_min_samples = None

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        dbscan_pca_labels = dbscan.fit_predict(pca_result_2)
        # Number of distinct labels, not counting the noise label -1
        num_clusters = len(set(dbscan_pca_labels)) - (
            1 if -1 in dbscan_pca_labels else 0
        )

        if best_num_clusters is None or num_clusters > best_num_clusters:
            best_num_clusters = num_clusters
            best_eps = eps
            best_min_samples = min_samples

In [None]:
print(best_eps)
print(best_min_samples)

In [None]:
# Apply DBSCAN on PCA data
# Use the best parameters found by the grid search (best_eps, best_min_samples)
dbscan = DBSCAN(eps=best_eps, min_samples=best_min_samples)
dbscan_pca_labels = dbscan.fit_predict(pca_result_2)
labels = dbscan_pca_labels

# Add DBSCAN cluster labels to the original DataFrame (-1 marks noise points)
train_df_copy["DBSCAN_PCA_Cluster_Labels"] = labels

In [None]:
unique_labels = set(labels)

fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    pca_result_2[:, 0],
    pca_result_2[:, 1],
    c=labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)

cbar = plt.colorbar(scatter, ticks=np.unique(labels))
cbar.set_label("Cluster Labels", rotation=270, labelpad=15)

ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.set_title("DBSCAN Clustering")

print("Number of clusters:", len(unique_labels) - (1 if -1 in unique_labels else 0))
for cluster_label in unique_labels:
    if cluster_label == -1:
        print(f"Noise points: {sum(labels == cluster_label)}")
    else:
        print(f"Cluster {cluster_label}: {sum(labels == cluster_label)} points")

plt.show()

In [None]:
from sklearn.cluster import DBSCAN

# Adjust DBSCAN
# Experiment with different values for eps and min_samples on the 2D UMAP
# embedding. The selection criterion is the number of clusters found (noise
# label -1 excluded); the first combination reaching the maximum is kept.
eps_values = [1, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01]
min_samples_values = [5, 10, 25, 50, 100, 250]

best_num_clusters = None
best_eps = None
best_min_samples = None

for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        # Use a scratch name here: reusing `dbscan_pca_labels` would overwrite
        # the PCA DBSCAN labels that the comparison cells and plots read later.
        candidate_labels = dbscan.fit_predict(umap_result_2)
        num_clusters = len(set(candidate_labels)) - (
            1 if -1 in candidate_labels else 0
        )

        # Update if better parameters found
        if best_num_clusters is None or num_clusters > best_num_clusters:
            best_num_clusters = num_clusters
            best_eps = eps
            best_min_samples = min_samples

In [None]:
print(best_eps)
print(best_min_samples)

In [None]:
# Apply DBSCAN on UMAP data
# NOTE(review): eps/min_samples are hard-coded here; the best_eps and
# best_min_samples found by the UMAP grid search are not used. Confirm this is intended.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_umap_labels = dbscan.fit_predict(umap_result_2)
labels = dbscan_umap_labels

# Add DBSCAN cluster labels to the original DataFrame (-1 marks noise points)
train_df_copy["DBSCAN_UMAP_Cluster_Labels"] = labels

In [None]:
unique_labels = set(labels)

fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    umap_result_2[:, 0],
    umap_result_2[:, 1],
    c=labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)

# Add colorbar
cbar = plt.colorbar(scatter, ticks=np.unique(labels))
cbar.set_label("Cluster Labels", rotation=270, labelpad=15)

ax.set_xlabel("UMAP Component 1")
ax.set_ylabel("UMAP Component 2")
ax.set_title("DBSCAN Clustering")

print("Number of clusters:", len(unique_labels) - (1 if -1 in unique_labels else 0))
for cluster_label in unique_labels:
    if cluster_label == -1:
        print(f"Noise points: {sum(labels == cluster_label)}")
    else:
        print(f"Cluster {cluster_label}: {sum(labels == cluster_label)} points")

plt.show()

#### Hierarchical Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Apply Hierarchical Clustering (Agglomerative Clustering) on PCA data
n_clusters = 6
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters)
hc_pca_labels = agg_clustering.fit_predict(pca_result_2)
labels = hc_pca_labels

# Add hierarchical-clustering labels to the original DataFrame
train_df_copy["HC_PCA_Cluster_Labels"] = hc_pca_labels

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    pca_result_2[:, 0],
    pca_result_2[:, 1],
    c=labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)

cbar = plt.colorbar(scatter, ticks=np.arange(n_clusters))
cbar.set_label("Cluster Labels", rotation=270, labelpad=15)

ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.set_title("Hierarchical Clustering (Agglomerative Clustering)")

print(f"Number of clusters: {n_clusters}")
for cluster_label in range(n_clusters):
    print(f"Cluster {cluster_label}: {sum(labels == cluster_label)} points")

plt.show()

In [None]:
# Apply Hierarchical Clustering (Agglomerative Clustering) on UMAP data
n_clusters = 6
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters)
hc_umap_labels = agg_clustering.fit_predict(umap_result_2)
labels = hc_umap_labels

# Add hierarchical-clustering labels to the original DataFrame
train_df_copy["HC_UMAP_Cluster_Labels"] = labels

In [None]:
# Scatter of the 2D UMAP embedding coloured by hierarchical-clustering label.
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(
    umap_result_2[:, 0],
    umap_result_2[:, 1],
    c=labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)

cbar = plt.colorbar(scatter, ticks=np.arange(n_clusters))
cbar.set_label("Cluster Labels", rotation=270, labelpad=15)

# The axes are the UMAP embedding (the data clustered here), not PCA.
ax.set_xlabel("UMAP Component 1")
ax.set_ylabel("UMAP Component 2")
ax.set_title("Hierarchical Clustering (Agglomerative Clustering)")

print(f"Number of clusters: {n_clusters}")
for cluster_label in range(n_clusters):
    print(f"Cluster {cluster_label}: {sum(labels == cluster_label)} points")

plt.show()

In [None]:
# Compare the resulting labels PCA
comparison_df = pd.DataFrame(
    {
        "id": train_df_copy["id"],
        "KMeans_Labels": kmeans_pca_labels,
        "DBSCAN_Labels": dbscan_pca_labels,
        "HC_Labels": hc_pca_labels,
    }
)

In [None]:
# Compare the resulting labels UMAP
comparison_df_umap = pd.DataFrame(
    {
        "id": train_df_copy["id"],
        "KMeans_Labels": kmeans_umap_labels,
        "DBSCAN_Labels": dbscan_umap_labels,
        "HC_Labels": hc_umap_labels,
    }
)

In [None]:
comparison_df.head(10)

In [None]:
from sklearn.metrics import adjusted_rand_score

# Cluster sizes per method on the PCA projection
for method in ["KMeans_Labels", "DBSCAN_Labels", "HC_Labels"]:
    cluster_counts = comparison_df[method].value_counts()
    print(f"\nCluster Counts for {method}:\n{cluster_counts}")

# Raw label equality. Cluster ids are assigned arbitrarily by each algorithm,
# so these counts depend on the numbering and are kept for reference only;
# the adjusted Rand index below is the numbering-independent measure.
same_labels_count = (
    comparison_df["KMeans_Labels"] == comparison_df["DBSCAN_Labels"]
) & (comparison_df["KMeans_Labels"] == comparison_df["HC_Labels"])
same1_2_labels_count = comparison_df["KMeans_Labels"] == comparison_df["DBSCAN_Labels"]
same1_3_labels_count = comparison_df["KMeans_Labels"] == comparison_df["HC_Labels"]
same2_3_labels_count = comparison_df["DBSCAN_Labels"] == comparison_df["HC_Labels"]
print(
    f"\nNumber of Elements Labeled the Same Among ALL Methods: {same_labels_count.sum()}"
)
print(
    f"\nNumber of Elements Labeled the Same Among KMeans and DBSCAN Methods: {same1_2_labels_count.sum()}"
)
print(
    f"\nNumber of Elements Labeled the Same Among KMeans and HC Methods: {same1_3_labels_count.sum()}"
)
print(
    f"\nNumber of Elements Labeled the Same Among DBSCAN and HC Methods: {same2_3_labels_count.sum()}"
)

# Agreement between partitions irrespective of how clusters are numbered
# (1.0 = identical partitions, about 0 = chance level). DBSCAN noise (-1) is
# treated as one more group.
for first, second in [
    ("KMeans_Labels", "DBSCAN_Labels"),
    ("KMeans_Labels", "HC_Labels"),
    ("DBSCAN_Labels", "HC_Labels"),
]:
    ari = adjusted_rand_score(comparison_df[first], comparison_df[second])
    print(f"\nAdjusted Rand Index between {first} and {second}: {ari:.3f}")

In [None]:
from sklearn.metrics import adjusted_rand_score

# Cluster sizes per method on the UMAP embedding
for method in ["KMeans_Labels", "DBSCAN_Labels", "HC_Labels"]:
    cluster_counts = comparison_df_umap[method].value_counts()
    print(f"\nCluster Counts for {method}:\n{cluster_counts}")

# Raw label equality. Cluster ids are assigned arbitrarily by each algorithm,
# so these counts depend on the numbering and are kept for reference only;
# the adjusted Rand index below is the numbering-independent measure.
same_labels_count = (
    comparison_df_umap["KMeans_Labels"] == comparison_df_umap["DBSCAN_Labels"]
) & (comparison_df_umap["KMeans_Labels"] == comparison_df_umap["HC_Labels"])
same1_2_labels_count = (
    comparison_df_umap["KMeans_Labels"] == comparison_df_umap["DBSCAN_Labels"]
)
same1_3_labels_count = (
    comparison_df_umap["KMeans_Labels"] == comparison_df_umap["HC_Labels"]
)
same2_3_labels_count = (
    comparison_df_umap["DBSCAN_Labels"] == comparison_df_umap["HC_Labels"]
)
print(
    f"\nNumber of Elements Labeled the Same Among ALL Methods: {same_labels_count.sum()}"
)
print(
    f"\nNumber of Elements Labeled the Same Among KMeans and DBSCAN Methods: {same1_2_labels_count.sum()}"
)
print(
    f"\nNumber of Elements Labeled the Same Among KMeans and HC Methods: {same1_3_labels_count.sum()}"
)
print(
    f"\nNumber of Elements Labeled the Same Among DBSCAN and HC Methods: {same2_3_labels_count.sum()}"
)

# Agreement between partitions irrespective of how clusters are numbered
# (1.0 = identical partitions, about 0 = chance level). DBSCAN noise (-1) is
# treated as one more group.
for first, second in [
    ("KMeans_Labels", "DBSCAN_Labels"),
    ("KMeans_Labels", "HC_Labels"),
    ("DBSCAN_Labels", "HC_Labels"),
]:
    ari = adjusted_rand_score(comparison_df_umap[first], comparison_df_umap[second])
    print(f"\nAdjusted Rand Index between {first} and {second}: {ari:.3f}")

In [None]:
plt.figure(figsize=(18, 5))

# Plot KMeans clustering
plt.subplot(1, 3, 1)
plt.scatter(
    pca_result_2[:, 0],
    pca_result_2[:, 1],
    c=kmeans_pca_labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)
plt.title("KMeans Clustering")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")

# Plot DBSCAN clustering
plt.subplot(1, 3, 2)
plt.scatter(
    pca_result_2[:, 0],
    pca_result_2[:, 1],
    c=dbscan_pca_labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)
plt.title("DBSCAN Clustering")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")

# Plot Hierarchical Clustering
plt.subplot(1, 3, 3)
plt.scatter(
    pca_result_2[:, 0],
    pca_result_2[:, 1],
    c=hc_pca_labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)
plt.title("Hierarchical Clustering")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")

plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(18, 5))

# Plot KMeans clustering
plt.subplot(1, 3, 1)
plt.scatter(
    umap_result_2[:, 0],
    umap_result_2[:, 1],
    c=kmeans_umap_labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)
plt.title("KMeans Clustering")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Plot DBSCAN clustering
plt.subplot(1, 3, 2)
plt.scatter(
    umap_result_2[:, 0],
    umap_result_2[:, 1],
    c=dbscan_umap_labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)
plt.title("DBSCAN Clustering")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

# Plot Hierarchical Clustering
plt.subplot(1, 3, 3)
plt.scatter(
    umap_result_2[:, 0],
    umap_result_2[:, 1],
    c=hc_umap_labels,
    cmap="viridis",
    marker="o",
    edgecolors="k",
)
plt.title("Hierarchical Clustering")
plt.xlabel("UMAP Component 1")
plt.ylabel("UMAP Component 2")

plt.tight_layout()
plt.show()

In [None]:
columns_to_show = ["comment_text"]
train_df_copy.head(1)

In [None]:
def show_cluster_data(cluster_column):
    """Print up to five sample rows for every cluster in `cluster_column`.

    Rows come from the global `train_df_copy`, grouped by the given label
    column; only the columns listed in the global `columns_to_show` are printed.
    """
    separator = "-" * 50
    for cluster_label, members in train_df_copy.groupby(cluster_column):
        print(f"Cluster {cluster_label}:\n")
        for _, sample in members.head(5).iterrows():
            # Selected columns of this row, followed by one blank line
            print(sample[columns_to_show], end="\n\n")
        print(separator)

In [None]:
show_cluster_data("KMeans_PCA_Cluster_Labels")

In [None]:
show_cluster_data("DBSCAN_PCA_Cluster_Labels")

In [None]:
show_cluster_data("HC_PCA_Cluster_Labels")