In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nltk
import joblib
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from pandarallel import pandarallel

sys.path.append('../')
pandarallel.initialize(progress_bar=True)

In [None]:
train_df = pd.read_csv('../input/train.csv', usecols=['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
test_df = pd.read_csv('../input/test.csv', usecols=['id', 'comment_text'])

# Rename columns in the DataFrame
columns_base = ['ID', 'Comment_Text']
columns_type = ['Is_Toxic', 'Is_Severe_Toxic', 'Is_Obscene', 'Is_Threat', 'Is_Insult', 'Is_Identity_Hate']
columns_all = columns_base + columns_type
train_df.columns = columns_all
test_df.columns = columns_base

In [None]:
# CONSTANTS
NUM_OF_ROWS = 10_000
RANDOM_SAMPLE = False
USE_TEST_DATASET = False
RUN_FULL_PCA = False

# EDA

In [None]:
train_df.head(5)

In [None]:
type_count = train_df[columns_type].sum()
total_samples = len(train_df)
type_percentage = (type_count / total_samples) * 100
print("Size of train dataset:")
print(train_df.shape)

rows_with_all_zeros = train_df[(train_df[columns_type] == 0).all(axis=1)]
print("\nCount of rows with all 0 types:", len(rows_with_all_zeros))

percentage_nonzero_types = 1 - (len(rows_with_all_zeros) / len(train_df))
print("\nPercentage of rows with at least one non-zero type: {:.2%}".format(percentage_nonzero_types))

class_summary = pd.DataFrame({'Count': type_count, 'Percentage': type_percentage})
class_summary['Percentage'] = class_summary['Percentage'].map('{:.2f}%'.format)
print("\nSum for each type with added value, percentage and labels:")
print(class_summary)

In [None]:
comments_category = pd.DataFrame({
    'Category': ['Good Comments', 'Bad Comments'],
    'Count': [len(rows_with_all_zeros), len(train_df) - len(rows_with_all_zeros)]
})

plt.figure(figsize=(8, 8))
plt.pie(comments_category['Count'], labels=comments_category['Category'], autopct='%1.2f%%', startangle=140)
plt.title('Distribution of Good and Bad Comments')
plt.show()

In [None]:
selected_rows_df = pd.DataFrame(columns=columns_all)
type_counts = {}
for text_type in columns_type:
    mask = (train_df[text_type] == 1) & (train_df[columns_type].sum(axis=1) == 1)
    count = mask.sum()
    type_counts[text_type] = count
    first_appearance = train_df[mask].head(1)
    selected_rows_df = pd.concat([selected_rows_df, first_appearance], ignore_index=True)

print("Count of comments where only a specific type has 1 and others are 0:")
for text_type, count in type_counts.items():
    print(f"{text_type}: {count}")

In [None]:
selected_rows_df = pd.DataFrame(columns=columns_all)
for text_type in columns_type:
    mask = (train_df[text_type] == 1) & (train_df[columns_type].sum(axis=1) == 1)
    first_appearance = train_df[mask].head(1)
    selected_rows_df = pd.concat([selected_rows_df, first_appearance], ignore_index=True)

In [None]:
with pd.option_context('display.max_colwidth', None):
  display(selected_rows_df)

# Preprocessing

In [None]:
# Filter all hate comments for model training
hate_comments_df = train_df[train_df[columns_type].any(axis=1)].copy().reset_index(drop=True)

# Filter the same amount of good comments for model training
good_comments_df = train_df[train_df[columns_type].eq(0).all(axis=1)].sample(n=len(hate_comments_df),random_state=42).copy().reset_index(drop=True)

# Concatenate 50% hate and 50% good comments and shuffle
train_df_copy = pd.concat([hate_comments_df, good_comments_df], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
train_df_copy.shape

In [None]:
from src.preprocessing import do_preprocessing

train_df_copy['Comment_Text_Preprocessed'] = train_df_copy["Comment_Text"].parallel_apply(lambda d: " ".join(do_preprocessing(d)))

In [None]:
X = train_df_copy['Comment_Text_Preprocessed']
y = train_df_copy[columns_type]

tfidf_vectorizer = TfidfVectorizer(max_features=10_000, max_df=0.9, smooth_idf=True, use_idf=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(X)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

In [None]:
# Save tfidf_vectorizer
joblib.dump(tfidf_vectorizer, './models/tfidf_vectorizer.joblib')

# Load tfidf_vectorizer
# tfidf_vectorizer = joblib.load('./models/tfidf_vectorizer.joblib')

In [None]:
train_df_copy_tfidf = pd.concat([train_df_copy, tfidf_df], axis=1)
print(train_df_copy.shape)
print(train_df_copy_tfidf.shape)
print(f"Unique words count: {len(feature_names)}")

In [None]:
# Display the top 100 most popular words
top_100_words = tfidf_df.sum().sort_values(ascending=False).head(100)
print(top_100_words.to_string())

In [None]:
# Check for any non numeric values in the features dataframe
tfidf_features = train_df_copy_tfidf[feature_names]
numeric_df = tfidf_features.apply(pd.to_numeric, errors='coerce')
nan_values = numeric_df.isna().sum().sum()

if nan_values == 0:
    print("All values in the DataFrame are numeric.")
else:
    print(f"There are {nan_values} non-numeric values in the DataFrame.")

# PCA

In [None]:
from sklearn.decomposition import PCA

n_components = 2
pca_2 = PCA(n_components=n_components)
pca_result_2 = pca_2.fit_transform(tfidf_features)
pca_result_df_2 = pd.DataFrame(data=pca_result_2, columns=[f'PCA_{i + 1}' for i in range(n_components)])

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(pca_result_2[:, 0], pca_result_2[:, 1], alpha=0.5)
plt.title('2D Scatter Plot of PCA Components')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

In [None]:
print(pca_2.explained_variance_ratio_)

In [None]:
n_components = 3
pca_3 = PCA(n_components=n_components)
pca_result_3 = pca_3.fit_transform(tfidf_features)

In [None]:
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(pca_result_3[:, 0], pca_result_3[:, 1], pca_result_3[:, 2], c='blue', marker='o', edgecolors='k')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('3D PCA Plot')
plt.show()

In [None]:
print(pca_3.explained_variance_ratio_)

In [None]:
# Calculate PCA with 0.95 explained variance
# pca = PCA(0.95)
# pca_result = pca.fit_transform(tfidf_features)
# exp_var_pca = pca.explained_variance_ratio_
# cum_sum_eigenvalues = np.cumsum(exp_var_pca)

In [None]:
# plt.bar(range(0, len(exp_var_pca)), exp_var_pca, alpha=0.5, align='center', label='Individual explained variance')
# plt.step(range(0, len(cum_sum_eigenvalues)), cum_sum_eigenvalues, where='mid', label='Cumulative explained variance')
# plt.ylabel('Explained variance ratio')
# plt.xlabel('Principal component index')
# plt.legend(loc='best')
# plt.tight_layout()
# 
# print(f"Number of components for 0.95 explained variance: {len(cum_sum_eigenvalues)}")
# plt.show()

# Clustering

In [None]:
from sklearn.cluster import KMeans
n_clusters = 7

# Apply KMeans on UMAP data
kmeans_pca = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans_pca_labels = kmeans_pca.fit_predict(pca_result_2)
labels = kmeans_pca_labels

In [None]:
# Visualize the clustering results
scatter = plt.scatter(pca_result_2[:, 0], pca_result_2[:, 1], c=kmeans_pca_labels, cmap='viridis', marker='o', edgecolors='k')
plt.scatter(kmeans_pca.cluster_centers_[:, 0], kmeans_pca.cluster_centers_[:, 1], s=200, c='red', marker='X',label='Centroids')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('K-Means Clustering')
plt.legend()

unique_labels = set(labels)
print("Number of clusters:", len(unique_labels) - (1 if -1 in unique_labels else 0))
for cluster_label in unique_labels:
    if cluster_label == -1:
        print(f"Noise points: {sum(labels == cluster_label)}")
    else:
        print(f"Cluster {cluster_label}: {sum(labels == cluster_label)} points")

plt.colorbar(scatter)
plt.show()

# Model Training

In [None]:
X.head()

In [None]:
y.head()

In [None]:
class_labels = ['Toxic', 'Severe_Toxic', 'Obscene', 'Threat', 'Insult', 'Identity_Hate']

In [None]:
# Split Dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
X_test_tfidf.shape

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
clf = MultiOutputClassifier(lr)
clf = clf.fit(X_train_tfidf, y_train)

In [None]:
# Save model
joblib.dump(clf, './models/linear_regression_classifier_model.joblib')

# Load model
# clf = joblib.load('./models/linear_regression_classifier_model.joblib')

In [None]:
# Get the class labels for each classifier
for i, estimator in enumerate(clf.estimators_):
    print(f"Classifier {i + 1} Class Labels:", estimator.classes_)

In [None]:
prediction = clf.predict(X_test_tfidf)

In [None]:
from sklearn.metrics import accuracy_score
print('Accuracy Score: ', accuracy_score(y_test, prediction))

In [None]:
from sklearn.metrics import hamming_loss
print('Hamming Loss: ', round(hamming_loss(y_test, prediction),2))

In [None]:
sample_text = ["some toxic text"]
sample_text_tfidf = tfidf_vectorizer.transform(sample_text)
sample_text_pred_prob = clf.predict_proba(sample_text_tfidf)
prediction_df = pd.DataFrame()
for i, output_name in enumerate(class_labels):
    prediction_df[output_name] = sample_text_pred_prob[i][:, 1]

In [None]:
prediction_df

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, classification_report

base_classifier = RandomForestClassifier(random_state=42)
multi_output_classifier = MultiOutputClassifier(base_classifier)
multi_output_classifier = multi_output_classifier.fit(X_train_tfidf, y_train)

In [42]:
# Save model
joblib.dump(multi_output_classifier, './models/random_forrest_classifier_model.joblib')

# Load model
# multi_output_classifier = joblib.load('./models/random_forrest_classifier_model.joblib')

['./models/random_forrest_classifier_model.joblib']

In [43]:
y_pred = multi_output_classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_report_str)

Accuracy: 0.6400616332819723
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.79      0.84      3100
           1       0.33      0.05      0.08       311
           2       0.86      0.77      0.81      1652
           3       0.77      0.09      0.17       106
           4       0.74      0.61      0.67      1550
           5       0.63      0.21      0.31       293

   micro avg       0.84      0.68      0.75      7012
   macro avg       0.70      0.42      0.48      7012
weighted avg       0.81      0.68      0.73      7012
 samples avg       0.35      0.33      0.33      7012



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
prediction_probabilities = multi_output_classifier.predict_proba(X_test_tfidf)
prediction_df = pd.DataFrame()
for i, output_name in enumerate(class_labels):
    prediction_df[output_name] = prediction_probabilities[i][:, 1]

In [45]:
prediction_df.head()

Unnamed: 0,Toxic,Severe_Toxic,Obscene,Threat,Insult,Identity_Hate
0,1.0,0.158024,0.99,0.0,0.91,0.38
1,0.78,0.1,0.603333,0.01,0.57,0.09
2,0.37,0.0,0.03,0.02,0.02,0.07
3,0.099287,0.00013,0.042059,0.0,0.044141,0.000707
4,0.9,0.11,0.84,0.03,0.613333,0.06


In [46]:
sample_text = ["some toxic text"]
sample_text_tfidf = tfidf_vectorizer.transform(sample_text)
sample_text_pred_prob = multi_output_classifier.predict_proba(sample_text_tfidf)
prediction_df = pd.DataFrame()
for i, output_name in enumerate(class_labels):
    prediction_df[output_name] = sample_text_pred_prob[i][:, 1]

In [47]:
prediction_df

Unnamed: 0,Toxic,Severe_Toxic,Obscene,Threat,Insult,Identity_Hate
0,0.112715,0.000984,0.10756,0.0,0.009571,0.001424


# Prediction

In [48]:
columns_submission = ['id','toxic','severe_toxic','obscene','threat','insult','identity_hate']

In [49]:
test_df.head()

Unnamed: 0,ID,Comment_Text
0,0001ea8717f6de06,Thank you for understanding. I think very high...
1,000247e83dcc1211,:Dear god this site is horrible.
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig..."
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ..."
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l..."


In [50]:
# Good comment
test_df.loc[test_df['ID'] == '00177176f33f587e']

Unnamed: 0,ID,Comment_Text
28,00177176f33f587e,== Can you work your magic? == \n\n Hi. I was...


In [51]:
# Bad comment
test_df.loc[test_df['ID'] == '0013fed3aeae76b7']

Unnamed: 0,ID,Comment_Text
24,0013fed3aeae76b7,DJ Robinson is gay as hell! he sucks his dick ...


In [52]:
test_text_tfidf = tfidf_vectorizer.transform(test_df['Comment_Text'])

In [53]:
test_text_tfidf_prob = clf.predict_proba(test_text_tfidf)
prediction_df = pd.DataFrame()
for i, output_name in enumerate(class_labels):
    prediction_df[output_name] = test_text_tfidf_prob[i][:, 1]

In [54]:
result_df = pd.concat([test_df['ID'], prediction_df], axis=1)
result_df.columns = columns_submission
result_df.to_csv('../output/submission.csv', index=False)
result_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,0.209416,0.009946,0.037403,0.005073,0.050169,0.011861
1,000247e83dcc1211,0.764104,0.022871,0.19559,0.010782,0.284611,0.024403
2,0002f87b16116a7f,0.460721,0.010187,0.088454,0.007393,0.089566,0.026021
3,0003e1cccfd5a40a,0.097302,0.011725,0.08325,0.006446,0.073538,0.013481
4,00059ace3e3e9a53,0.069654,0.007066,0.0494,0.004337,0.046697,0.007273


In [55]:
test_text_tfidf_prob = multi_output_classifier.predict_proba(test_text_tfidf)
prediction_df = pd.DataFrame()
for i, output_name in enumerate(class_labels):
    prediction_df[output_name] = test_text_tfidf_prob[i][:, 1]

In [56]:
result_df = pd.concat([test_df['ID'], prediction_df], axis=1)
result_df.columns = columns_submission
result_df.to_csv('../output/submission_random_forrest.csv', index=False)
result_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,0.07,0.0,0.01,0.0,0.04,0.0
1,000247e83dcc1211,0.442877,0.000405,0.191429,0.01,0.201587,0.021954
2,0002f87b16116a7f,0.28,0.0,0.06,0.0,0.06,0.0
3,0003e1cccfd5a40a,0.14,0.0,0.05,0.0,0.02,0.0
4,00059ace3e3e9a53,0.06,0.0,0.0,0.0,0.03,0.0
