In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Column groups: the id/text columns, then the label columns.
columns_base = ['id', 'comment_text']
columns_type = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
columns_all = [*columns_base, *columns_type]

# The training frame keeps id, text and labels; the test frame keeps id and text only.
train_df = pd.read_csv('./input/train.csv', usecols=columns_all)
test_df = pd.read_csv('./input/test.csv', usecols=columns_base)

In [None]:
# CONSTANTS
# Notebook-wide switches. None of them is referenced by the loading/EDA cells
# that immediately follow, so the notes below are inferred from the names only
# and should be confirmed against the cells that actually read them.
NUM_OF_ROWS = 10_000  # presumably the number of rows kept when subsampling — TODO confirm
RANDOM_SAMPLE = False  # presumably selects random vs. ordered subsampling — TODO confirm
USE_TEST_DATASET = False  # presumably toggles use of test_df downstream — TODO confirm
RUN_FULL_PCA = False  # presumably gates an expensive full PCA run — TODO confirm

# EDA

In [None]:
# Preview the first five training rows (five is the default for head()).
train_df.head()

In [None]:
# --- Dataset size ------------------------------------------------------------
total_samples = len(train_df)
print("Size of train dataset:")
print(train_df.shape)

# --- Comments carrying no label at all ---------------------------------------
no_label_mask = train_df[columns_type].eq(0).all(axis=1)
rows_with_all_zeros = train_df.loc[no_label_mask]
print("\nCount of rows with all 0 types:", len(rows_with_all_zeros))

# Share of comments with at least one label (a fraction, rendered as a percent).
percentage_nonzero_types = 1 - (len(rows_with_all_zeros) / total_samples)
print("\nPercentage of rows with at least one non-zero type: {:.2%}".format(percentage_nonzero_types))

# --- Per-label totals and their share of the dataset -------------------------
type_count = train_df[columns_type].sum()
type_percentage = (type_count / total_samples) * 100
class_summary = pd.DataFrame({
    'Count': type_count,
    'Percentage': type_percentage.map('{:.2f}%'.format),
})
print("\nSum for each type with added value, percentage and labels:")
print(class_summary)

In [None]:
# Two-slice breakdown: comments with no label vs. comments with at least one.
comments_category = pd.DataFrame(
    [
        ('Good Comments', len(rows_with_all_zeros)),
        ('Bad Comments', len(train_df) - len(rows_with_all_zeros)),
    ],
    columns=['Category', 'Count'],
)

# Explicit figure/axes handles instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    comments_category['Count'],
    labels=comments_category['Category'],
    autopct='%1.2f%%',
    startangle=140,
)
ax.set_title('Distribution of Good and Bad Comments')
plt.show()

In [None]:
# For every label: count the comments that carry that label and no other, and
# keep the first such comment as an example row in `selected_rows_df`.

# Rows flagged with exactly one label. This does not depend on which label is
# being inspected, so it is computed once rather than on every iteration.
single_label_mask = train_df[columns_type].sum(axis=1) == 1

type_counts = {}
first_rows = []
for text_type in columns_type:
    # Comments whose only positive label is `text_type`.
    mask = (train_df[text_type] == 1) & single_label_mask
    count = mask.sum()
    type_counts[text_type] = count
    # First matching comment (zero rows when the label never occurs alone).
    first_appearance = train_df.loc[mask, columns_all].head(1)
    first_rows.append(first_appearance)

# Concatenate once. Growing a frame inside the loop re-copies it on every pass,
# and seeding it with an empty frame triggers pandas' FutureWarning about
# empty entries in concat.
if first_rows:
    selected_rows_df = pd.concat(first_rows, ignore_index=True)
else:
    # No label columns configured: fall back to an empty frame with the expected columns.
    selected_rows_df = pd.DataFrame(columns=columns_all)

print("Count of comments where only a specific type has 1 and others are 0:")
for text_type, count in type_counts.items():
    print(f"{text_type}: {count}")

In [None]:
# Build `selected_rows_df`: for every label, the first comment that carries
# that label and no other.

# "Exactly one label set" is the same for every label, so compute it once.
single_label_mask = train_df[columns_type].sum(axis=1) == 1

first_rows = []
for text_type in columns_type:
    mask = (train_df[text_type] == 1) & single_label_mask
    # Zero rows when no comment has `text_type` as its only label.
    first_appearance = train_df.loc[mask, columns_all].head(1)
    first_rows.append(first_appearance)

# A single concat avoids re-copying the frame on every iteration and avoids the
# FutureWarning pandas raises when concatenating onto an empty frame.
if first_rows:
    selected_rows_df = pd.concat(first_rows, ignore_index=True)
else:
    # No label columns configured: keep an empty frame with the expected columns.
    selected_rows_df = pd.DataFrame(columns=columns_all)

In [None]:
# Temporarily lift the column-width limit so each sample comment is shown in
# full; the option reverts when the block exits.
with pd.option_context('display.max_colwidth', None):
    display(selected_rows_df)