In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
# Load the zipped train and test CSVs, using the 'id' column as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
        '/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
    )
)


In [None]:
# Lift pandas' row-display limit so printed frames/series are never truncated.
pd.options.display.max_rows = None


In [None]:
# Row-wise concatenation of the train and test frames, taken before any text
# cleaning is applied to either of them.
# NOTE(review): `df` is not referenced again in the visible cells -- confirm it
# is still needed before keeping this copy in memory.
df = pd.concat([df_train, df_test])


In [None]:
# Lower-case the comment text of both frames in place.
for frame in (df_train, df_test):
    frame['comment_text'] = frame['comment_text'].str.lower()


In [None]:
# Print the training frame's column dtypes, non-null counts and memory usage.
df_train.info()


In [None]:
import re

def remove_special_characters(text):
    """Strip URLs, punctuation, digits and CJK ideographs from a comment.

    Every removed span is replaced by a space and whitespace is normalised as
    the very last step, so the result has single spaces between tokens and no
    leading or trailing whitespace.

    Args:
        text: Raw comment text. Non-string values (for example the float NaN
            pandas uses for a missing cell) are treated as an empty comment.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        return ''
    # URLs have to go first: once punctuation is stripped, "://" and "." are
    # gone and a URL can no longer be recognised.
    text = re.sub(r'http\S+', ' ', text)
    # Anything that is neither a word character nor whitespace.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d', ' ', text)
    # CJK Unified Ideographs count as word characters, so drop them explicitly.
    text = re.sub(r'[\u4e00-\u9fff]+', ' ', text)
    # Collapse whitespace only after all substitutions; doing it earlier leaves
    # runs of spaces (and leading/trailing blanks) where digits or ideographs
    # were removed.
    return re.sub(r'\s+', ' ', text).strip()

# Clean the comment text of both frames in place, then print a sample of the
# cleaned training comments.
for frame in (df_train, df_test):
    frame['comment_text'] = frame['comment_text'].apply(remove_special_characters)

print(df_train['comment_text'].head(100))


In [None]:
import string 
from nltk import word_tokenize

# Store the NLTK word tokens of every cleaned comment in a 'tokens' column.
# NOTE(review): 'tokens' is not read by any later visible cell -- confirm it is needed.
for frame in (df_train, df_test):
    frame['tokens'] = frame['comment_text'].apply(word_tokenize)


In [None]:
from sklearn.model_selection import train_test_split
# Split df_train 80/20 into a training part and a held-out validation part;
# the fixed random_state makes the split reproducible across runs.
train, valid = train_test_split(df_train, train_size=0.8, random_state=42)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

# TF-IDF over unigrams and bigrams, ignoring English stop words, terms seen in
# fewer than 3 documents and terms seen in more than 90% of documents.
# The on/off switches are real booleans: scikit-learn's parameter validation
# deprecated integer stand-ins in 1.2 and rejects them from 1.4 onwards.
vec = TfidfVectorizer(ngram_range=(1, 2),
                      min_df=3,
                      max_df=0.9,
                      strip_accents='unicode',
                      use_idf=True,
                      smooth_idf=True,
                      sublinear_tf=True,
                      binary=True,
                      stop_words='english')
# NOTE: the vocabulary and IDF weights are learned from all of df_train, which
# includes the rows that make up `valid`.
trn_term_doc = vec.fit_transform(df_train['comment_text'])
val_term_doc = vec.transform(valid['comment_text'])
test_term_doc = vec.transform(df_test['comment_text'])


In [None]:
# Short aliases for the training and validation term-document matrices;
# `x` is the global read by probability() and get_model().
x, val_x = trn_term_doc, val_term_doc


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


In [None]:
# Added to both class-conditional probabilities in get_model() before their
# ratio is taken and logged.
epsilon = 1e-9  # Define epsilon as a small positive constant


In [None]:
# Define a function to calculate the probability of each word given a specific class (toxic or non-toxic)
def probability(y_i, y):
    """Smoothed per-feature average of `x` over the rows labelled `y_i`.

    Reads the module-level feature matrix `x`, whose rows must line up with `y`.

    Args:
        y_i: Label value selecting the rows (1 or 0 as called from get_model).
        y: Array of labels, one per row of `x`.

    Returns:
        (column sums of x over the selected rows + 1) / (number of selected rows + 1).
        The +1 terms keep the result non-zero for features absent from the
        class and avoid dividing by zero when no row carries the label.
    """
    in_class = (y == y_i)
    feature_totals = x[in_class].sum(0)
    return (feature_totals + 1) / (in_class.sum() + 1)


In [None]:
# Define a function to train a logistic regression model for binary classification (toxic or non-toxic)
def get_model(y):
    """Fit a logistic regression for one label on ratio-scaled features.

    Reads the module-level feature matrix `x` and the constant `epsilon`.

    Args:
        y: pandas Series of 0/1 labels, one per row of `x`.

    Returns:
        (fitted LogisticRegression, loga), where `loga` is the per-feature log
        of the ratio between the smoothed feature averages of the positive and
        the negative class. The same `loga` must be multiplied into any matrix
        passed to the fitted model at prediction time.
    """
    y = y.values
    # Per-feature log-ratio: positive-class average over negative-class average.
    positive = probability(1, y) + epsilon
    negative = probability(0, y) + epsilon
    loga = np.log(positive / negative)
    # L2-regularised logistic regression (liblinear solver) on the features
    # scaled column-wise by the log-ratio.
    classifier = LogisticRegression(penalty='l2',
                                    C=1.0,
                                    solver='liblinear',
                                    max_iter=100,
                                    random_state=42)
    classifier.fit(x.multiply(loga), y)
    return classifier, loga


In [None]:
classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Fit the per-class models on the 80% split only. trn_term_doc holds one row per
# df_train comment and `valid` is drawn from df_train, so training on all of it
# would score the models on comments they were fitted on. Keep just the rows of
# trn_term_doc that belong to `train`, so `x` lines up with train_labels
# (get_indexer requires df_train's 'id' index to be unique).
x = trn_term_doc[df_train.index.get_indexer(train.index)]
train_labels = train[classes]
valid_labels = valid[classes]


In [None]:
# Per-class fitted models and validation ROC AUC scores, keyed by class name
model = {}           # class name -> (fitted classifier, log-ratio vector)
ROC_AUC_Scores = {}  # class name -> ROC AUC on the validation split
for col in classes:
    print(col)

    # Train the model for the current class and keep it for test-time prediction
    model_trained, loga = get_model(train_labels[col])
    model[col] = (model_trained, loga)

    # ROC AUC measures how well the scores rank positives above negatives, so
    # it needs the positive-class probability; hard 0/1 labels from predict()
    # reduce the curve to a single operating point.
    preds = model_trained.predict_proba(val_x.multiply(loga))[:, 1]

    roc_auc = roc_auc_score(valid_labels[col], preds)
    ROC_AUC_Scores[col] = roc_auc

# Print the ROC AUC score of every class
for col, roc_auc in ROC_AUC_Scores.items():
    print(f"ROC AUC for class: '{col}': {roc_auc}")


In [None]:
# One column of positive-class probabilities per label, one row per test comment.
preds = np.zeros((len(df_test), len(classes)))

for i, col in enumerate(classes):
    print(col)
    fitted, log_ratio = model[col]
    # Scale the test features by the same log-ratio the classifier was fitted with.
    preds[:, i] = fitted.predict_proba(test_term_doc.multiply(log_ratio))[:, 1]


In [None]:
# Build the submission table: the test ids followed by one probability column
# per class, written without the positional index.
submid = pd.DataFrame(data={'id': df_test.index})
class_probabilities = pd.DataFrame(preds, columns=classes)
submission = pd.concat([submid, class_probabilities], axis=1)
submission.to_csv('submission.csv', index=False)


In [None]:
# Preview the first rows of the submission table.
submission.head()
