In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
import re
import nltk
from tqdm.notebook import tqdm
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.regexp import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression


In [None]:
# English stop-word list from NLTK. It stays a plain list because the same object
# is also passed to TfidfVectorizer(stop_words=...) further down the notebook.
try:
    stop_words = stopwords.words('english')
except LookupError:
    # NLTK raises LookupError when the 'stopwords' corpus is not installed locally.
    # Fetch it once and retry so the notebook also runs on a fresh environment.
    nltk.download('stopwords')
    stop_words = stopwords.words('english')


In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)


In [None]:
# Load the labelled training set of the Jigsaw toxic-comment challenge.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip',
)


In [None]:
# Keep only letters and digits: lowercase each comment, collapse every run of
# characters outside [a-z0-9] into a single space, and trim the ends.
df['clean_text'] = df['comment_text'].map(
    lambda comment: re.sub(r'[^a-z0-9]+', ' ', comment.lower()).strip()
)


In [None]:
stemmer = SnowballStemmer("english")
# Tokens are runs of at least two word characters, so single-character tokens are dropped.
tokenizer = RegexpTokenizer(r'\w{2,}')
# Hashed copy of the stop-word list: membership is tested once per token, and a
# set lookup avoids scanning the whole list every time.
_stop_word_set = frozenset(stop_words)


def preprocessing(text):
    """Tokenize ``text``, drop stop words, stem the remaining tokens and re-join them.

    Args:
        text: The string to process. The tokens are compared with the stop-word
            list as-is, so the caller is expected to have normalized the text.

    Returns:
        A single string with the stemmed, non-stop-word tokens separated by one
        space each; an empty string when no token survives.
    """
    stems = [
        stemmer.stem(token)
        for token in tokenizer.tokenize(text)
        if token not in _stop_word_set
    ]
    return ' '.join(stems)


In [None]:
# Run the tokenize / stop-word / stemming step over every cleaned training comment,
# with a progress bar.
corpus = list(map(preprocessing, tqdm(df['clean_text'])))


In [None]:
# Store the preprocessed texts as a new column; `corpus` is a list with one entry per row of df.
df['corpus'] = corpus


In [None]:
# Eyeball a few random rows to sanity-check the cleaned and stemmed text columns.
df.sample(n=5)


In [None]:
# features = corpus
# target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult']


In [None]:
# for col in target_cols:
#     df.loc[df[col] == 1, 'target'] = col

# df['target'].fillna(0, inplace=True)


In [None]:
# Three-way split of the preprocessed texts: 60% train, 20% validation, 20% test.
# First hold out 40% of the rows, then cut that hold-out in half.
# Only the texts are split; labels are looked up later through the preserved index.
X_train, X_holdout = train_test_split(
    df['corpus'], test_size=0.4, random_state=2024
)

X_valid, X_test = train_test_split(
    X_holdout, test_size=0.5, random_state=2024
)

# Each split is printed under its own label (the validation split used to be
# reported as 'train size').
print('train size', X_train.shape)
print('valid size', X_valid.shape)
print('test size', X_test.shape)


In [None]:
# Fit the TF-IDF vectorizer on the training texts only, then transform the
# validation and test texts with that same fitted vectorizer.
vec = TfidfVectorizer(stop_words=stop_words)
features_train = vec.fit_transform(X_train)
features_valid, features_test = (
    vec.transform(texts) for texts in (X_valid, X_test)
)


In [None]:
# Label columns to predict; one binary classifier is trained per column.
target_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]


In [None]:
# For every label, compare its class proportions in the full data with those in
# the train and test splits, to check that the random split kept the balance.
for col in target_cols:
    label_groups = (
        df[col],
        df.loc[X_train.index, col],
        df.loc[X_test.index, col],
    )
    proportions = [labels.value_counts(normalize=True) for labels in label_groups]

    check_proportion_df = pd.concat(proportions, axis=1)
    check_proportion_df.columns = ['full', 'train', 'test']

    print(col)
    display(check_proportion_df)
    print('\n'*2)


In [None]:
# Train one class-weighted logistic regression per label on the training features.
# Results are kept as (label name, fitted classifier) pairs.
models_list = []
for col in tqdm(target_cols):
    y_train = df.loc[X_train.index, col]
    clf = LogisticRegression(class_weight='balanced', n_jobs=-1).fit(features_train, y_train)
    models_list.append((col, clf))


In [None]:
# Print the F1 score of each per-label classifier on the train, test and validation splits.
for col, clf in models_list:
    print(col)

    train_f1 = f1_score(df.loc[X_train.index, col], clf.predict(features_train))
    print(f'train f1: {train_f1}')

    test_f1 = f1_score(df.loc[X_test.index, col], clf.predict(features_test))
    print(f'test f1: {test_f1}')

    valid_f1 = f1_score(df.loc[X_valid.index, col], clf.predict(features_valid))
    print(f'valid f1: {valid_f1}' + '\n'*2)


In [None]:
# Final models: refit the vectorizer on all labelled texts and train one classifier
# per label on the full data. This refits `vec` in place and replaces `models_list`.
full_features = vec.fit_transform(df['corpus'])

models_list = []
for col in tqdm(target_cols):
    clf = LogisticRegression(class_weight='balanced', n_jobs=-1).fit(full_features, df[col])
    models_list.append((col, clf))


In [None]:
# Load the unlabelled competition test set that the submission is built from.
test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip',
)


In [None]:
# Keep only letters and digits: lowercase each comment, collapse every run of
# characters outside [a-z0-9] into a single space, and trim the ends.
test['clean_text'] = test['comment_text'].map(
    lambda comment: re.sub(r'[^a-z0-9]+', ' ', comment.lower()).strip()
)


In [None]:
# Apply the tokenize / stop-word / stemming step to every cleaned test comment.
test['corpus'] = list(map(preprocessing, tqdm(test['clean_text'])))


In [None]:
# Vectorize the preprocessed test texts with the already-fitted TF-IDF vectorizer.
# NOTE(review): this rebinds `corpus`, which earlier held the list of preprocessed
# training texts, to the transformed test features.
corpus = vec.transform(test['corpus'])


In [None]:
# Fill one column per label with the predicted probability taken from column 1 of
# predict_proba (the second entry of clf.classes_).
for col, clf in models_list:
    class_probabilities = clf.predict_proba(corpus)
    test[col] = class_probabilities[:, 1]


In [None]:
# Inspect a few random test rows together with their predicted probabilities.
test.sample(n=10)


In [None]:
# Write the submission file: the comment id followed by one probability column per label.
test.to_csv(
    'submission_combined.csv',
    columns=['id'] + target_cols,
    index=False,
)
