In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Version 1.55: Bag of Words
#Try keeping all 5000 features again to see how the score compares
#So far in order of decreasing score we have 5000 > 4000 > 3000 > 1000 > 2000 features


In [3]:
## Import

#Basic
import numpy as np
import pandas as pd

#Data Preprocessing
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import re
import nltk
import nltk.data
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

#Models
from sklearn.ensemble import RandomForestClassifier
import catboost
from catboost import CatBoost

#Tuning
from sklearn.metrics import hamming_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

#Feature Selection
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

#Settings
# Lift pandas' row-display cap so printed frames/series are never truncated.
pd.options.display.max_rows = None


In [4]:
## Upload Data
# Single place to change when the notebook runs outside Kaggle: both zipped
# CSVs are read from this one directory.
DATA_DIR = Path('/kaggle/input/jigsaw-toxic-comment-classification-challenge')

# read_csv accepts path objects and infers zip compression from the file name.
train = pd.read_csv(DATA_DIR / 'train.csv.zip')
test = pd.read_csv(DATA_DIR / 'test.csv.zip')


In [5]:
# Peek at the first five training rows.
train.head(n=5)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Show the raw text of the comment stored under index label 0.
train.loc[0, 'comment_text']


"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [7]:
## Data Preprocess Pipeline Function
# Matches any character that is not an ASCII letter (digits, punctuation and
# all whitespace, newlines included). Compiled once and reused for every comment.
_NON_LETTER_PATTERN = re.compile('[^a-zA-Z]')

# Filled on first use so the stop-word list is built once instead of once per comment.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, building it on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(comment):
    """Turn one raw comment into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every non-letter character with a space,
    lowercase, split on whitespace, drop English stop words, and join the
    remaining words back together with single spaces.

    Parameters
    ----------
    comment : str
        Raw comment text, possibly containing HTML markup.

    Returns
    -------
    str
        The remaining lowercase words joined by single spaces; an empty string
        when nothing survives the filtering.
    """
    # 1. Remove HTML tags. The parser is named explicitly so the extracted text
    #    does not depend on which optional parsers happen to be installed and
    #    no "no parser specified" warning is raised.
    text = BeautifulSoup(comment, 'html.parser').get_text()

    # 2. Replace everything that is not a letter with a space. Newlines are
    #    covered by the same pattern, so no separate newline pass is needed.
    text = _NON_LETTER_PATTERN.sub(' ', text)

    # 3-4. Lowercase, then split into a list of words.
    words = text.lower().split()

    # 5. Remove stop words such as 'a' and 'the'.
    stops = _english_stopwords()

    # 6. Re-assemble the surviving words into one string.
    return ' '.join(word for word in words if word not in stops)

# Sanity check: run the pipeline on the comment stored under index label 0.
preprocess_text(train.loc[0, 'comment_text'])


'explanation edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired'

In [8]:
## Preprocess Train and Test Dataset
def clean_comments(comments, report_every=10000):
    """Run preprocess_text over every comment and return the results as a list.

    Parameters
    ----------
    comments : sized iterable of str
        Raw comments, e.g. a DataFrame column. Values are iterated in order,
        so the result does not depend on the index labels of the input.
    report_every : int, default 10000
        A progress line is printed each time this many comments have been
        started (including the very first one).

    Returns
    -------
    list of str
        One cleaned string per input comment, in the same order.
    """
    total = len(comments)
    cleaned = []
    for position, comment in enumerate(comments):
        if position % report_every == 0:
            print('Review %d of %d processed' % (position, total))
        cleaned.append(preprocess_text(comment))
    return cleaned


cleaned_train_reviews = clean_comments(train['comment_text'])
cleaned_test_reviews = clean_comments(test['comment_text'])


Review 0 of 159571 processed
Review 10000 of 159571 processed
Review 20000 of 159571 processed
Review 30000 of 159571 processed
Review 40000 of 159571 processed
Review 50000 of 159571 processed
Review 60000 of 159571 processed



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  processed_comment = BeautifulSoup(comment)


Review 70000 of 159571 processed
Review 80000 of 159571 processed
Review 90000 of 159571 processed
Review 100000 of 159571 processed
Review 110000 of 159571 processed
Review 120000 of 159571 processed
Review 130000 of 159571 processed
Review 140000 of 159571 processed
Review 150000 of 159571 processed
Review 0 of 153164 processed
Review 10000 of 153164 processed
Review 20000 of 153164 processed
Review 30000 of 153164 processed
Review 40000 of 153164 processed
Review 50000 of 153164 processed
Review 60000 of 153164 processed
Review 70000 of 153164 processed
Review 80000 of 153164 processed
Review 90000 of 153164 processed
Review 100000 of 153164 processed
Review 110000 of 153164 processed
Review 120000 of 153164 processed
Review 130000 of 153164 processed
Review 140000 of 153164 processed
Review 150000 of 153164 processed


In [9]:
# Print the first five cleaned training comments, one per line, each prefixed with "- ".
print('\n'.join('- ' + cleaned_train_reviews[position] for position in range(5)))


- explanation edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired
- aww matches background colour seemingly stuck thanks talk january utc
- hey man really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info
- make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later one else first preferences formatting style references want please let know appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipedia good article nominations transport
- sir hero chance remember page


In [10]:
## Create Bag of Words Counter
vectorizer = CountVectorizer(analyzer = 'word',
                            tokenizer = None,
                            preprocessor = None,
                            stop_words = None,
#                             ngram_range = (1,2),
                            max_features = 5000)

# Learn the vocabulary from the training comments only, and do it once:
# `hello` is the sparse count matrix, `train_data_features` its dense copy.
hello = vectorizer.fit_transform(cleaned_train_reviews)
train_data_features = hello.toarray()

# The test comments must be encoded with the vocabulary learned above.
# Calling fit_transform here would relearn the vocabulary from the test set,
# so column k would stand for a different word than in the training matrix
# and the model's predictions on it would be meaningless.
test_data_features = vectorizer.transform(cleaned_test_reviews).toarray()

# Column indices of the words present in the first training comment.
print(hello[0].indices)


[1616 1430 2663 4714 2021 1675 3800 1881 4805 2945 4986 1639 3300 3703
 4438 4401 3147 4075 3786]


In [11]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Fetch the vocabulary once; calling get_feature_names_out() inside the loop
# would rebuild the full array of feature names on every iteration.
feature_names = vectorizer.get_feature_names_out()

# Show the first 20 vocabulary entries, one per line.
for position in range(20):
    print(feature_names[position])


aa
ability
able
abortion
absence
absolute
absolutely
abstract
absurd
abuse
abused
abusing
abusive
ac
academic
academy
accept
acceptable
accepted
accepting


In [12]:
# vocabulary = vectorizer.get_feature_names_out()
# first_word = vocabulary[0]

# print(f'The word in the first column is: {first_word}')
# print(vectorizer.vocabulary_)


In [13]:
# Label matrix: every training column except the identifier and the raw text,
# cast to int. (The cross-validation split below is currently disabled.)
drop_columns = ['id', 'comment_text']
y = train.drop(columns=drop_columns).astype(int)

# X_cv_train, X_cv_test, y_cv_train, y_cv_test = train_test_split(train_data_features, y, test_size=0.2, random_state=0)


In [14]:
#Label Count
# label_count = np.sum(y, axis=0)
# print(label_count)


In [15]:
# CV Modeling

# df_X_cv_train = (pd.DataFrame(X_cv_train)).astype(int)
# df_X_cv_test = (pd.DataFrame(X_cv_test)).astype(int)

# model = catboost.CatBoostClassifier(loss_function='MultiCrossEntropy',iterations=100, random_seed=0,verbose=False)
# model.fit(df_X_cv_train, y_cv_train)
# y_pred = model.predict(df_X_cv_test)


In [16]:
# CV Modeling Scores

# array_y_cv_test = y_cv_test.to_numpy()

# hamming_score = 1 - hamming_loss(array_y_cv_test, y_pred)
# roc = roc_auc_score(y_cv_test,y_pred,average='weighted',multi_class='ovr')
# f1 = f1_score(y_cv_test,y_pred,average='weighted')

# print('Hammings Score is ' + str(format(hamming_score, '.5f')))
# print('ROC Score is ' + str(format(roc, '.5f')))
# print('F1 Score is ' + str(format(f1, '.5f')))


In [17]:
# # Feature importance modeling with RandomForestClassifier
# model = RandomForestClassifier()
# model.fit(train_data_features, y)
# feature_importances = model.feature_importances_


In [18]:
# # See all index-word-feature importance
# feature_to_index_mapping = vectorizer.vocabulary_ #word is key, index is value
# feature_to_word_mapping = list(vectorizer.get_feature_names_out())
# dict_Index_to_FeatureImport = {}

# for i in range(len(feature_importances)):
#     word = feature_to_word_mapping[i]
#     index = feature_to_index_mapping[word]
#     dict_Index_to_FeatureImport[index] = feature_importances[i]
# #     print(f'Index: {index} | Word: {word} | Importance: {feature_importances[i]}')


In [19]:
# # Select Top 1000 Features
# selected_feature_importances = []

# all_feature_importances = list(feature_importances)
# sorted_all_feature_importances = sorted(all_feature_importances, reverse=True)
# for i in range(4001):
#     selected_feature_importances.append(sorted_all_feature_importances[i])


In [20]:
# ## Create List to Drop the Other 4000 Columns
# removed_features = []
# for index, feature_importance in dict_Index_to_FeatureImport.items():
#     if feature_importance not in selected_feature_importances:
#         removed_features.append(index)
# print(len(removed_features))


In [21]:
# CV Scores Log
    ## Version 1.4
# Hammings Score is 0.97980
# ROC Score is 0.78235
# F1 Score is 0.66949


In [22]:
# See all words
# vocab = vectorizer.vocabulary_.keys()
# print(vocab)

# indexes = vectorizer.vocabulary_.values()
# for word, index in zip(vocab,indexes):
#     if index in yolo:
#         print(word, index)

# See counts of all words
# dist = np.sum(train_data_features,axis=0)
# print(dist)

# for word, count in zip(vocab,dist):
#     print(word,count)


In [23]:
## Modelling
# Wrap the bag-of-words count arrays in integer-typed DataFrames for CatBoost.
df_train_data_features = pd.DataFrame(train_data_features).astype(int)
df_test_data_features = pd.DataFrame(test_data_features).astype(int)

# Drop 4000 features
# df_train_data_features = df_train_data_features.drop(columns=removed_features)

# Classifier settings gathered in one place; the fixed seed keeps runs repeatable.
catboost_params = {
    'loss_function': 'MultiCrossEntropy',
    'iterations': 100,
    'random_seed': 0,
    'verbose': False,
}

# Fit against all label columns of y at once, then score the test comments.
model = catboost.CatBoostClassifier(**catboost_params)
model.fit(df_train_data_features, y)
predictions = model.predict_proba(df_test_data_features)
# print(predictions)


In [24]:
# Submission layout: the test ids followed by one score column per label,
# taken from the prediction columns in this positional order.
label_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

predictions = pd.DataFrame(predictions)

submission_columns = {'id': test['id']}
for position, label in enumerate(label_names):
    submission_columns[label] = predictions[position]

output = pd.DataFrame(data=submission_columns)
output.to_csv('submission.csv', index=False)
