In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer
import string
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import RandomForestRegressor

In [2]:
df_train = pd.read_csv("../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv")

In [3]:
df_train.shape

(159571, 8)

In [4]:
stop = stopwords.words('english')
punctuation = string.punctuation

In [5]:
def remove_stopwords_sentence(sentence):
    # print(sentence)
    return pd.Series([word for word in sentence[0].split() if word not in stop])


def remove_stopwords_df(df):
    return df.apply(remove_stopwords_sentence, axis=1)


def stem_sentence(s):
    stemmer = EnglishStemmer()
    return pd.Series([stemmer.stem(w) for w in s if not pd.isna(w)]).to_frame().apply(' '.join, axis=0)[0]

In [6]:
prep_pipeline = Pipeline(steps=[
    ('remove \n', FunctionTransformer(pd.DataFrame.replace, kw_args={'to_replace':'\n', 'value':' ', 'regex': True})),
    ('remove numbers', FunctionTransformer(pd.DataFrame.replace, kw_args={'to_replace':'\d', 'value':'', 'regex': True})),
    ('remove html tags', FunctionTransformer(pd.DataFrame.replace,
                                             kw_args={'to_replace': '<.*?>', 'value': '', 'regex': True})),
    ('lower', FunctionTransformer(lambda x: x.squeeze(axis=1).str.lower().to_frame())),
    ('remove punctuation', FunctionTransformer(lambda x: x.squeeze(
        axis=1).str.replace('[{}]'.format(punctuation), '').to_frame())),
    ('remove stopwords', FunctionTransformer(remove_stopwords_df, validate=False)),
    ('stemming', FunctionTransformer(pd.DataFrame.apply, kw_args={
     'func': stem_sentence, 'axis': 1}, validate=False)),
    #('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('vectorizer', TfidfVectorizer(lowercase=False,analyzer='word', preprocessor=None, tokenizer=lambda i:i.split() ))
])

In [7]:
target = df_train.toxic + df_train.obscene + df_train.insult + df_train.threat * 1.5 + df_train.severe_toxic * 2 + df_train.identity_hate*2

In [8]:
df_train['target'] = target

In [9]:
validation_data = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')

In [10]:
comments_to_score = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

In [11]:
# Create the folds and save them
n_folds = 7
n = (df_train['target'] > 0).sum()
for fld in range(n_folds):
    print(f'Fold: {fld}')
    tmp_df = pd.concat([df_train[df_train.target>0].sample(frac=1, random_state = 10*(fld+1)) , 
                        df_train[df_train.target==0].sample(n,random_state = 10*(fld+1))], axis=0)

    tmp_df.to_csv('./df'+str(fld)+'.csv', index=False)
    print(tmp_df.shape)
    print(tmp_df['target'].value_counts())
    

Fold: 0
(32450, 9)
0.0    16225
1.0     6284
3.0     4008
2.0     3208
5.0     1610
4.0      359
7.0      265
4.5      149
6.5      121
2.5      118
8.5       31
3.5       29
1.5       22
6.0       13
5.5        8
Name: target, dtype: int64
Fold: 1
(32450, 9)
0.0    16225
1.0     6284
3.0     4008
2.0     3208
5.0     1610
4.0      359
7.0      265
4.5      149
6.5      121
2.5      118
8.5       31
3.5       29
1.5       22
6.0       13
5.5        8
Name: target, dtype: int64
Fold: 2
(32450, 9)
0.0    16225
1.0     6284
3.0     4008
2.0     3208
5.0     1610
4.0      359
7.0      265
4.5      149
6.5      121
2.5      118
8.5       31
3.5       29
1.5       22
6.0       13
5.5        8
Name: target, dtype: int64
Fold: 3
(32450, 9)
0.0    16225
1.0     6284
3.0     4008
2.0     3208
5.0     1610
4.0      359
7.0      265
4.5      149
6.5      121
2.5      118
8.5       31
3.5       29
1.5       22
6.0       13
5.5        8
Name: target, dtype: int64
Fold: 4
(32450, 9)
0.0    16225
1.0 

In [12]:
# apply a model on each fold
n_folds = 1
n = (df_train['target'] > 0).sum()
val_preds_arr1 = np.zeros((validation_data.shape[0], n_folds))
val_preds_arr2 = np.zeros((validation_data.shape[0], n_folds))
test_preds_arr = np.zeros((comments_to_score.shape[0], n_folds))

for fld in range(n_folds):
    print("\n\n")
    print(f' ****************************** FOLD: {fld} ******************************')
    
    
    print(f'Fold: {fld}')
    #tmp_df = pd.concat([df_train[df_train.target>0].sample(frac=1, random_state = 10*(fld+1)) , 
                       # df_train[df_train.target==0].sample(n,random_state = 10*(fld+1))], axis=0)
    
    df = pd.read_csv('./df'+str(fld)+'.csv')
    
    #df = pd.read_csv('D:/OP/P8/df_fld'+str(fld)+'.csv')
    print(df.shape)

    features = FeatureUnion([
        ("tfidf", TfidfVectorizer(lowercase=False)),
    ])
    pipeline = Pipeline(
        [
            ('prep', prep_pipeline),
            #("features", features),
            ("clf", RandomForestRegressor(n_estimators = 10)),
            #("clf", Ridge()),
            #("clf",LinearRegression())
        ]
    )
    print("\nTrain:")
    # Train the pipeline
    pipeline.fit(df['comment_text'].to_frame(), df['target'])
    
     # What are the important features for toxicity


    feature_wts = sorted(list(zip(pipeline['prep']['vectorizer'].get_feature_names(), 
                                  np.round(pipeline['clf'].feature_importances_,2) )), 
                         key = lambda x:x[1], 
                         reverse=True)

    print(feature_wts[:30])
    
    print("\npredict validation data ")
    val_preds_arr1[:,fld] = pipeline.predict(validation_data['less_toxic'].to_frame())
    val_preds_arr2[:,fld] = pipeline.predict(validation_data['more_toxic'].to_frame())

    print("\npredict test data ")
    test_preds_arr[:,fld] = pipeline.predict(comments_to_score['text'].to_frame())




 ****************************** FOLD: 0 ******************************
Fold: 0
(32450, 9)

Train:


  
  This is separate from the ipykernel package so we can avoid doing imports until
  if sys.path[0] == '':


[('fuck', 0.28), ('suck', 0.04), ('bitch', 0.03), ('faggot', 0.03), ('articl', 0.02), ('ass', 0.02), ('asshol', 0.02), ('gay', 0.02), ('nigger', 0.02), ('shit', 0.02), ('bastard', 0.01), ('cock', 0.01), ('cunt', 0.01), ('dick', 0.01), ('die', 0.01), ('fucker', 0.01), ('go', 0.01), ('idiot', 0.01), ('jew', 0.01), ('kill', 0.01), ('motherfuck', 0.01), ('stupid', 0.01), ('u', 0.01), ("''has''", 0.0), ("'neue", 0.0), ("'strakh'", 0.0), ('\\', 0.0), ('\\\\', 0.0), ('\\\\hore', 0.0), ('\\anamorphism\\', 0.0)]

predict validation data 


  



predict test data 


In [13]:
(val_preds_arr1[:,fld]<val_preds_arr2[:,fld]).sum()/validation_data.shape[0]

0.6638767105088349

In [14]:
test_pred = pipeline.predict(comments_to_score['text'].to_frame())

  


In [15]:
import scipy.stats as stats
test_pred_ranked = stats.rankdata(test_pred, method='ordinal')

In [16]:
submission = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')

In [17]:
submission["score"] = test_pred_ranked

In [18]:
submission

Unnamed: 0,comment_id,score
0,114890,1658
1,732895,1
2,1139051,854
3,1434512,855
4,2084821,5418
...,...,...
7532,504235362,3288
7533,504235566,1954
7534,504308177,1955
7535,504570375,5963


In [19]:
submission.to_csv('./submission.csv', index=False)