In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

train_df['comment_text'] = train_df['comment_text'].str.replace('[^a-zA-Z0-9]', ' ')
test_df['comment_text'] =test_df['comment_text'].str.replace('[^a-zA-Z0-9]', ' ')

## Number of words in the comment_text ##
train_df["num_words"] = train_df["comment_text"].apply(lambda x: len(str(x).split()))
test_df["num_words"] = test_df["comment_text"].apply(lambda x: len(str(x).split()))

## Number of unique words in the comment_text ##
train_df["num_unique_words"] = train_df["comment_text"].apply(lambda x: len(set(str(x).split())))
test_df["num_unique_words"] = test_df["comment_text"].apply(lambda x: len(set(str(x).split())))

## Number of characters in the comment_text ##
train_df["num_chars"] = train_df["comment_text"].apply(lambda x: len(str(x)))
test_df["num_chars"] = test_df["comment_text"].apply(lambda x: len(str(x)))

## asterix_freq
train_df["asterix_freq"] = train_df["comment_text"].apply(lambda x: x.count('!')/len(x))
test_df["asterix_freq"] = test_df["comment_text"].apply(lambda x: x.count('!')/len(x))

## Number of stopwords in the comment_text ##
eng_stopwords = [
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"]
train_df["num_stopwords"] = train_df["comment_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
test_df["num_stopwords"] = test_df["comment_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))

## Number of punctuations in the comment_text ##
import string
train_df["num_punctuations"] =train_df['comment_text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
test_df["num_punctuations"] =test_df['comment_text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

## Number of title case words in the comment_text ##
train_df["num_words_upper"] = train_df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test_df["num_words_upper"] = test_df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Number of title case words in the comment_text ##
train_df["num_words_title"] = train_df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test_df["num_words_title"] = test_df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

## Average length of the words in the comment_text ##
train_df["mean_word_len"] = train_df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df["mean_word_len"] = test_df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

## Average length of the words in the comment_text ##
train_df["mean_word_len"] = train_df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df["mean_word_len"] = test_df["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

print('done')

  out=out, **kwargs)


done


In [4]:
# https://www.kaggle.com/emotionevil/general-oof-class-for-stacking
# Thanks olivier for his text clean  
# https://www.kaggle.com/ogrellier/lgbm-with-words-and-chars-n-gram?scriptVersionId=2694282
import string
import re

# Contraction replacement patterns
cont_patterns = [
    (b'(W|w)on\'t', b'will not'),
    (b'(C|c)an\'t', b'can not'),
    (b'(I|i)\'m', b'i am'),
    (b'(A|a)in\'t', b'is not'),
    (b'(\w+)\'ll', b'\g<1> will'),
    (b'(\w+)n\'t', b'\g<1> not'),
    (b'(\w+)\'ve', b'\g<1> have'),
    (b'(\w+)\'s', b'\g<1> is'),
    (b'(\w+)\'re', b'\g<1> are'),
    (b'(\w+)\'d', b'\g<1> would'),
    (b'&lt;3', b' heart '),
    (b':d', b' smile '),
    (b':dd', b' smile '),
    (b':p', b' smile '),
    (b'8\)', b' smile '),
    (b':-\)', b' smile '),
    (b':\)', b' smile '),
    (b';\)', b' smile '),
    (b'\(-:', b' smile '),
    (b'\(:', b' smile '),
    (b'yay!', b' good '),
    (b'yay', b' good '),
    (b'yaay', b' good '),
    (b':/', b' worry '),
    (b':&gt;', b' angry '),
    (b":'\)", b' sad '),
    (b':-\(', b' sad '),
    (b':\(', b' sad '),
    (b':s', b' sad '),
    (b':-s', b' sad '),
    (b'\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}', b' '),
    (b'(\[[\s\S]*\])', b' '),
    (b'[\s]*?(www.[\S]*)', b' ')
]
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]



def prepare_for_char_n_gram(text):
    """ Simple text clean up process"""
    # 1. Go to lower case (only good for english)
    # Go to bytes_strings as I had issues removing all \n in r""
    clean = bytes(text.lower(), encoding="utf-8")
    
    # replace words like hhhhhhhhhhhhhhi with hi
    for ch in string.ascii_lowercase:
        pattern = bytes(ch+'{3,}', encoding="utf-8")
        clean = re.sub(pattern, bytes(ch, encoding="utf-8"), clean)
    # 2. Drop \n and  \t
    clean = clean.replace(b"\n", b" ")
    clean = clean.replace(b"\t", b" ")
    clean = clean.replace(b"\b", b" ")
    clean = clean.replace(b"\r", b" ")
    # 3. Replace english contractions
    for (pattern, repl) in patterns:
        clean = re.sub(pattern, repl, clean)
    # 4. Drop puntuation
    # I could have used regex package with regex.sub(b"\p{P}", " ")
    exclude = re.compile(b'[%s]' % re.escape(bytes(string.punctuation, encoding='utf-8')))
    clean = b" ".join([exclude.sub(b'', token) for token in clean.split()])
    # 5. Drop numbers - as a scientist I don't think numbers are toxic ;-)
    clean = re.sub(b"\d+", b" ", clean)
    # 6. Remove extra spaces - At the end of previous operations we multiplied space accurences
    clean = re.sub(b'\s+', b' ', clean)
    # Remove ending space if any
    clean = re.sub(b'\s+$', b'', clean)
    # 7. Now replace words by words surrounded by # signs
    # e.g. my name is bond would become #my# #name# #is# #bond#
    # clean = re.sub(b"([a-z]+)", b"#\g<1>#", clean)
    clean = re.sub(b" ", b"# #", clean)  # Replace space
    clean = b"#" + clean + b"#"  # add leading and trailing #

    return str(clean, 'utf-8')


def count_regexp_occ(regexp="", text=None):
    """ Simple way to get the number of occurence of a regex"""
    if len(text) == 0:
        return 0
    else:
        return len(re.findall(regexp, text)) / len(text)


def get_indicators_and_clean_comments(df):
    """
    Check all sorts of content as it may help find toxic comment
    Though I'm not sure all of them improve scores
    """
    # Count number of \n
    df["ant_slash_n"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\n", x))
    # Get length in words and characters
    # df["raw_word_len"] = df["comment_text"].apply(lambda x: len(x.split()))
    # df["raw_char_len"] = df["comment_text"].apply(lambda x: len(x))
    # Check number of upper case, if you're angry you may write in upper case
    df["nb_upper"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[A-Z]", x))
    # Number of F words - f..k contains folk, fork,
    df["nb_fk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[Ff]\S{2}[Kk]", x))
    # Number of S word
    df["nb_sk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[Ss]\S{2}[Kk]", x))
    # Number of D words
    df["nb_dk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[dD]ick", x))
    # Number of occurence of You, insulting someone usually needs someone called : you
    df["nb_you"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\W[Yy]ou\W", x))
    # Just to check you really refered to my mother ;-)
    df["nb_mother"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\Wmother\W", x))
    # Just checking for toxic 19th century vocabulary
    df["nb_ng"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\Wnigger\W", x))
    # Some Sentences start with a <:> so it may help
    df["start_with_columns"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"^\:+", x))
    # Check for time stamp
    df["has_timestamp"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\d{2}|:\d{2}", x))
    # Check for dates 18:44, 8 December 2010
    df["has_date_long"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}", x))
    # Check for date short 8 December 2010
    df["has_date_short"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\D\d{1,2} \w+ \d{4}", x))
    # Check for http links
    df["has_http"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"http[s]{0,1}://\S+", x))
    # check for mail
    df["has_mail"] = df["comment_text"].apply(
        lambda x: count_regexp_occ(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', x)
    )
    # Looking for words surrounded by == word == or """" word """"
    df["has_emphasize_equal"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\={2}.+\={2}", x))
    df["has_emphasize_quotes"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\"{4}\S+\"{4}", x))

    # Now clean comments
    df["clean_comment"] = df["comment_text"].apply(lambda x: prepare_for_char_n_gram(x))

    # Get the new length in words and characters
    df["clean_word_len"] = df["clean_comment"].apply(lambda x: len(x.split()))
    df["clean_char_len"] = df["clean_comment"].apply(lambda x: len(x))
    # Number of different characters used in a comment
    # Using the f word only will reduce the number of letters required in the comment
    df["clean_chars"] = df["clean_comment"].apply(lambda x: len(set(x)))
    df["clean_chars_ratio"] = df["clean_comment"].apply(lambda x: len(set(x))) / df["clean_comment"].apply(
        lambda x: 1 + min(99, len(x)))

# add features
get_indicators_and_clean_comments(train_df)
print('train feat done')
get_indicators_and_clean_comments(test_df)
print('test feat done')

train feat done
test feat done


In [5]:
# add features
def add_feat(df):
    df['unique_r'] = df['num_unique_words'] / df['num_words']
    df['w_p'] = df['num_words'] - df['num_punctuations']
    df['w_p_r'] = df['w_p'] / df['num_words']
    df['stop_r'] = df['num_stopwords'] / df['num_words']
    df['w_p_stop'] = df['w_p'] - df['num_stopwords']
    df['w_p_stop_r'] = df['w_p_stop'] / df['num_words']
    df['num_words_upper_r'] = df['num_words_upper'] / df['num_words']
    df['num_words_title_r'] = df['num_words_title'] / df['num_words']

add_feat(train_df)
add_feat(test_df)
print(train_df.columns)

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate', 'num_words', 'num_unique_words', 'num_chars',
       'asterix_freq', 'num_stopwords', 'num_punctuations', 'num_words_upper',
       'num_words_title', 'mean_word_len', 'ant_slash_n', 'nb_upper', 'nb_fk',
       'nb_sk', 'nb_dk', 'nb_you', 'nb_mother', 'nb_ng', 'start_with_columns',
       'has_timestamp', 'has_date_long', 'has_date_short', 'has_http',
       'has_mail', 'has_emphasize_equal', 'has_emphasize_quotes',
       'clean_comment', 'clean_word_len', 'clean_char_len', 'clean_chars',
       'clean_chars_ratio', 'unique_r', 'w_p', 'w_p_r', 'stop_r', 'w_p_stop',
       'w_p_stop_r', 'num_words_upper_r', 'num_words_title_r'],
      dtype='object')


In [6]:
train_df=train_df.drop(['id','comment_text','toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate','clean_comment'],axis=1)
print(train_df.head(5))

   num_words  num_unique_words  num_chars  asterix_freq  num_stopwords  \
0         50                46        264           0.0             20   
1         20                20        112           0.0              4   
2         44                41        233           0.0             21   
3        116                84        622           0.0             70   
4         14                14         67           0.0              8   

   num_punctuations  num_words_upper  num_words_title  mean_word_len  \
0                 0                3               12       4.240000   
1                 0                3                5       4.150000   
2                 0                1                4       4.227273   
3                 0                5               11       4.189655   
4                 0                0                2       3.571429   

   ant_slash_n        ...          clean_chars  clean_chars_ratio  unique_r  \
0          0.0        ...                  

In [7]:
test_df = test_df.drop(['id','comment_text','clean_comment'],axis=1)
print(test_df.head(5))

   num_words  num_unique_words  num_chars  asterix_freq  num_stopwords  \
0         75                63        367           0.0             34   
1         10                 9         50           0.0              6   
2          5                 5         54           0.0              1   
3         39                29        205           0.0             23   
4          8                 8         41           0.0              3   

   num_punctuations  num_words_upper  num_words_title  mean_word_len  \
0                 0                0                4       3.786667   
1                 0                1                2       3.000000   
2                 0                0                4       5.200000   
3                 0                3                4       4.153846   
4                 0                1                1       4.125000   

   ant_slash_n        ...          clean_chars  clean_chars_ratio  unique_r  \
0          0.0        ...                  

In [8]:
import pickle
with open('../features/other_feat.pkl','wb') as fout:
    pickle.dump([train_df.values,test_df.values],fout)
print('done')

done
