# IMPORTS

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize, WhitespaceTokenizer, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler

In [2]:
# Read both splits keyed by comment id; any missing cell is replaced with a single space.
train, test = (
    pd.read_csv(csv_name, index_col='id').fillna(' ')
    for csv_name in ('toxic_train.csv', 'toxic_test.csv')
)

In [3]:
# Row counts of the training and test splits, one per line.
for split in (train, test):
    print(len(split))

159571
153164


In [4]:
# Preview the first five training rows (comment text plus the label columns).
train.head(n=5)

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Separating text from target

In [4]:
# Raw comment strings from each split (labels are handled separately).
traintext, testtext = (split['comment_text'] for split in (train, test))

In [5]:
# Train and test comments stacked into one Series (train rows first).
# pd.concat is used because Series.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
alltext = pd.concat([traintext, testtext])

In [6]:
# Names of the six label columns, in the order they are modelled later.
target_classes = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Label frame for the training set: everything in `train` except the raw text.
target = train.drop(columns='comment_text')

In [64]:
# Preview the first five rows of the label frame.
target.head(n=5)

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000997932d777bf,0,0,0,0,0,0
000103f0d9cfb60f,0,0,0,0,0,0
000113f07ec002fd,0,0,0,0,0,0
0001b41b1c6bb37e,0,0,0,0,0,0
0001d958c54c6e35,0,0,0,0,0,0


## Creating features

In [1]:
# Word-level TF-IDF: at most 20,000 unigram/bigram features, English stop
# words removed, sublinear (1 + log tf) term-frequency scaling.
wordgrams = TfidfVectorizer(max_features=20000, stop_words='english',
                            ngram_range=(1, 2), sublinear_tf=True)

# Character-level TF-IDF: at most 50,000 character n-grams of length 3 to 6.
# `stop_words` is deliberately not passed here: scikit-learn only applies it
# when analyzer == 'word', so it had no effect on character n-grams.
chargrams = TfidfVectorizer(max_features=50000, analyzer='char',
                            ngram_range=(3, 6), sublinear_tf=True)

NameError: name 'TfidfVectorizer' is not defined

In [None]:
def add_feature(X, feature_to_add):
    """Append dense feature column(s) to the right of a sparse feature matrix.

    Parameters
    ----------
    X : sparse matrix
        Existing feature matrix (e.g. the stacked TF-IDF blocks).
    feature_to_add : array-like
        Dense values with the same number of rows as ``X``; converted to a
        CSR matrix before stacking.

    Returns
    -------
    Sparse matrix in CSR format holding the columns of ``X`` followed by the
    columns of ``feature_to_add``.
    """
    dense_block = csr_matrix(feature_to_add)
    return hstack([X, dense_block], format='csr')

In [None]:
# Wrap each comment Series in a DataFrame so dense feature columns can be added to it.
traindf, testdf = (pd.DataFrame(data=comments) for comments in (traintext, testtext))

In [11]:
# Learn vocabulary and IDF weights on the combined train + test comments.
tfidfword, tfidfchar = (
    vectorizer.fit(alltext) for vectorizer in (wordgrams, chargrams)
)

In [54]:
# Project each split onto the fitted word-level and character-level vocabularies.
train_words, test_words = (
    tfidfword.transform(corpus) for corpus in (traintext, testtext)
)
train_chars, test_chars = (
    tfidfchar.transform(corpus) for corpus in (traintext, testtext)
)

In [67]:
# Place the word-level and character-level TF-IDF blocks side by side
# (word columns first) for each split.
tfidf_blocks_train = [train_words, train_chars]
tfidf_blocks_test = [test_words, test_chars]

train_cols = hstack(tfidf_blocks_train)
test_cols = hstack(tfidf_blocks_test)

### Creating dense features

In [69]:
# Hand-crafted per-comment features for the training set, computed column-wise
# instead of with row-by-row DataFrame.apply(axis=1).
# Sentence-count and TextBlob sentiment features are disabled in this notebook.
train_comments = traindf['comment_text']

# words, unique words, unique ratio (0 when a comment has no tokens)
train_tokens = train_comments.str.split()
traindf['words'] = train_tokens.str.len()
traindf['num_unique_words'] = train_tokens.apply(lambda toks: len(set(toks)))
traindf['unique_ratio'] = (
    traindf['num_unique_words'] / traindf['words']
).where(traindf['words'] != 0, 0)

# total characters, ratio of capital letters (0 for an empty comment)
traindf['total_characters'] = train_comments.str.len()
traindf['num_capitals'] = train_comments.apply(
    lambda text: sum(1 for ch in text if ch.isupper())
)
traindf['caps_ratio'] = (
    traindf['num_capitals'] / traindf['total_characters']
).where(traindf['total_characters'] != 0, 0)

# exclamation point count and its share of all characters (0 for an empty comment)
traindf['num_exclamation'] = train_comments.str.count('!')
traindf['exclamation_ratio'] = (
    traindf['num_exclamation'] / traindf['total_characters']
).where(traindf['total_characters'] != 0, 0)


In [70]:
# Hand-crafted per-comment features for the test set, computed column-wise
# instead of with row-by-row DataFrame.apply(axis=1). The columns mirror the
# ones built for the training set.
# Sentence-count and TextBlob sentiment features are disabled in this notebook.
test_comments = testdf['comment_text']

# words, unique words, unique ratio (0 when a comment has no tokens)
test_tokens = test_comments.str.split()
testdf['words'] = test_tokens.str.len()
testdf['num_unique_words'] = test_tokens.apply(lambda toks: len(set(toks)))
testdf['unique_ratio'] = (
    testdf['num_unique_words'] / testdf['words']
).where(testdf['words'] != 0, 0)

# total characters, ratio of capital letters (0 for an empty comment)
testdf['total_characters'] = test_comments.str.len()
testdf['num_capitals'] = test_comments.apply(
    lambda text: sum(1 for ch in text if ch.isupper())
)
testdf['caps_ratio'] = (
    testdf['num_capitals'] / testdf['total_characters']
).where(testdf['total_characters'] != 0, 0)

# exclamation point count and its share of all characters (0 for an empty comment)
testdf['num_exclamation'] = test_comments.str.count('!')
testdf['exclamation_ratio'] = (
    testdf['num_exclamation'] / testdf['total_characters']
).where(testdf['total_characters'] != 0, 0)



In [71]:
# These features didn't end up helping at all, so the raw text and the raw
# counts are removed from both frames.
# NOTE: this cell reassigns traindf/testdf, so running it a second time raises
# a KeyError because the columns are already gone.
unused_columns = ['comment_text', 'total_characters', 'num_capitals', 'words',
                  'num_unique_words', 'num_exclamation']
traindf = traindf.drop(columns=unused_columns)
testdf = testdf.drop(columns=unused_columns)

In [82]:
# Preview the first five rows of the remaining dense features.
traindf.head(n=5)

Unnamed: 0_level_0,unique_ratio,caps_ratio,exclamation_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0000997932d777bf,0.953488,0.064394,0.0
000103f0d9cfb60f,1.0,0.071429,0.008929
000113f07ec002fd,0.928571,0.017167,0.0
0001b41b1c6bb37e,0.725664,0.017685,0.0
0001d958c54c6e35,1.0,0.029851,0.0


In [81]:
# Scale dense features to work better in the sparse matrix.
# The scaler is fitted on the training frame only, and the same fitted
# mean/variance is then applied to the test frame. Re-fitting on the test frame
# (as `fit_transform(testdf)` did) would standardise the two splits differently,
# so a given raw value would map to different model inputs in train and test.
std = StandardScaler()
X_scale = pd.DataFrame(std.fit_transform(traindf))
# Despite its name, y_scale holds the scaled *test features*, not labels.
y_scale = pd.DataFrame(std.transform(testdf))

In [74]:
# Append the scaled dense columns to the right of the TF-IDF columns.
# NOTE: each matrix is rebuilt from its own previous value, so running this
# cell again appends the dense columns a second time.
train_cols, test_cols = (
    add_feature(train_cols, X_scale),
    add_feature(test_cols, y_scale),
)

In [75]:
# Start the submission frame from the test ids; prediction columns are added per class later.
submission = pd.DataFrame(data=test.index)

In [76]:
# Train one independent binary logistic regression per label: report the mean
# 3-fold ROC AUC on the training matrix, then refit on all training rows and
# store the second predict_proba column for the test rows.
for label in target_classes:
    y_train = target[label]
    model = LogisticRegression()

    fold_aucs = cross_val_score(model, train_cols, y_train, scoring='roc_auc', cv=3)
    print('CV score for class {} is {}'.format(label, np.mean(fold_aucs)))

    model.fit(train_cols, y_train)
    class_probabilities = model.predict_proba(test_cols)
    submission[label] = class_probabilities[:, 1]


CV score for class toxic is 0.9783967817982372
CV score for class severe_toxic is 0.9891921521411245
CV score for class obscene is 0.9894562097238233
CV score for class threat is 0.9892294648709684
CV score for class insult is 0.9822893198906785
CV score for class identity_hate is 0.9821107203908402


In [77]:
# Save the id column and per-class probabilities; the positional row index is not written.
submission.to_csv(path_or_buf='submission3f.csv', index=False)