In [38]:
#import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

In [39]:
train_df = pd.read_csv('big_train.csv')
test_df = pd.read_csv('test_cleaned.csv')

In [None]:
train_df.sample(5)

In [40]:
cols_target = ['obscene','insult','toxic','severe_toxic','identity_hate','threat']

In [None]:
# check missing values in numeric columns
train_df.describe()

In [None]:
test_df.head()

In [41]:
# Let's look at the character length for the rows in the training data and record these
#train_df['char_length'] = train_df['comment_text'].apply(lambda x: len(str(x)))
# look at the histogram plot for text length
#sns.set()
#data = train_df[cols_target]
#test_df['char_length'] = test_df['comment_text'].apply(lambda x: len(str(x)))

In [50]:
# clean the comment_text in train_df
cleaned_train_comment = []
for i in range(0,len(train_df)):
   # cleaned_comment = clean_text(train_df['comment_text'][i])
    cleaned_train_comment.append(train_df['comment_text'][i])
train_df['comment_text'] = pd.Series(cleaned_train_comment).astype(str)

In [43]:
# clean the comment_text in test_df
cleaned_test_comment = []
for i in range(0,len(test_df)):
   # cleaned_comment = clean_text(test_df['comment_text'][i])
    cleaned_test_comment.append(train_df['comment_text'][i])
test_df['comment_text'] = pd.Series(cleaned_test_comment).astype(str)

In [44]:
#train_df = train_df.drop('char_length',axis=1)

In [45]:
X = train_df.comment_text
test_X = test_df.comment_text
print(X.shape, test_X.shape)

(292050,) (153164,)


In [46]:
# import and instantiate TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(max_features=20000,stop_words='english')
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [47]:
# learn the vocabulary in the training data, then use it to create a document-term matrix
X_dtm = vect.fit_transform(X)
# examine the document-term matrix created from X_train
X_dtm

<292050x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 4724591 stored elements in Compressed Sparse Row format>

In [48]:
# transform the test data using the earlier fitted vocabulary, into a document-term matrix
test_X_dtm = vect.transform(test_X)
# examine the document-term matrix from X_test
test_X_dtm

<153164x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 2476128 stored elements in Compressed Sparse Row format>

In [49]:
# import and instantiate the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
logreg = LogisticRegression(C=1.0)

# create submission file
submission_binary = pd.read_csv('sample_submission.csv')

for label in cols_target:
    print('... Processing {}'.format(label))
    y = train_df[label]
    # train the model using X_dtm & y
    logreg.fit(X_dtm, y)
    # compute the training accuracy
    y_pred_X = logreg.predict(X_dtm)
    print('Training accuracy is {}'.format(accuracy_score(y, y_pred_X)))
    # compute the predicted probabilities for X_test_dtm
    test_y_prob = logreg.predict_proba(test_X_dtm)[:,1]
    submission_binary[label] = test_y_prob

... Processing obscene
Training accuracy is 0.9340318438623524
... Processing insult
Training accuracy is 0.8995103578154425
... Processing toxic
Training accuracy is 0.9136414997431946
... Processing severe_toxic
Training accuracy is 0.9618763910289334
... Processing identity_hate
Training accuracy is 0.9729429892141757
... Processing threat
Training accuracy is 0.991241225817497


In [33]:
submission_binary.head()
# generate submission file
submission_binary.to_csv('submission_binary.csv',index=False)

In [34]:
# create submission file
submission_chains = pd.read_csv('sample_submission.csv')
# create a function to add features
def add_feature(X, feature_to_add):
    '''
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    '''
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [35]:
for label in cols_target:
    print('... Processing {}'.format(label))
    y = train_df[label]
    # train the model using X_dtm & y
    logreg.fit(X_dtm,y)
    # compute the training accuracy
    y_pred_X = logreg.predict(X_dtm)
    print('Training Accuracy is {}'.format(accuracy_score(y,y_pred_X)))
    # make predictions from test_X
    test_y = logreg.predict(test_X_dtm)
    test_y_prob = logreg.predict_proba(test_X_dtm)[:,1]
    submission_chains[label] = test_y_prob
    # chain current label to X_dtm
    X_dtm = add_feature(X_dtm, y)
    print('Shape of X_dtm is now {}'.format(X_dtm.shape))
    # chain current label predictions to test_X_dtm
    test_X_dtm = add_feature(test_X_dtm, test_y)
    print('Shape of test_X_dtm is now {}'.format(test_X_dtm.shape))

... Processing obscene
Training Accuracy is 0.9429857901044342
Shape of X_dtm is now (292050, 5001)
Shape of test_X_dtm is now (153164, 5001)
... Processing insult
Training Accuracy is 0.9288751926040062
Shape of X_dtm is now (292050, 5002)
Shape of test_X_dtm is now (153164, 5002)
... Processing toxic
Training Accuracy is 0.9192021914055812
Shape of X_dtm is now (292050, 5003)
Shape of test_X_dtm is now (153164, 5003)
... Processing severe_toxic
Training Accuracy is 0.9764252696456086
Shape of X_dtm is now (292050, 5004)
Shape of test_X_dtm is now (153164, 5004)
... Processing identity_hate
Training Accuracy is 0.9897003937681904
Shape of X_dtm is now (292050, 5005)
Shape of test_X_dtm is now (153164, 5005)
... Processing threat
Training Accuracy is 0.9977880499914398
Shape of X_dtm is now (292050, 5006)
Shape of test_X_dtm is now (153164, 5006)


In [68]:
# generate submission file
submission_chains.to_csv('submission_chains.csv', index=False)

In [69]:
# create submission file
submission_combined = pd.read_csv('sample_submission.csv')

In [70]:
# corr_targets = ['obscene','insult','toxic']
for label in cols_target:
    submission_combined[label] = 0.5*(submission_chains[label]+submission_binary[label])

In [71]:
submission_combined.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999348,0.174231,0.997642,0.05852,0.775079,0.585482
1,0000247867823ef7,0.007267,0.000725,0.002726,0.000912,0.012521,0.002102
2,00013b17ad220c46,0.056599,0.00024,0.011013,0.000134,0.039543,0.006745
3,00017563c3f7919a,0.009609,0.012903,0.01713,0.000644,0.01527,0.000121
4,00017695ad8997eb,0.245206,0.001235,0.010774,0.001696,0.015898,0.000716


In [72]:
# generate submission file
submission_combined.to_csv('submission_combined.csv', index=False)