In [1]:
# libraries import
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import warnings 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
from nltk.stem.porter import *

In [2]:
# download lemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/himanshus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
pd.set_option("display.max_colwidth", 200) 
warnings.filterwarnings("ignore", category=DeprecationWarning) 

%matplotlib inline

In [4]:
# Load data from csv
train  = pd.read_csv('train_E6oV3lV.csv') 
test = pd.read_csv('test_tweets_anuFYb8.csv')

In [5]:
# inspecting non-sexist/racist tweets
train[train['label'] == 0].head(10)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. #run
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. #disapointed #getthanked
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get there. #allshowandnogo
6,7,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦
7,8,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #hate #imagine #actorslife #revolutionschool #girl
8,9,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦
9,10,0,@user @user welcome here ! i'm it's so #gr8 !


In [6]:
# inspecting sexist/racist tweets
train[train['label'] == 1].head(10)

Unnamed: 0,id,label,tweet
13,14,1,@user #cnn calls #michigan middle school 'build the wall' chant '' #tcot
14,15,1,no comment! in #australia #opkillingbay #seashepherd #helpcovedolphins #thecove #helpcovedolphins
17,18,1,retweet if you agree!
23,24,1,@user @user lumpy says i am a . prove it lumpy.
34,35,1,it's unbelievable that in the 21st century we'd need something like this. again. #neverump #xenophobia
56,57,1,@user lets fight against #love #peace
68,69,1,ð©the white establishment can't have blk folx running around loving themselves and promoting our greatness
77,78,1,"@user hey, white people: you can call people 'white' by @user #race #identity #medâ¦"
82,83,1,how the #altright uses &amp; insecurity to lure men into #whitesupremacy
111,112,1,@user i'm not interested in a #linguistics that doesn't address #race &amp; . racism is about #power. #raciolinguistics bringsâ¦


In [7]:
# Count number of tweets in training and test datasets
print(train.shape)
print(test.shape)

(31962, 3)
(17197, 2)


In [10]:
# Combine test and train dataframe for tweets cleanup
total_tweets = train.append(test, ignore_index=True, sort='False') 
total_tweets.shape

(49159, 3)

In [25]:

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim
# Bag-of-Words Features
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
bow = bow_vectorizer.fit_transform(total_tweets['tidy_tweet'])
bow.shape

(49159, 1000)

In [26]:
# TF IDF Features
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(total_tweets['tidy_tweet'])
tfidf.shape

(49159, 1000)

In [27]:
# word2vec Feature
# Train a Word2Vec model on our corpus
tokenized_tweet = total_tweets['tidy_tweet'].apply(lambda x: x.split()) # tokenizing 
model_w2v = gensim.models.Word2Vec(
            tokenized_tweet,
            size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 2, # no.of cores
            seed = 34) 
model_w2v.train(tokenized_tweet, total_examples= len(total_tweets['tidy_tweet']), epochs=20)

(7942505, 9627200)

In [28]:

# function to create a vector for each tweet by taking the average of the vectors of the words present in the tweet
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec

wordvec_arrays = np.zeros((len(tokenized_tweet), 200)) 
for i in range(len(tokenized_tweet)):
    wordvec_arrays[i,:] = word_vector(tokenized_tweet[i], 200)
    wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape 

(49159, 200)

In [29]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

def cv_score(clf, x, y, score_func=f1_score):
    result = 0
    nfold = 10
    for train, test in KFold(nfold).split(x): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        prediction = clf.predict_proba(x[test])
        prediction_int = prediction[:,1] >= 0.3
        prediction_int = prediction_int.astype(np.int)
        result += score_func(prediction_int, y[test]) # evaluate score function on held-out data
    return result / nfold # average

In [63]:
### Logistic Regression on 
# 1.Bag-of-Words Features

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss, roc_curve, roc_auc_score

# Extracting train and test BoW features
train_bow = bow[:31962,:]
test_bow = bow[31962:,:]
# splitting data into training and validation set
## using k-fold cross validation 
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(train_bow, train['label'], random_state=42, test_size=0.3)
logisticRegression=LogisticRegression()
#training the model
logisticRegression.fit(xtrain_bow, ytrain)
prediction = logisticRegression.predict_proba(xvalid_bow) # predicting on the validation set 

prediction_int = prediction[:,1] >=0.31
prediction_int = prediction_int.astype(np.int)
print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print("Log Loss: {0}".format(log_loss(yvalid, prediction)))
# fpr, tpr, thresholds =roc_curve(yvalid, prediction_int)
# optimal_idx = np.argmax(tpr - fpr)
# optimal_threshold = thresholds[optimal_idx]
print(roc_auc_score(yvalid, prediction_int))
print(cv_score(logisticRegression,train_bow, train['label']))



F1 score: 0.5588723051409618
Log Loss: 0.15494638768813623
0.7359576064435841
0.5690260594628616


In [66]:
# 2.Tf-Idf Features
train_tfidf = tfidf[:31962,:]
test_tfidf = tfidf[31962:,:] 
xtrain_tfidf = train_tfidf[ytrain.index]
xvalid_tfidf = train_tfidf[yvalid.index]
# training the model
logisticRegression.fit(xtrain_tfidf, ytrain) 
prediction = logisticRegression.predict_proba(xvalid_tfidf)
prediction_int = prediction[:,1] >= 0.3
prediction_int = prediction_int.astype(np.int) 
print("F1 score: {0}".format(f1_score(yvalid, prediction_int))) # calculating f1 score for the validation set
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

F1 score: 0.5456110154905336
0.7226852809545856




0.1550543820241176

In [67]:
# 3 Word2Vec Features
# Extracting train and test Word2Vec features
train_w2v = wordvec_df.iloc[:31962,:]
test_w2v = wordvec_df.iloc[31962:,:] 
xtrain_w2v = train_w2v.iloc[ytrain.index,:]
xvalid_w2v = train_w2v.iloc[yvalid.index,:]
# training the model
logisticRegression.fit(xtrain_w2v, ytrain) 
prediction = logisticRegression.predict_proba(xvalid_w2v)
prediction_int = prediction[:,1] >= 0.3
prediction_int = prediction_int.astype(np.int)
print("F1 score: {0}".format(f1_score(yvalid, prediction_int))) # calculating f1 score for the validation set
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)



F1 score: 0.6385542168674698
0.7975889095750793


0.13340676262273918

In [68]:
### Support Vector Machine(SVM) on
from sklearn import svm

# 1.Bag-of-Words Features
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_bow, ytrain)
prediction = svc.predict_proba(xvalid_bow)
prediction_int = prediction[:,1] >= 0.3
prediction_int = prediction_int.astype(np.int)
print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

F1 score: 0.5224416517055656
0.704914694090645


0.16703275889881386

In [69]:
# 2.Tf-Idf Features
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_tfidf, ytrain)
prediction = svc.predict_proba(xvalid_tfidf)
prediction_int = prediction[:,1] >= 0.3
prediction_int = prediction_int.astype(np.int)
print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

F1 score: 0.5319148936170213
0.7112129003024125


0.16567175652622

In [65]:
# 3 Word2Vec Features
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_w2v, ytrain)
prediction = svc.predict_proba(xvalid_w2v)
prediction_int = prediction[:,1] >= 0.3
prediction_int = prediction_int.astype(np.int)
print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

F1 score: 0.6282151208106002
0.7835855899340339


0.1363408981938195

In [78]:
### Random Forest on
from sklearn.ensemble import RandomForestClassifier

# 1.Bag-of-Words Features
randomForestClassifier = RandomForestClassifier(n_estimators=400, random_state=11)
randomForestClassifier.fit(xtrain_bow, ytrain)
prediction = randomForestClassifier.predict(xvalid_bow)
print("F1 score: {0}".format(f1_score(yvalid, prediction)))
print(roc_auc_score(yvalid, prediction))
log_loss(yvalid, prediction)

F1 score: 0.5407523510971787
0.7383243693174543


2.110743609133687

In [71]:
# 2.Tf-Idf Features
randomForestClassifier = RandomForestClassifier(n_estimators=400, random_state=11)
randomForestClassifier.fit(xtrain_tfidf, ytrain)
prediction = randomForestClassifier.predict(xvalid_tfidf)
print("F1 score: {0}".format(f1_score(yvalid, prediction)))
print(roc_auc_score(yvalid, prediction))
log_loss(yvalid, prediction)

F1 score: 0.5895316804407713
0.7299326713752377


1.6100636369495482

In [72]:
# 3 Word2Vec Features
randomForestClassifier = RandomForestClassifier(n_estimators=400, random_state=11)
randomForestClassifier.fit(xtrain_w2v, ytrain)
prediction = randomForestClassifier.predict(xvalid_w2v)
print("F1 score: {0}".format(f1_score(yvalid, prediction)))
print(roc_auc_score(yvalid, prediction))
log_loss(yvalid, prediction)

F1 score: 0.477124183006536
0.6592454958282848


1.7289211245717577

In [82]:
### Extreme Gradient Boosting (xgboost) on
from xgboost import XGBClassifier

# 1.Bag-of-Words Features
xgbModel = XGBClassifier(max_depth=6, n_estimators=1000)
xgbModel.fit(xtrain_bow, ytrain)
prediction = xgbModel.predict_proba(xvalid_bow)
prediction_int = prediction[:,1] >= 0.31
prediction_int = prediction_int.astype(np.int)
print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

F1 score: 0.5763546798029556
0.7463038210349004


0.15139352009404256

In [81]:
# 2.Tf-Idf Features
xgbModel = XGBClassifier(max_depth=6, n_estimators=1000)
xgbModel.fit(xtrain_tfidf, ytrain)
prediction = xgbModel.predict_proba(xvalid_tfidf)
prediction_int = prediction[:,1] >= 0.31
prediction_int = prediction_int.astype(np.int)
print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

F1 score: 0.5804365400161682
0.751534143706637


0.1542417797886292

In [80]:
# 3 Word2Vec Features
xgbModel = XGBClassifier(max_depth=6, n_estimators=1000)
xgbModel.fit(xtrain_w2v, ytrain)
prediction = xgbModel.predict_proba(xvalid_w2v)
prediction_int = prediction[:,1] >= 0.31
prediction_int = prediction_int.astype(np.int)
print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

F1 score: 0.6769759450171822
0.7831829480119914


0.16887477852968383

In [128]:
from catboost import CatBoostClassifier
model=CatBoostClassifier(iterations=15,eval_metric="F1")
model.fit(xtrain_w2v, ytrain,eval_set=(xvalid_w2v,yvalid),plot=False)
prediction = model.predict_proba(xvalid_w2v)
prediction_int = prediction[:,1] >= 0.31
prediction_int = prediction_int.astype(np.int)

print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

Learning rate set to 0.5
0:	learn: 0.0012829	test: 0.0000000	best: 0.0000000 (0)	total: 56.7ms	remaining: 794ms
1:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 107ms	remaining: 696ms
2:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 158ms	remaining: 632ms
3:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 217ms	remaining: 595ms
4:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 274ms	remaining: 548ms
5:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 326ms	remaining: 489ms
6:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 373ms	remaining: 426ms
7:	learn: 0.0025641	test: 0.0000000	best: 0.0000000 (0)	total: 420ms	remaining: 368ms
8:	learn: 0.0025641	test: 0.0000000	best: 0.0000000 (0)	total: 488ms	remaining: 326ms
9:	learn: 0.0051216	test: 0.0000000	best: 0.0000000 (0)	total: 551ms	remaining: 275ms
10:	learn: 0.0076726	test: 0.0000000	best: 0.0000000 (0)	total: 615ms	remaining: 224ms
11:	learn: 0.0102171	test: 

0.33407452676557037

In [None]:
# unique, counts = np.unique(prediction_int, return_counts=True)

# print(dict(zip(unique, counts)))
# unique, counts = np.unique(yvalid, return_counts=True)
# print(dict(zip(unique, counts)))

In [129]:
model=CatBoostClassifier(iterations=10,eval_metric="F1")
model.fit(xtrain_tfidf, ytrain,eval_set=(xvalid_tfidf,yvalid),plot=False)
prediction = model.predict_proba(xvalid_tfidf)
prediction_int = prediction[:,1] >= 0.31
prediction_int = prediction_int.astype(np.int)

print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

Learning rate set to 0.5
0:	learn: 0.0025625	test: 0.0000000	best: 0.0000000 (0)	total: 59.2ms	remaining: 533ms
1:	learn: 0.2222222	test: 0.2154626	best: 0.2154626 (1)	total: 116ms	remaining: 463ms
2:	learn: 0.2442833	test: 0.2323232	best: 0.2323232 (2)	total: 179ms	remaining: 418ms
3:	learn: 0.2734205	test: 0.2607626	best: 0.2607626 (3)	total: 237ms	remaining: 356ms
4:	learn: 0.3532441	test: 0.3240741	best: 0.3240741 (4)	total: 298ms	remaining: 298ms
5:	learn: 0.3891675	test: 0.3548023	best: 0.3548023 (5)	total: 355ms	remaining: 237ms
6:	learn: 0.4086871	test: 0.3701226	best: 0.3701226 (6)	total: 422ms	remaining: 181ms
7:	learn: 0.4019802	test: 0.3741648	best: 0.3741648 (7)	total: 480ms	remaining: 120ms
8:	learn: 0.4375605	test: 0.3817980	best: 0.3817980 (8)	total: 535ms	remaining: 59.5ms
9:	learn: 0.4541109	test: 0.4034896	best: 0.4034896 (9)	total: 596ms	remaining: 0us

bestTest = 0.4034896401
bestIteration = 9

F1 score: 0.4816247582205029
0.6763465724952472


0.1876043921987411

In [130]:
model=CatBoostClassifier(iterations=15,eval_metric="F1")
model.fit(xtrain_bow, ytrain,eval_set=(xvalid_bow,yvalid),plot=False)
prediction = model.predict_proba(xvalid_bow)
prediction_int = prediction[:,1] >= 0.31
prediction_int = prediction_int.astype(np.int)

print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
print(roc_auc_score(yvalid, prediction_int))
log_loss(yvalid, prediction)

Learning rate set to 0.5
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 16.2ms	remaining: 227ms
1:	learn: 0.2149321	test: 0.2089172	best: 0.2089172 (1)	total: 32.9ms	remaining: 214ms
2:	learn: 0.2377232	test: 0.2386935	best: 0.2386935 (2)	total: 49.8ms	remaining: 199ms
3:	learn: 0.3139535	test: 0.3201856	best: 0.3201856 (3)	total: 65.7ms	remaining: 181ms
4:	learn: 0.3491736	test: 0.3189756	best: 0.3201856 (3)	total: 82.2ms	remaining: 164ms
5:	learn: 0.3783510	test: 0.3321879	best: 0.3321879 (5)	total: 97.7ms	remaining: 147ms
6:	learn: 0.4047737	test: 0.3394077	best: 0.3394077 (6)	total: 113ms	remaining: 130ms
7:	learn: 0.4362579	test: 0.3692992	best: 0.3692992 (7)	total: 130ms	remaining: 114ms
8:	learn: 0.4346563	test: 0.3747228	best: 0.3747228 (8)	total: 148ms	remaining: 98.6ms
9:	learn: 0.4521989	test: 0.3824176	best: 0.3824176 (9)	total: 165ms	remaining: 82.6ms
10:	learn: 0.4528122	test: 0.3947655	best: 0.3947655 (10)	total: 185ms	remaining: 67.3ms
11:	learn: 0.46986

0.1812241915983394

In [139]:
### Light Gradient Boosting (lightGBM) on BOW
import lightgbm as lgb

d_train = lgb.Dataset(xtrain_bow.astype('float32'), label=ytrain.astype('float32'))

# hyperparameter for LGBM
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 50
params['min_data'] = 50
params['max_depth'] = 20

clf = lgb.train(params, d_train, 100)

# predictions
prediction = clf.predict(xvalid_bow.astype('float32'))

prediction_int = prediction[:,1] >= 0.31
# prediction_int = prediction_int.astype(np.int)
# print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
# print(roc_auc_score(yvalid, prediction_int))
print(prediction)
log_loss(yvalid, prediction)

IndexError: too many indices for array

In [143]:
### Light Gradient Boosting (lightGBM) on word2vec

d_train = lgb.Dataset(xtrain_w2v.astype('float32'), label=ytrain.astype('float32'))

# hyperparameter for LGBM
params = {}
params['learning_rate'] = 0.003
params['boosting_type'] = 'gbdt'
params['objective'] = 'binary'
params['metric'] = 'binary_logloss'
params['sub_feature'] = 0.5
params['num_leaves'] = 50
params['min_data'] = 50
params['max_depth'] = 20

clf = lgb.train(params, d_train, 100)

# predictions
prediction = clf.predict(xvalid_w2v.astype('float32'))

# prediction_int = prediction[:,1] >= 0.31
# prediction_int = prediction_int.astype(np.int)
# print("F1 score: {0}".format(f1_score(yvalid, prediction_int)))
# print(roc_auc_score(yvalid, prediction_int))
print(prediction)
log_loss(yvalid, prediction)

[0.05438901 0.05436548 0.13720644 ... 0.05573447 0.0743086  0.05705106]


0.2110577510365435

In [176]:
# Tuning XGB with word2vec
import xgboost as xgb
dtrain = xgb.DMatrix(xtrain_w2v, label=ytrain) 
dvalid = xgb.DMatrix(xvalid_w2v, label=yvalid)
dtest = xgb.DMatrix(test_w2v)

# Parameter to tune 
params = {
    'objective':'binary:logistic',
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1
 }

In [149]:
# Custom evaluation metric to calculate F1 score.

def custom_eval(preds, dtrain):
    labels = dtrain.get_label().astype(np.int)
    preds = (preds >= 0.3).astype(np.int)
    return [('f1_score', f1_score(labels, preds))]

In [150]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6,10)
     for min_child_weight in range(5,8)
 ]
max_f1 = 0. # initializing with 0 
best_params = None 
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
     # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Cross-validation
    cv_results = xgb.cv(        params,
        dtrain,        feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )
# Finding best F1 Score
    
mean_f1 = cv_results['test-f1_score-mean'].max()
    
boost_rounds = cv_results['test-f1_score-mean'].argmax()    
print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))    
if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (max_depth,min_child_weight) 

print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

CV with max_depth=6, min_child_weight=5
CV with max_depth=6, min_child_weight=6
CV with max_depth=6, min_child_weight=7
CV with max_depth=7, min_child_weight=5
CV with max_depth=7, min_child_weight=6
CV with max_depth=7, min_child_weight=7
CV with max_depth=8, min_child_weight=5
CV with max_depth=8, min_child_weight=6
CV with max_depth=8, min_child_weight=7
CV with max_depth=9, min_child_weight=5
CV with max_depth=9, min_child_weight=6
CV with max_depth=9, min_child_weight=7
	F1 Score 0.6771971999999999 for 42 rounds
Best params: 9, 7, F1 Score: 0.6771971999999999


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.


In [152]:
# Updating max_depth and min_child_weight parameters.

params['max_depth'] = 9 
params['min_child_weight'] = 7

# Tuning subsample and colsample
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(5,10)]
    for colsample in [i/10. for i in range(5,10)] ]
max_f1 = 0. 
best_params = None 
for subsample, colsample in gridsearch_params:
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
     # Update our parameters
    params['colsample'] = colsample
    params['subsample'] = subsample
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )
     # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (subsample, colsample) 

print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

CV with subsample=0.5, colsample=0.5


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.


	F1 Score 0.6622511999999999 for 65 rounds
CV with subsample=0.5, colsample=0.6
	F1 Score 0.6622511999999999 for 65 rounds
CV with subsample=0.5, colsample=0.7
	F1 Score 0.6622511999999999 for 65 rounds
CV with subsample=0.5, colsample=0.8
	F1 Score 0.6622511999999999 for 65 rounds
CV with subsample=0.5, colsample=0.9
	F1 Score 0.6622511999999999 for 65 rounds
CV with subsample=0.6, colsample=0.5
	F1 Score 0.6690166000000001 for 61 rounds
CV with subsample=0.6, colsample=0.6
	F1 Score 0.6690166000000001 for 61 rounds
CV with subsample=0.6, colsample=0.7
	F1 Score 0.6690166000000001 for 61 rounds
CV with subsample=0.6, colsample=0.8
	F1 Score 0.6690166000000001 for 61 rounds
CV with subsample=0.6, colsample=0.9
	F1 Score 0.6690166000000001 for 61 rounds
CV with subsample=0.7, colsample=0.5
	F1 Score 0.6812194 for 73 rounds
CV with subsample=0.7, colsample=0.6
	F1 Score 0.6812194 for 73 rounds
CV with subsample=0.7, colsample=0.7
	F1 Score 0.6812194 for 73 rounds
CV with subsample=0.7, c

In [154]:
# Updating subsample and colsample_bytree

params['subsample'] = .7 
params['colsample_bytree'] = .5

# tune the learning rate.

max_f1 = 0. 
best_params = None 
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
     # Update ETA
    params['eta'] = eta

     # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=1000,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=20
    )

     # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].argmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = eta 
print("Best params: {}, F1 Score: {}".format(best_params, max_f1))

CV with eta=0.3


The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.


	F1 Score 0.6755394 for 77 rounds
CV with eta=0.2
	F1 Score 0.6821578 for 102 rounds
CV with eta=0.1
	F1 Score 0.6851364 for 117 rounds
CV with eta=0.05
	F1 Score 0.6847057999999999 for 265 rounds
CV with eta=0.01
	F1 Score 0.1302024 for 0 rounds
CV with eta=0.005
	F1 Score 0.1302024 for 0 rounds
Best params: 0.1, F1 Score: 0.6851364


In [184]:
params['eta']=0.1
params['colsample']=0.7
params

{'objective': 'binary:logistic',
 'max_depth': 9,
 'min_child_weight': 7,
 'eta': 0.1,
 'subsample': 0.7,
 'colsample_bytree': 0.5,
 'colsample': 0.7}

In [185]:
## with all the tuned parameters
xgb_model = xgb.train(
    params,
    dtrain,
    feval= custom_eval,
    num_boost_round= 1000,
    maximize=True,
    evals=[(dvalid, "Validation")],
    early_stopping_rounds=100
 )



[0]	Validation-error:0.072375	Validation-f1_score:0.133165
Multiple eval metrics have been passed: 'Validation-f1_score' will be used for early stopping.

Will train until Validation-f1_score hasn't improved in 100 rounds.
[1]	Validation-error:0.061946	Validation-f1_score:0.133165
[2]	Validation-error:0.060486	Validation-f1_score:0.133165
[3]	Validation-error:0.058192	Validation-f1_score:0.133165
[4]	Validation-error:0.057983	Validation-f1_score:0.133242
[5]	Validation-error:0.057775	Validation-f1_score:0.353892
[6]	Validation-error:0.0584	Validation-f1_score:0.439125
[7]	Validation-error:0.057253	Validation-f1_score:0.50879
[8]	Validation-error:0.056106	Validation-f1_score:0.538194
[9]	Validation-error:0.05548	Validation-f1_score:0.550816
[10]	Validation-error:0.056106	Validation-f1_score:0.568151
[11]	Validation-error:0.055689	Validation-f1_score:0.567989
[12]	Validation-error:0.054959	Validation-f1_score:0.587382
[13]	Validation-error:0.054542	Validation-f1_score:0.58963
[14]	Valida

[135]	Validation-error:0.043487	Validation-f1_score:0.668842
[136]	Validation-error:0.043592	Validation-f1_score:0.669935
[137]	Validation-error:0.043592	Validation-f1_score:0.669394
[138]	Validation-error:0.043696	Validation-f1_score:0.669388
[139]	Validation-error:0.043592	Validation-f1_score:0.670473
[140]	Validation-error:0.043383	Validation-f1_score:0.668293
[141]	Validation-error:0.043696	Validation-f1_score:0.668293
[142]	Validation-error:0.043383	Validation-f1_score:0.670455
[143]	Validation-error:0.043174	Validation-f1_score:0.668831
[144]	Validation-error:0.04307	Validation-f1_score:0.668836
[145]	Validation-error:0.043174	Validation-f1_score:0.668836
[146]	Validation-error:0.04307	Validation-f1_score:0.670455
[147]	Validation-error:0.042757	Validation-f1_score:0.671533
[148]	Validation-error:0.042653	Validation-f1_score:0.670989
[149]	Validation-error:0.04234	Validation-f1_score:0.669374
[150]	Validation-error:0.042444	Validation-f1_score:0.667747
[151]	Validation-error:0.04

[270]	Validation-error:0.04161	Validation-f1_score:0.666118
[271]	Validation-error:0.041297	Validation-f1_score:0.667763
[272]	Validation-error:0.041402	Validation-f1_score:0.665021
[273]	Validation-error:0.041297	Validation-f1_score:0.663914
[274]	Validation-error:0.041193	Validation-f1_score:0.666117
[275]	Validation-error:0.041402	Validation-f1_score:0.665025
[276]	Validation-error:0.041297	Validation-f1_score:0.667769
[277]	Validation-error:0.041193	Validation-f1_score:0.666667
[278]	Validation-error:0.041402	Validation-f1_score:0.663907
[279]	Validation-error:0.041089	Validation-f1_score:0.666116
[280]	Validation-error:0.041089	Validation-f1_score:0.665017
[281]	Validation-error:0.040984	Validation-f1_score:0.664463
[282]	Validation-error:0.040776	Validation-f1_score:0.663914
[283]	Validation-error:0.04088	Validation-f1_score:0.664468
[284]	Validation-error:0.041089	Validation-f1_score:0.663358
[285]	Validation-error:0.040984	Validation-f1_score:0.665012
[286]	Validation-error:0.0

[405]	Validation-error:0.04088	Validation-f1_score:0.667771
[406]	Validation-error:0.040776	Validation-f1_score:0.668874
[407]	Validation-error:0.040984	Validation-f1_score:0.668874
[408]	Validation-error:0.040463	Validation-f1_score:0.669983
[409]	Validation-error:0.040672	Validation-f1_score:0.669428
[410]	Validation-error:0.040672	Validation-f1_score:0.669428
[411]	Validation-error:0.040672	Validation-f1_score:0.669428
[412]	Validation-error:0.040567	Validation-f1_score:0.669983
[413]	Validation-error:0.040567	Validation-f1_score:0.669983
[414]	Validation-error:0.040463	Validation-f1_score:0.669983
[415]	Validation-error:0.040359	Validation-f1_score:0.669983
[416]	Validation-error:0.040463	Validation-f1_score:0.669983
[417]	Validation-error:0.040254	Validation-f1_score:0.669983
[418]	Validation-error:0.040254	Validation-f1_score:0.670539
[419]	Validation-error:0.039942	Validation-f1_score:0.670539
[420]	Validation-error:0.040046	Validation-f1_score:0.670539
[421]	Validation-error:0.

In [175]:
cv_results['test-f1_score-mean'].max()


0.6851364

In [174]:
test_pred = xgb_model.predict(dtest)
test['label'] = (test_pred >= 0.3).astype(np.int)
submission = test[['id','label']] 
submission.to_csv('xgb_w2v_tuned.csv', index=False)