# Model Catboost

# 1. Imports

## 1.1 Packages

In [1]:
import os
import sys

from collections import defaultdict
from operator import contains
import pandas as pd

from catboost import Pool, cv
from wordcloud import STOPWORDS


from typing import List, Tuple

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Option packages

pd.set_option('display.max_columns', 500)

%load_ext autoreload
%autoreload 2

In [3]:
sys.path.append('../src/')
from features.build_features import word_count_tweet, unique_word_count_tweet, url_count_tweet, mean_word_length_tweet,\
    char_count_tweet, punctuation_count_tweet, hashtag_count_tweet, mention_count_tweet

## 1.2 Options

In [4]:
path_data = '../data/raw/'

## 1.3 Dataset

In [5]:
df = pd.read_csv(os.path.join(path_data, 'train.csv'), index_col=0)

# 2. Prepare data

In [6]:
df = df.pipe(word_count_tweet)\
    .pipe(unique_word_count_tweet)\
    .pipe(url_count_tweet)\
    .pipe(mean_word_length_tweet)\
    .pipe(char_count_tweet)\
    .pipe(punctuation_count_tweet)\
    .pipe(hashtag_count_tweet)\
    .pipe(mention_count_tweet)

df['keyword'] = df.keyword.fillna('uknwn')
df['location'] = df.location.fillna('uknwn')

In [7]:
df.sample(5)

Unnamed: 0_level_0,keyword,location,text,target,word_count,unique_word_count,url_count,mean_word_length,char_count,punctuation_count,hashtag_count,mention_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5758,forest%20fires,uknwn,I'm about to cook your Smokey the Bear saving ...,0,14,14,1,5.571429,92,6,0,0
7932,rainstorm,"Memphis, TN",If you can't have the roar of the waves a rain...,0,15,14,1,5.533333,97,9,0,0
3833,detonate,back in japan ??????????,Detonate (feat. M?.?O?.?P?.?)\nfrom Grandeur b...,0,9,9,1,8.222222,82,17,0,0
6420,hurricane,uknwn,@pattonoswalt @FoxNews Wait I thought Fecal Hu...,1,22,21,0,5.227273,136,6,0,2
10355,weapons,uknwn,@NRO Except when ordered not to carry unauthor...,1,20,20,0,4.9,117,2,0,1


In [8]:
X_train = df[list(set(df.columns) - set('target'))]
y_train = df.target

# 3. Model CatBoost

In [9]:
feat_cat = [
    'location', 'keyword', 'text',
]

In [11]:
params = {
    'loss_function': 'Logloss',
    'iterations': 100,
}

train_pool = Pool(data=X_train, label=y_train, cat_features=feat_cat, has_header=True)

cv_data = cv(
    params=params,
    pool=train_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=42,
    stratified=True,
    plot=True,
    verbose=20,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 0.5895329	test: 0.5898311	best: 0.5898311 (0)	total: 213ms	remaining: 3m 33s
100:	learn: 0.0004649	test: 0.0004581	best: 0.0004581 (100)	total: 5.65s	remaining: 50.3s
200:	learn: 0.0003269	test: 0.0003216	best: 0.0003216 (199)	total: 9.93s	remaining: 39.5s
300:	learn: 0.0003113	test: 0.0003063	best: 0.0003063 (300)	total: 13.8s	remaining: 32s
400:	learn: 0.0003111	test: 0.0003060	best: 0.0003060 (400)	total: 17.8s	remaining: 26.6s
500:	learn: 0.0003109	test: 0.0003058	best: 0.0003058 (500)	total: 21.5s	remaining: 21.4s
600:	learn: 0.0003107	test: 0.0003056	best: 0.0003056 (598)	total: 25.2s	remaining: 16.8s
700:	learn: 0.0003105	test: 0.0003055	best: 0.0003055 (700)	total: 29.1s	remaining: 12.4s
800:	learn: 0.0003102	test: 0.0003051	best: 0.0003051 (800)	total: 33s	remaining: 8.19s
900:	learn: 0.0003100	test: 0.0003049	best: 0.0003049 (900)	total: 36.8s	remaining: 4.04s
999:	learn: 0.0003099	test: 0.0003048	best: 0.0003048 (997)	total: 40.7s	remaining: 