### Packages

In [46]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import csr_matrix, hstack

from sklearn.model_selection import KFold
from wordbatch.models import FTRL
from sklearn.metrics import f1_score, make_scorer

import gc
import os
import re

%matplotlib inline

### Read data

In [75]:
# Labelled training documents first, then the unlabelled documents to score.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/dataframe/train.csv', '../data/dataframe/test.csv')
)

### Feature extraction

In [7]:
def text_cleaner_words(text):
    """Normalise ``text`` for word-level tokenisation.

    The text is lower-cased, then every run of characters outside ``a-z``
    and the accented letters listed in the pattern is replaced by a single
    space, so the remaining words stay separated.
    """
    lowered = text.lower()
    non_letter_runs = re.compile(r'[^a-zâàäçéèêëîïôùûüœ]+')
    return non_letter_runs.sub(' ', lowered)

def text_cleaner_chars(text):
    """Normalise ``text`` for character-level features.

    The text is lower-cased, then every character outside ``a-z`` and the
    accented letters listed in the pattern is removed outright (no
    separator is inserted), leaving one contiguous string of letters.
    """
    lowered = text.lower()
    non_letter_runs = re.compile(r'[^a-zâàäçéèêëîïôùûüœ]+')
    return non_letter_runs.sub('', lowered)

#### Word extraction

In [8]:
# Word-level TF-IDF: one vectorizer per language, each fitted only on the
# training documents of that language (1- to 10-word n-grams, vocabulary
# capped at 100,000 terms).
#
# `vocabulary_` maps term -> column index, but iterating the dict does not
# follow that index. The column-name lists are therefore built by sorting
# the terms on their index, so that name i describes column i of the
# matrix returned by `.transform(...)`.

### English
tf_en = TfidfVectorizer(ngram_range=(1,10), max_features=100000, preprocessor=text_cleaner_words)
tf_en.fit(train[train.type == "english"].text.tolist())
col_en = ['english_%s' % c for c in sorted(tf_en.vocabulary_, key=tf_en.vocabulary_.get)]

### French
tf_fr = TfidfVectorizer(ngram_range=(1,10), max_features=100000, preprocessor=text_cleaner_words)
tf_fr.fit(train[train.type == "french"].text.tolist())
col_fr = ['french_%s' % c for c in sorted(tf_fr.vocabulary_, key=tf_fr.vocabulary_.get)]

#### Character extraction

In [9]:
# Character-level TF-IDF (single characters, at most 100 features), again
# one vectorizer per language fitted on that language's training documents.
#
# `vocabulary_` maps character -> column index, but iterating the dict does
# not follow that index. Sorting on the index keeps each generated name
# aligned with the matching column of `.transform(...)`.

### English
tf_en_char = TfidfVectorizer(ngram_range=(1,1), max_features=100, preprocessor=text_cleaner_chars, analyzer='char')
tf_en_char.fit(train[train.type == "english"].text.tolist())
col_char_en = ['englishChar_%s' % c for c in sorted(tf_en_char.vocabulary_, key=tf_en_char.vocabulary_.get)]

### French
tf_fr_char = TfidfVectorizer(ngram_range=(1,1), max_features=100, preprocessor=text_cleaner_chars, analyzer='char')
tf_fr_char.fit(train[train.type == "french"].text.tolist())
col_char_fr = ['frenchChar_%s' % c for c in sorted(tf_fr_char.vocabulary_, key=tf_fr_char.vocabulary_.get)]

In [None]:
# The four fitted vectorizers, in the order their feature blocks are stacked.
# `columns` below concatenates the name lists in this same order.
vectorizers = [tf_en, tf_fr, tf_en_char, tf_fr_char]

# Training matrix: every document is passed through each vectorizer and the
# resulting sparse blocks are placed side by side.
train_texts = train.text.tolist()
X = csr_matrix(hstack([vec.transform(train_texts) for vec in vectorizers]))

# Binary target: 1 for rows labelled 'english', 0 for any other label.
y = np.array([int(label == 'english') for label in train.type.tolist()])

columns = col_en + col_fr + col_char_en + col_char_fr 

# Test matrix, assembled exactly like the training one.
test_texts = test.text.tolist()
Xtest = csr_matrix(hstack([vec.transform(test_texts) for vec in vectorizers]))

In [54]:
# Print the (rows, columns) of the stacked training matrix, then run a
# garbage-collection pass; the cell output shows what gc.collect() returns.
n_rows, n_cols = X.shape
print((n_rows, n_cols))
gc.collect()

(4692, 200054)


835

### Machine learning

#### FTRL

In [56]:
# Cross-validated grid search over FTRL's `alpha` and `iters` settings.
# The same 10 shuffled folds (fixed seed) are used for every combination.
kf = KFold(n_splits=10, shuffle=True, random_state=2701)

# All (alpha, iters) pairs, alpha varying slowest.
param_grid = [
    (alpha, iters)
    for alpha in [0.1, 0.5, 1, 5, 10]
    for iters in [50, 100, 300, 500]
]

scores = []
for alpha, iters in param_grid:
    tmp_scores = []
    for train_index, test_index in kf.split(X):
        # A fresh model per fold, fitted on the training part only.
        FTRLModel = FTRL(alpha=alpha, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=iters)
        FTRLModel.fit(X[train_index, :], y[train_index])

        # Round the model output to hard 0/1 labels before computing F1.
        fold_pred = np.round(FTRLModel.predict(X[test_index, :]))
        tmp_scores.append(f1_score(y[test_index], fold_pred))

    # One entry per combination: ((alpha, iters), (mean F1, 2 * std of F1)).
    cv_summary = (np.mean(tmp_scores), np.std(tmp_scores) * 2)
    scores.append(((alpha, iters), cv_summary))
    print(scores[-1])

((0.1, 50), (0.9041051025806782, 0.021980704003243645))
((0.1, 100), (0.9101761932719276, 0.0265103319567878))
((0.1, 300), (0.913895408201707, 0.02690711668741451))
((0.1, 500), (0.9142595923259125, 0.02634780902059879))
((0.5, 50), (0.8882956101254094, 0.02690646546475696))
((0.5, 100), (0.9106506280914946, 0.02603339671154616))
((0.5, 300), (0.916257959550849, 0.021863913712538274))
((0.5, 500), (0.9192113985572045, 0.02061749818494385))
((1, 50), (0.8547565810124615, 0.041680673778334024))
((1, 100), (0.8978386113746085, 0.029610749742791237))
((1, 300), (0.9173467198812704, 0.020430940279187848))
((1, 500), (0.9189130972447362, 0.021549604851812502))
((5, 50), (0.5290833842725421, 0.0800290068713067))
((5, 100), (0.8339092567232852, 0.04678616765743912))
((5, 300), (0.906687336980846, 0.025337822703594144))
((5, 500), (0.915690466296572, 0.024358714726994874))
((10, 50), (0.24395986795915553, 0.06475691095085098))
((10, 100), (0.7444349808771321, 0.06217895932394041))
((10, 300), 

In [70]:
from itertools import chain  # not used by this cell; kept in case another cell relies on `chain`

# Each entry of `scores` is ((alpha, iters), (mean F1, 2 * std F1)).
# Keep the parameters of the entry with the highest mean F1 (the first such
# entry wins on a tie). Taking the tuple directly, rather than reading a row
# out of a DataFrame whose columns mix floats and ints, keeps `iters` an int
# instead of having it upcast to float before it is handed to the model.
bestElem = list(max(scores, key=lambda entry: entry[1][0])[0])

In [71]:
# Refit on the full training matrix with the selected (alpha, iters) pair;
# the remaining settings match the ones used during cross-validation.
best_alpha, best_iters = bestElem
FTRLModel = FTRL(alpha=best_alpha, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=best_iters)
FTRLModel.fit(X, y)

### Predict and save

In [76]:
# Score the test documents: 'en' holds the model output (the target was
# encoded as 1 = english) and 'fr' is its complement.
en_scores = FTRLModel.predict(Xtest)
test['en'] = en_scores
test['fr'] = 1 - test['en']

In [77]:
# Show the five test documents with the highest 'fr' score.
test.sort_values(by='fr', ascending=False).head(5)

Unnamed: 0,id,text,en,fr
10302,719dfc6463ddc9f99ed97771cde35a4e,— 68 —\n\nune entente spéciale sur la base du ...,5.917678e-07,0.999999
12133,8e62eb7ec323d3d2499422975752f378,Le port de Gdynia est sans doute susceptible d...,6.418887e-07,0.999999
5854,d65d9fcc2adc4a75fd301021e5573e7b,= @ == ¢\n\n \n \n \n \n \n \n \n\nAprés...,6.929701e-07,0.999999
3247,6b7358314612f951faea67b3d0e67ee3,On cherche actuellement 4 savoir si ces trois ...,6.978659e-07,0.999999
2554,3fb1775ea9ff5ac799e886f2a46f9a4f,\n\n \n\n \n\n \n\nSA ine\n\nM. Cavazzon1 (It...,9.436866e-07,0.999999


In [78]:
# Build the submission frame in one chained expression: keep the id and the
# two score columns, rename id -> filename, then append the '.jpg' extension.
# Deriving the new column via rename()/assign() avoids assigning into a
# column-sliced frame, which is what triggers pandas' SettingWithCopyWarning.
# NOTE: assumes every id is a string; a missing id would propagate as NaN.
test = (
    test[['id', 'en', 'fr']]
    .rename(columns={'id': 'filename'})
    .assign(filename=lambda frame: frame['filename'] + '.jpg')
)

In [79]:
# Make sure the output directory exists. exist_ok=True tolerates a directory
# that is already there, while genuine failures (e.g. missing permissions)
# still raise instead of being silently swallowed by a bare `except`.
os.makedirs('../submit', exist_ok=True)

# Write the predictions without the DataFrame index column.
test.to_csv('../submit/FTRL2709.csv', index = False)