### Packages

In [1]:
import gc
import os
import re

import pandas as pd
import numpy as np 

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix, hstack

from sklearn.model_selection import KFold, StratifiedKFold 
from wordbatch.models import FTRL

from sklearn.metrics import f1_score

import matplotlib.pyplot as plt

%matplotlib inline

### Read data

In [2]:
# Load the training and test sets (paths are relative to the notebook's working directory).
# Later cells read `train.type` / `train.text` and `test.id` / `test.text`.
train = pd.read_csv('../data/dataframe/train.csv')
test = pd.read_csv('../data/dataframe/test.csv')

### Feature extraction

In [7]:
def text_cleaner(text):
    """Normalise raw text before vectorisation.

    The text is lower-cased, every run of characters other than ``a-z`` and
    the accented letters listed in the pattern is replaced by a single space,
    and leading/trailing spaces are removed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Cleaned text containing only the allowed letters separated by single
        spaces (possibly the empty string).
    """
    # The `+` quantifier already collapses any run of digits, punctuation,
    # newlines and whitespace into ONE space, so no separate newline or
    # multi-space pass is needed; only the edges can still carry a space.
    text = re.sub(r'[^a-zâàäçéèêëîïôùûüœ]+', ' ', text.lower())
    return text.strip()

#### Word and character TF-IDF extraction

In [8]:
### english
# Word-level TF-IDF (1- to 10-grams, at most 80,000 features), fitted only on
# the training rows labelled "english".
tf_en = TfidfVectorizer(ngram_range=(1,10), max_features=80000, preprocessor=text_cleaner, norm='l2', sublinear_tf=True)
tf_en.fit(train[train.type == "english"].text.tolist())
# `vocabulary_` maps term -> column index and is NOT iterated in index order,
# so sort the terms by their index: col_en[i] then names column i of
# tf_en.transform(...).
col_en = ['english_%s' % c for c in sorted(tf_en.vocabulary_, key=tf_en.vocabulary_.get)]

### french
# Same configuration, fitted only on the training rows labelled "french".
tf_fr = TfidfVectorizer(ngram_range=(1,10), max_features=80000, preprocessor=text_cleaner, norm='l2', sublinear_tf=True)
tf_fr.fit(train[train.type == "french"].text.tolist())
# Terms sorted by column index (see note above) so names match matrix columns.
col_fr = ['french_%s' % c for c in sorted(tf_fr.vocabulary_, key=tf_fr.vocabulary_.get)]

In [9]:
### english
# Character-level TF-IDF (single characters, at most 100 features), fitted
# only on the training rows labelled "english".
tf_en_char = TfidfVectorizer(ngram_range=(1,1), max_features=100, preprocessor=text_cleaner, analyzer='char')
tf_en_char.fit(train[train.type == "english"].text.tolist())
# `vocabulary_` maps character -> column index and is NOT iterated in index
# order; sort by index so col_char_en[i] names column i of the transformed matrix.
col_char_en = ['englishChar_%s' % c for c in sorted(tf_en_char.vocabulary_, key=tf_en_char.vocabulary_.get)]

### french
# Same configuration, fitted only on the training rows labelled "french".
tf_fr_char = TfidfVectorizer(ngram_range=(1,1), max_features=100, preprocessor=text_cleaner, analyzer='char')
tf_fr_char.fit(train[train.type == "french"].text.tolist())
# Characters sorted by column index so names match matrix columns.
col_char_fr = ['frenchChar_%s' % c for c in sorted(tf_fr_char.vocabulary_, key=tf_fr_char.vocabulary_.get)]

In [10]:
def _stack_tfidf_features(texts):
    """Return one CSR matrix holding all four TF-IDF blocks for `texts`.

    Blocks are stacked horizontally in this order: English words, French
    words, English characters, French characters.
    """
    # The vectorizers are looked up by name at call time and only referenced
    # locally, so deleting them later still releases their memory.
    blocks = [
        vectorizer.transform(texts)
        for vectorizer in (tf_en, tf_fr, tf_en_char, tf_fr_char)
    ]
    return csr_matrix(hstack(blocks))


# Training design matrix.
X = _stack_tfidf_features(train.text.tolist())

# Binary target: 1 for rows labelled 'english', 0 for everything else.
y = np.array([int(label == 'english') for label in train.type.tolist()])

# Feature-name lists concatenated in the same block order as the stacked matrices.
columns = col_en + col_fr + col_char_en + col_char_fr

# Test design matrix, built with the vectorizers fitted on the training data.
Xtest = _stack_tfidf_features(test.text.tolist())

In [11]:
# Drop the four fitted vectorizers and ask the garbage collector to reclaim
# their memory. Re-running the feature cells after this requires refitting.
del tf_en, tf_fr, tf_en_char, tf_fr_char
gc.collect()

4

### Machine learning

### FTRL

#### Training

In [84]:
# Stratified 10-fold cross-validation: a fresh FTRL model is trained on each
# fold, scored with F1 on the held-out part, and also used to predict Xtest.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2701)
ftrl_params = dict(alpha=0.60, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=950)
ftrlPred = []

for train_index, test_index in kf.split(X, y):
    # NOTE: X_test / y_test are this fold's held-out validation rows taken
    # from X / y, not the real test set (that one is Xtest).
    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]

    FTRLModel = FTRL(**ftrl_params)
    FTRLModel.fit(X_train, y_train)

    # Keep this fold's predictions on the real test set for later averaging.
    ftrlPred.append(FTRLModel.predict(Xtest))

    # Score the held-out fold; predictions are rounded before computing F1.
    validPred = FTRLModel.predict(X_test)
    print(f1_score(y_test, np.round(validPred)))

    # Release the model before the next fold is trained.
    del FTRLModel
    gc.collect()

0.926530612244898
0.9056603773584906
0.9117043121149897
0.9128630705394191
0.9300411522633746
0.918580375782881
0.9090909090909092
0.9224318658280922
0.9221052631578948
0.9110169491525425


In [85]:
# Average the per-fold test predictions: after the transpose each column holds
# one fold's predictions, and the row-wise mean combines the folds.
ftrlPredEnglish = pd.DataFrame(ftrlPred).transpose().mean(axis=1)

In [98]:
# Build the submission frame from `test` without mutating it:
#   * the raw `text` column is dropped (keyword form: the positional `axis`
#     argument of DataFrame.drop no longer exists in pandas >= 2.0),
#   * `id` becomes `filename` with a '.jpg' suffix,
#   * `en` is the averaged fold prediction and `fr` its complement.
ftrlTest = (
    test
    .drop(columns=['text'])
    .rename(columns={'id': 'filename'})
    .assign(
        filename=lambda frame: frame['filename'] + '.jpg',
        # Assign by position so the result does not depend on the two
        # objects sharing the same index labels.
        en=np.asarray(ftrlPredEnglish),
        fr=lambda frame: 1 - frame['en'],
    )
)
ftrlTest.to_csv('../submit/FTRL.csv', index=False)

In [99]:
# Preview the first rows of the submission frame as a sanity check.
ftrlTest.head()

Unnamed: 0,filename,en,fr
0,9a03e71410809857e19dea363daee945.jpg,0.999943,5.7e-05
1,8f0e95610aa9880767a04888fab11ebb.jpg,0.003512,0.996488
2,d423882dfd1db8b145a1d2ce0663267d.jpg,0.999985,1.5e-05
3,d52eebffd648db43a20d48c0921394db.jpg,7.2e-05,0.999928
4,318d0ddd145e53e496d0660dd6f0d2d3.jpg,0.999983,1.7e-05
