### Packages

In [22]:
import pandas as pd
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import csr_matrix, hstack

from sklearn.model_selection import KFold
from wordbatch.models import FTRL
from sklearn.metrics import f1_score, make_scorer

import gc
import os
import re

%matplotlib inline

### Read data

In [6]:
train = pd.read_csv('../data/dataframe/train.csv')
test = pd.read_csv('../data/dataframe/test.csv')

### Feature extraction

In [7]:
def text_cleaner_words(text):
    
    text = re.sub(r'[^a-zâàäçéèêëîïôùûüœ]+', ' ', text.lower())
    return(text)

def text_cleaner_chars(text):
    
    text = re.sub(r'[^a-zâàäçéèêëîïôùûüœ]+', '', text.lower())
    return(text)

#### word extraction

In [8]:
### english
tf_en = TfidfVectorizer(ngram_range=(1,10), max_features=100000, preprocessor=text_cleaner_words)
tf_en.fit(train[train.type == "english"].text.tolist())
col_en = ['english_%s' % c for c in list(tf_en.vocabulary_)]

### french
tf_fr = TfidfVectorizer(ngram_range=(1,10), max_features=100000, preprocessor=text_cleaner_words)
tf_fr.fit(train[train.type == "french"].text.tolist())
col_fr = ['french_%s' % c for c in list(tf_fr.vocabulary_)]

#### chars extraction

In [9]:
### english
tf_en_char = TfidfVectorizer(ngram_range=(1,1), max_features=100, preprocessor=text_cleaner_chars, analyzer='char')
tf_en_char.fit(train[train.type == "english"].text.tolist())
col_char_en = ['englishChar_%s' % c for c in list(tf_en_char.vocabulary_)]

### french
tf_fr_char = TfidfVectorizer(ngram_range=(1,1), max_features=100, preprocessor=text_cleaner_chars, analyzer='char')
tf_fr_char.fit(train[train.type == "french"].text.tolist())
col_char_fr = ['frenchChar_%s' % c for c in list(tf_fr_char.vocabulary_)]

In [23]:
X = csr_matrix(
    hstack(
        [
            tf_en.transform(train.text.tolist()),
            tf_fr.transform(train.text.tolist()),
            tf_en_char.transform(train.text.tolist()),
            tf_fr_char.transform(train.text.tolist())
        ]
    )
)

y = np.array([1 if x == 'english' else 0 for x in train.type.tolist()])

columns = col_en + col_fr + col_char_en + col_char_fr 

Xtest = csr_matrix(
    hstack(
        [
            tf_en.transform(test.text.tolist()),
            tf_fr.transform(test.text.tolist()),
            tf_en_char.transform(test.text.tolist()),
            tf_fr_char.transform(test.text.tolist())
        ]
    )
)

In [24]:
print(X.shape)
gc.collect()

(4692, 200054)


493

### Machine learning

#### FTRL

In [36]:
kf = KFold(n_splits = 10, shuffle=True, random_state = 2701)

scores = list()
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index,:], X[test_index,:]
    y_train, y_test = y[train_index], y[test_index]
    
    FTRLModel = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=300)
    FTRLModel.fit(X_train, y_train)
    
    scores.append(f1_score(y_test, np.round(FTRLModel.predict(X_test))))
    print(scores[-1])

0.8946236559139785
0.9227722772277227
0.9288537549407114
0.9099099099099098
0.9124999999999999
0.9230769230769229
0.9105691056910569
0.9301310043668123
0.924
0.9170305676855894


In [37]:
print("Accuracy: %0.2f (+/- %0.2f)" % (np.mean(scores), np.std(scores) * 2))

Accuracy: 0.92 (+/- 0.02)


In [38]:
FTRLModel = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=300)
FTRLModel.fit(X, y)

### Predict and save

In [39]:
test['en'] = FTRLModel.predict(Xtest)
test['fr'] = 1 - test['en']

In [42]:
test.sort_values('fr', ascending = False).head()

Unnamed: 0,id,text,en,fr
12133,8e62eb7ec323d3d2499422975752f378,Le port de Gdynia est sans doute susceptible d...,1.690486e-07,1.0
10302,719dfc6463ddc9f99ed97771cde35a4e,— 68 —\n\nune entente spéciale sur la base du ...,1.890985e-07,1.0
5854,d65d9fcc2adc4a75fd301021e5573e7b,= @ == ¢\n\n \n \n \n \n \n \n \n\nAprés...,2.201007e-07,1.0
3247,6b7358314612f951faea67b3d0e67ee3,On cherche actuellement 4 savoir si ces trois ...,2.615427e-07,1.0
1106,67a96a786b14bddc843ca0b49fd490d0,"— §7 —\n\nDans la Suéde méridionale, le codt t...",4.006107e-07,1.0


In [43]:
test = test[['id', 'en', 'fr']]
test.columns = ['filename', 'en', 'fr']
test['filename'] = list(map(lambda x: x+'.jpg', test.filename.tolist()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [45]:
try:
    os.mkdir('../submit')
except: None
    
test.to_csv('../submit/FTRL2609.csv', index = False)