### Load

In [1]:
import csv

import pandas as pd, numpy as np

In [2]:
# Both files are headerless, tab-separated text. Quoting is disabled outright
# (instead of declaring the space character as the quote char), so a field that
# happens to start with a space can never be parsed as a quoted field and
# swallow the tab separators that follow it.
df = pd.read_csv('data/train.tsv', sep='\t', quoting=csv.QUOTE_NONE, header=None)
df.columns = ['context_id', 'context_2', 'context_1', 'context_0', 'reply_id', 'reply', 'label', 'confidence']
test = pd.read_csv('data/public.tsv', sep='\t', quoting=csv.QUOTE_NONE, header=None)
test.columns = ['context_id', 'context_2', 'context_1', 'context_0', 'reply_id', 'reply']

In [3]:
# Replace every missing cell with an empty string in both frames.
df = df.fillna('')
test = test.fillna('')

### Preprocessing

#### Labeling target

In [4]:
def label_enc(x, reverse=False):
    """Convert between label names and their integer codes.

    Forward direction (``reverse == False``): ``'bad' -> 0``,
    ``'neutral' -> 1`` and any other value ``-> 2``.
    Reverse direction: ``0 -> 'bad'``, ``1 -> 'neutral'`` and any other
    value ``-> 'good'``.
    """
    known = (('bad', 0), ('neutral', 1))

    # Plain equality is used throughout (rather than dict lookups or ``not``),
    # so the comparison semantics stay those of the ``==`` operator.
    if reverse == False:
        for name, code in known:
            if x == name:
                return code
        return 2

    for name, code in known:
        if x == code:
            return name
    return 'good'

In [5]:
# Encode the label names only while the column is still non-numeric. Without
# this guard, re-running the cell would pass the already-encoded integers
# through label_enc again, and its fallback branch would turn every label into 2.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].apply(label_enc)

# Regression target: the integer label code scaled by the confidence column.
df['target'] = df['label'] * df['confidence']

#### FastText

In [6]:
import fastText, re

In [7]:
# Path of the fastText binary model that is loaded for sentence embeddings.
FT_MODEL_PATH = "./fastText/cc.ru.300.bin"
ft_model = fastText.load_model(FT_MODEL_PATH)

In [8]:
def pre(s):
    """Return ``s`` with every non-word character replaced by one space."""
    non_word = re.compile(r'[^\w]')
    return non_word.sub(' ', s)

In [9]:
%%time

# Text columns that are embedded, in the order their vectors are concatenated.
TEXT_COLUMNS = ['context_2', 'context_1', 'context_0', 'reply']


def sentence_vectors(frame, column):
    """Clean one text column with ``pre`` and stack its fastText sentence vectors row-wise."""
    cleaned = frame[column].apply(pre)
    return np.vstack(cleaned.apply(lambda text: ft_model.get_sentence_vector(text)))


# One matrix per text column, for the training and the public test frame.
t1_ft, t2_ft, t3_ft, t5_ft = [sentence_vectors(df, column) for column in TEXT_COLUMNS]
te1_ft, te2_ft, te3_ft, te5_ft = [sentence_vectors(test, column) for column in TEXT_COLUMNS]

# Concatenate the per-column vectors side by side into one feature matrix each.
Train = np.hstack([t1_ft, t2_ft, t3_ft, t5_ft])
Test  = np.hstack([te1_ft, te2_ft, te3_ft, te5_ft])

# Drop the reference to the model once all vectors are computed; the model
# loading cell has to be re-run before this cell can be executed again.
del ft_model

CPU times: user 5.68 s, sys: 399 ms, total: 6.08 s
Wall time: 5.98 s


#### Train / hold-out split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 35% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Train,
    df['target'],
    test_size=0.35,
    random_state=42,
)

### Cross-Validation

In [12]:
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import ARDRegression, ElasticNet, LinearRegression, LogisticRegression, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.svm import LinearSVR
import xgboost as xgb

from sklearn.model_selection import cross_val_score, KFold

#### Testing regressors

In [15]:
def calculate_cv(X, y, job = 2):
    """Cross-validate a set of default-configured regressors on ``(X, y)``.

    Every enabled estimator is scored with 6-fold ``KFold`` cross-validation
    using the ``'neg_mean_absolute_error'`` scorer, i.e. the values are
    negated mean absolute errors (closer to zero is better).

    Parameters
    ----------
    X : feature matrix passed through to ``cross_val_score``.
    y : target values passed through to ``cross_val_score``.
    job : value forwarded as ``n_jobs`` to ``cross_val_score``.

    Returns
    -------
    dict
        Maps each estimator key to a list holding the mean fold score.
        The keys ``'ard'``, ``'lor'`` and ``'rnr'`` are kept for
        compatibility but stay empty because those models are not evaluated.
    """
    cv = KFold(n_splits=6)

    # The scorer name without the ``neg_`` prefix was deprecated and later
    # removed from scikit-learn; this is the supported spelling and matches the
    # ``neg_*`` naming used for the hyperparameter search in this notebook.
    scoring = 'neg_mean_absolute_error'

    # Estimators that are actually evaluated, in evaluation order.
    estimators = [
        ('adb', AdaBoostRegressor()),
        ('bag', BaggingRegressor()),
        ('ext', ExtraTreesRegressor()),
        ('grad', GradientBoostingRegressor()),
        ('rf', RandomForestRegressor()),
        ('en', ElasticNet()),
        ('lir', LinearRegression()),
        ('sgd', SGDRegressor()),
        ('kn', KNeighborsRegressor()),
        ('svm', LinearSVR()),
        ('xgbr', xgb.XGBRegressor()),
    ]

    # Result keys in their historical order, including the disabled models
    # (ARDRegression, LogisticRegression, RadiusNeighborsRegressor).
    result_keys = ['adb', 'bag', 'ext', 'grad', 'rf', 'ard', 'en',
                   'lir', 'lor', 'sgd', 'kn', 'rnr', 'svm', 'xgbr']
    results = {key: [] for key in result_keys}

    for key, estimator in estimators:
        fold_scores = cross_val_score(estimator, X, y, cv=cv, scoring=scoring, n_jobs=job)
        results[key].append(fold_scores.mean())

    return results

In [16]:
# Score every enabled regressor on the training split; -1 is forwarded as n_jobs.
results = calculate_cv(X=X_train, y=y_train, job=-1)

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample

  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [17]:
# Mean cross-validation score per estimator key, as returned by calculate_cv.
results

{'adb': [-0.7526395078262961],
 'bag': [-0.7222857182367132],
 'ext': [-0.7161385737150779],
 'grad': [-0.7237363651684822],
 'rf': [-0.7216018520929932],
 'ard': [],
 'en': [-0.7642601310076431],
 'lir': [-0.732830314376963],
 'lor': [],
 'sgd': [-0.7463088323190555],
 'kn': [-0.7169034732507672],
 'rnr': [],
 'svm': [-0.7237671980581929],
 'xgbr': [-0.7237175691482104]}

#### Randomized hyperparameter search (ElasticNet)

In [13]:
from sklearn.model_selection import RandomizedSearchCV
import datetime

In [16]:
def Testing_grid_en(X_train, Y_train, n_iter=3000, random_state=None):
    """Tune an ``ElasticNet`` with a randomized search and return the best model.

    Candidates are drawn from the parameter lists below and scored with
    4-fold ``KFold`` cross-validation on ``'neg_mean_squared_error'``.
    The elapsed time and the best mean test score are printed.

    Parameters
    ----------
    X_train : feature matrix used for the search.
    Y_train : target values used for the search.
    n_iter : number of sampled candidates (default 3000, the previous
        hard-coded value); lower it for a quicker run.
    random_state : optional seed forwarded to both the search and the
        estimator so a run can be reproduced. ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    The ``best_estimator_`` of the fitted ``RandomizedSearchCV``.
    """
    cv = KFold(n_splits=4)
    clf = ElasticNet(random_state=random_state)

    # ``copy_X`` and ``warm_start`` are deliberately not searched: they do not
    # change what a freshly cloned estimator learns on a fold, but they would
    # leak into the returned estimator (``copy_X=False`` allows a later fit to
    # overwrite the caller's array, ``warm_start=True`` makes a later fit start
    # from the previous coefficients).
    parameters = {
        'alpha': np.linspace(1, 2, 100),
        'fit_intercept': [True, False],
        'l1_ratio': np.linspace(0, 1, 50),
        'max_iter': [1000, 1500, 2000, 3000],
        'positive': [False, True],
        'precompute': [False, True],
        'selection': ['cyclic', 'random'],
        # NOTE(review): this range includes tol == 0, which presumably never
        # satisfies the stopping criterion — confirm whether it should be kept.
        'tol': np.linspace(0, 0.1, 1000),
    }
    # ``normalize`` only exists on older scikit-learn releases; search it
    # where the installed ElasticNet still exposes it.
    if 'normalize' in clf.get_params():
        parameters['normalize'] = [False, True]

    start_time = datetime.datetime.now()
    gs = RandomizedSearchCV(clf, scoring='neg_mean_squared_error', param_distributions=parameters,
                            cv=cv, n_jobs=-1, verbose=True, n_iter=n_iter,
                            random_state=random_state)
    gs.fit(X_train, Y_train)
    print ('Time elapsed:', datetime.datetime.now() - start_time)

    # Best mean cross-validated score among the sampled candidates.
    print (gs.best_score_)
    return gs.best_estimator_

In [17]:
# Run the randomized ElasticNet search on the training split and keep the best model.
clf = Testing_grid_en(X_train=X_train, Y_train=y_train)

Fitting 4 folds for each of 3000 candidates, totalling 12000 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   30.1s
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.1min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 22.4min


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 29.5min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 40.6min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 54.0min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 59.8min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed: 87.3min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed: 103.3min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
[Parallel(n_jobs=-1)]: Done 4976 tasks      | elapsed: 117.6min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
[Parallel(n_jobs=-1)]: Done 6026 tasks      | elapsed: 181.9min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
[Parallel(n_jobs=-1)]: Done 7176 tasks      | elapsed: 198.7min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


[Parallel(n_jobs=-1)]: Done 8426 tasks      | elapsed: 240.4min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
[Parallel(n_jobs=-1)]: Done 9776 tasks      | elapsed: 282.1min
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


[Parallel(n_jobs=-1)]: Done 11226 tasks      | elapsed: 307.5min
[Parallel(n_jobs=-1)]: Done 12000 out of 12000 | elapsed: 350.2min finished
  tol, rng, random, positive)


Time elapsed: 5:50:15.525958
-0.6798048256091412




In [27]:
# Refit the tuned estimator on all training rows (not only the X_train split)
# before predicting for the public test set.
clf.fit(Train, y=df['target'])
# NOTE(review): this rebinds y_test, replacing the hold-out targets returned by
# train_test_split with the predictions for the public test set.
y_test = clf.predict(X=Test)

In [28]:
# Empty frame that the submission columns are attached to further down.
sub = pd.DataFrame()

In [29]:
# Show only the first rows instead of dumping the whole frame into the output.
test.head(10)

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply
0,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,0,неа .
1,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,1,"нет , не хочу ."
2,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,2,нет .
3,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,3,"конечно , нет ."
4,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,4,"разумеется , нет ."
5,138920940977,"знаешь , я иногда подумываю , что тебе надо пр...",не - а .,нет ?,5,"да , нет ."
6,178951117610,ему не хватало внимания .,это ответы ника с первого теста .,"по нашему опыту , если бы ему "" не хватало вни...",0,он бы остановился .
7,178951117610,ему не хватало внимания .,это ответы ника с первого теста .,"по нашему опыту , если бы ему "" не хватало вни...",1,"но наш анализ показывает , что его ответы не б..."
8,178951117610,ему не хватало внимания .,это ответы ника с первого теста .,"по нашему опыту , если бы ему "" не хватало вни...",2,"точно , не хватает ."
9,178951117610,ему не хватало внимания .,это ответы ника с первого теста .,"по нашему опыту , если бы ему "" не хватало вни...",3,"он бы разглядывал её , она бы чувствовала себя..."


In [30]:
# Attach the identifiers plus a sort key; the prediction is negated so that an
# ascending sort on 'rank' puts the highest predicted value first.
sub = sub.assign(
    context_id=test['context_id'],
    reply_id=test['reply_id'],
    rank=-y_test,
)

In [31]:
# Order the rows by context, then by ascending 'rank' within each context.
sort_keys = ['context_id', 'rank']
submission = sub.sort_values(sort_keys)

In [32]:
# The sort key is not part of the written file; drop it once the rows are ordered.
submission = submission.drop(columns=['rank'])

In [33]:
# Write the ordered (context_id, reply_id) pairs without header or index.
# ``header`` is documented as a bool / list of names, so pass False explicitly
# rather than relying on None being falsy.
# NOTE(review): the file name ends in .tsv but the separator is a single space —
# confirm against the expected submission format.
submission.to_csv('yandex-ml-naive-fasttext.tsv', header=False, index=False, sep=' ')