In [1]:
#coding=utf-8
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

train = pd.read_csv("data/linear_train.txt")
# train

# encoding=utf8  
import sys  
reload(sys)  
sys.setdefaultencoding('utf8')

In [2]:
import re
vowels = set(u'аеёиоуыэюя')
sign_chars = set(u'ъь')
pattern = re.compile(u"(c*[ьъ]?vc+[ьъ](?=v))|(c*[ьъ]?v(?=v|cv))|(c*[ьъ]?vc[ъь]?(?=cv|ccv))|(c*[ьъ]?v[cьъ]*(?=$))")

def get_syllables(word):
    word = word.decode('utf-8').lower()
    mask = ''.join(['v' if c in vowels else c if c in sign_chars else 'c' for c in word.lower()])
    return ' '.join([word[m.start():m.end()] for m in pattern.finditer(mask)])


In [3]:
train['syllables'] = pd.Series([get_syllables(word.lower()) for word in list(train.word)])

In [4]:
train['lower'] = [word.decode('utf-8').lower() for word in train.word]

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(list(train['syllables']), train['class'], test_size=0.1, random_state=42)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import Pipeline, FeatureUnion

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer

In [8]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

In [9]:
features = []

In [10]:
train[:5]

Unnamed: 0,word,class,syllables,lower
0,Аалтонен,1,а ал то нен,аалтонен
1,Аар,0,а ар,аар
2,Аарон,0,а а рон,аарон
3,ААРОН,0,а а рон,аарон
4,Аарона,0,а а ро на,аарона


In [11]:
features.append(('vs', Pipeline([
    ('sa', ItemSelector(key='syllables')),
    ('va',
        TfidfVectorizer(
            min_df=10,
            ngram_range=(1, 3),
            stop_words=None,
            decode_error='replace',
            norm='l2',
            binary=False,
            max_features=5000
        ))
])))


In [12]:
features.append(('vc', Pipeline([
    ('sb', ItemSelector(key='word')),
    ('vb',
        TfidfVectorizer(
            min_df=5,
            ngram_range=(2, 6),
            stop_words=None,
            decode_error='ignore',
            analyzer='char',
            norm='l2',
            binary=False,
            max_features=50000,
            smooth_idf = True,
            sublinear_tf = True
        ))
])))

In [13]:
feature_union = FeatureUnion(features)

In [14]:
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [15]:
#create estimator
estimators = []
estimators.append(('features', feature_union))
estimators.append(('logistic', LogisticRegression()))

model = Pipeline(estimators)

In [17]:
# specify parameters and distributions to sample from
param_dist = {
    "features__vc__vb__max_features": [100000,200000],
    "features__vc__vb__norm": ['l2'],
    "features__vc__vb__ngram_range": [(2,6),(2,4)],
    "logistic__penalty": ['l1','l2']
}

In [18]:
# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [22]:
from time import time
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                   n_iter=n_iter_search, n_jobs=4)
start = time()
random_search.fit(train, train['class'])

print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

In [23]:
random_search.cv_results_

{'mean_fit_time': array([ 16.31048663,  15.05258036,  14.62201397,  12.47657863,
         12.73963245,  13.25814803,  11.88183268,  15.94292092,
         12.19800274,  13.60787368,  15.65605267,   9.86104004,
         15.88227129,  12.42414904,  16.32648277,  14.29498029,
         14.16658131,  12.44642901,  15.75684897,  10.96282101]),
 'mean_score_time': array([ 5.48725867,  4.78336827,  4.45470301,  4.60654608,  5.14494395,
         4.66697232,  4.24136821,  5.09113169,  4.25181166,  4.57566937,
         3.87514408,  4.10048294,  5.05168192,  4.69345665,  5.36444426,
         4.08996677,  4.47720059,  4.89074826,  4.72259529,  3.72575633]),
 'mean_test_score': array([ 0.88779978,  0.88438782,  0.88884506,  0.88703061,  0.88986076,
         0.89168507,  0.89168507,  0.88445685,  0.88708978,  0.88892395,
         0.89385453,  0.89269091,  0.88771103,  0.89306564,  0.89303605,
         0.88584727,  0.88677422,  0.8889634 ,  0.88670519,  0.88986076]),
 'mean_train_score': array([ 0.9310

In [None]:
count_vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 3)) 

In [None]:
sparse_feature_matrix = count_vectorizer.fit_transform(X_train)
sparse_feature_matrix

In [None]:
num_2_words = {
    v: k
    for k, v in count_vectorizer.vocabulary_.iteritems()
}

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [None]:
algo = LogisticRegression()
algo.fit(sparse_feature_matrix, y_train)

In [None]:
algo.coef_.shape

In [None]:
import heapq

W = algo.coef_.shape[1]
c = 0
topic_words = [
    num_2_words[w_num]
    for w_num in heapq.nlargest(20, range(W), key=lambda w: algo.coef_[c, w])
]
print ',  '.join(topic_words)


In [None]:
algo = LogisticRegression()
arr = cross_val_score(algo, sparse_feature_matrix, y_train, cv=5, scoring='accuracy')
print arr
print np.mean(arr)

In [None]:
algo.fit(sparse_feature_matrix, y_train)

In [None]:
accuracy_score(algo.predict(sparse_feature_matrix), y_train)

In [None]:
accuracy_score(algo.predict(count_vectorizer.transform(X_test)), y_test)

# Предсказание

In [None]:
test = pd.read_csv("data/linear_test.txt")
test

In [None]:
test['syllables'] = pd.Series([get_syllables(word.lower()) for word in list(test.word)])

In [None]:
test

In [None]:
sparse_feature_matrix = count_vectorizer.fit_transform(train['syllables'])
sparse_feature_matrix

algo = LogisticRegression(penalty='l1', C=0.2)
algo.fit(sparse_feature_matrix, train['class'])

In [None]:
solution = pd.DataFrame()   
solution['Answer'] = pd.Series(algo.predict_proba(count_vectorizer.transform(test['syllables']))[:,1])

In [None]:
solution

In [None]:
solution.to_csv('solution5.txt')