In [1]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier

import pymorphy2
from sklearn.preprocessing import LabelEncoder

#metrics
from sklearn.metrics import roc_auc_score

#splitting
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

In [2]:
# Read the three competition files in one pass: the training words, the test
# words to score, and the submission template that receives the predictions.
train, test, sample = [
    pd.read_csv(csv_path)
    for csv_path in (
        'input/train.csv',
        'input/test.csv',
        'input/sample_submission.csv',
    )
]

## Features

### Capitalization of the word

In [3]:
# Capitalisation flags, stored as 0/1 integers on both frames.
for frame in (train, test):
    words = frame['Word']
    # str.istitle() is True only for title-cased words (an uppercase letter
    # followed by lowercase ones), so a fully upper-case word gets 0 here and
    # is picked up by `is_upper` instead.
    frame['starts_from_upper'] = words.apply(lambda word: int(word.istitle()))
    frame['is_upper'] = words.apply(lambda word: int(word.isupper()))

### pymorphy

In [4]:
morph = pymorphy2.MorphAnalyzer()

# Columns holding categorical pymorphy output; they are replaced by integer
# codes at the end of this cell.
columns_to_one_hot = ['pymorphy' , 'pymorphy_animacy', 'pymorphy_POS', 'pymorphy_case','pymorphy_number', 'pymorphy_gender']


def _grammeme_score(word, grammeme):
    """Return the score of the first parse of `word` whose tag contains `grammeme`.

    Parses are visited in the order `morph.parse` returns them; 0 is returned
    when no parse carries the grammeme.
    """
    for p in morph.parse(word):
        if grammeme in p.tag:
            return p.score
    return 0


def name_score(word):
    """Score of the first parse of `word` tagged 'Name', or 0 if there is none."""
    return _grammeme_score(word, 'Name')


def surn_score(word):
    """Score of the first parse of `word` tagged 'Surn', or 0 if there is none."""
    return _grammeme_score(word, 'Surn')


def add_pymorphy_features(frame):
    """Add the raw (not yet integer-coded) pymorphy2 features to `frame` in place.

    Reads `frame['Word']` and writes the `pymorphy_*` columns. The categorical
    ones (see `columns_to_one_hot`) are left as strings / None so that train
    and test can be encoded together afterwards.
    """
    words = frame['Word']

    frame['pymorphy_word_is_known'] = words.apply(morph.word_is_known).astype('int8')
    frame['pymorphy_count_in_tag'] = words.apply(lambda w: len(morph.tag(w))).astype('int8')
    frame['pymorphy_score'] = words.apply(lambda w: morph.parse(w)[0].score)

    # First tag returned by the analyzer; kept as an object only long enough
    # to read its grammeme attributes, then stored in its string form.
    first_tag = words.apply(lambda w: morph.tag(w)[0])
    frame['pymorphy'] = first_tag.map(str)
    for grammeme in ('animacy', 'POS', 'case', 'number', 'gender'):
        # The lambda is consumed immediately, so closing over the loop
        # variable is safe here.
        frame['pymorphy_' + grammeme] = first_tag.apply(lambda tag: getattr(tag, grammeme))

    frame['pymorphy_name_score'] = words.apply(name_score)
    frame['pymorphy_surn_score'] = words.apply(surn_score)


for frame in (train, test):
    add_pymorphy_features(frame)

# Encode the categorical columns with ONE mapping per column, learnt on train
# and reused for test. Fitting a separate LabelEncoder on each frame (as this
# notebook used to do) can assign different integers to the same tag in train
# and test, so the model would be scored on mismatched codes.
for column in columns_to_one_hot:
    train_labels = train[column].fillna('nan')
    test_labels = test[column].fillna('nan')

    encoder = LabelEncoder().fit(list(train_labels))
    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}

    train[column] = encoder.transform(list(train_labels))
    # Labels that never occur in train have no code; mark them with -1.
    test[column] = test_labels.map(code_by_label).fillna(-1).astype('int64')

### Beginning and ending of the word

In [5]:
# For each suffix length, count how many train words share the same
# lower-cased ending and attach that count to both frames. The counts cover
# every train row regardless of its label.
#
# Assigning the column directly (instead of merge + rename + drop of a
# temporary key column) keeps the cell idempotent: re-running it overwrites the
# feature rather than appending a duplicate 'surname_ending<N>' column.
for numberOfLast in [1, 2, 3, 4, 5]:
    feature_name = 'surname_ending' + str(numberOfLast)

    train_endings = train['Word'].apply(lambda word: word[-numberOfLast:].lower())
    test_endings = test['Word'].apply(lambda word: word[-numberOfLast:].lower())

    # Frequency table learnt on train only, indexed by the ending itself.
    ending_counts = train_endings.value_counts()

    train[feature_name] = train_endings.map(ending_counts)
    # Endings that never occur in train stay NaN here, exactly as the former
    # left merge produced; the notebook fills NaN with 0 in a later cell.
    test[feature_name] = test_endings.map(ending_counts)

In [6]:
# For each prefix length, count how many train words share the same
# lower-cased beginning and attach that count to both frames. The counts cover
# every train row regardless of its label.
#
# Assigning the column directly (instead of merge + rename + drop of a
# temporary key column) keeps the cell idempotent: re-running it overwrites the
# feature rather than appending a duplicate 'surname_beginning<N>' column.
for numberOfFirst in [1, 2, 3, 4, 5]:
    feature_name = 'surname_beginning' + str(numberOfFirst)

    train_beginnings = train['Word'].apply(lambda word: word[:numberOfFirst].lower())
    test_beginnings = test['Word'].apply(lambda word: word[:numberOfFirst].lower())

    # Frequency table learnt on train only, indexed by the beginning itself.
    beginning_counts = train_beginnings.value_counts()

    train[feature_name] = train_beginnings.map(beginning_counts)
    # Beginnings that never occur in train stay NaN here, exactly as the former
    # left merge produced; the notebook fills NaN with 0 in a later cell.
    test[feature_name] = test_beginnings.map(beginning_counts)

In [7]:
# Replace every remaining NaN with 0, in place, on both frames. This covers,
# for example, affix counts of test words whose prefix or suffix never occurs
# in train.
for frame in (train, test):
    frame.fillna(0, inplace = True)

### Vowels and consonants

In [8]:
# Lower-case Cyrillic letters, split into the two groups counted below.
# Note that the hard and soft signs (ъ, ь) are listed with the consonants.
vowels = 'аеёиоуыэюя'
consonants = 'бвгджзйклмнпрстфхцчшщъь'

# Every letter from either group, as a set.
alphabet = set(vowels).union(consonants)

In [9]:
def vowels_count(x):
    """Count the characters of `x` that, once lower-cased, appear in `vowels`."""
    return sum(1 for letter in x.lower() if letter in vowels)

def consonant_count(x):
    """Count the characters of `x` that, once lower-cased, appear in `consonants`."""
    return sum(1 for letter in x.lower() if letter in consonants)


# Per-word letter statistics, added to both frames: number of vowels, number
# of consonants, and total number of characters.
for frame in (train, test):
    words = frame['Word']
    frame['vowels_num'] = words.apply(vowels_count)
    frame['consonants_num'] = words.apply(consonant_count)
    frame['lenght'] = words.apply(len)

## Validation

In [19]:
# Name of the target column.
y = 'Label'

# Model inputs: every train column except the target and the raw word text.
non_feature_columns = [y, 'Word']
col = train.columns.drop(non_feature_columns)


In [20]:
# 4-fold stratified cross-validation with a fixed shuffle seed, scored by ROC AUC.
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=19)
lightGBM = LGBMClassifier()
lightGBM_score = cross_val_score(
    estimator=lightGBM,
    X=train[col],
    y=train[y],
    scoring='roc_auc',
    cv=cv,
)
# Displayed as (mean, std) of the per-fold scores.
lightGBM_score.mean(), lightGBM_score.std()

(0.9529779578609378, 0.0016771281917307668)

## Submit

In [21]:
# Refit a fresh model on the whole training set.
lightGBM = LGBMClassifier()
lightGBM.fit(X=train[col], y=train[y])

# predict_proba returns one column per class; column 1 is kept
# (presumably the probability of Label == 1 - confirm against the data).
class_probabilities = lightGBM.predict_proba(test[col])
prediction = class_probabilities[:, 1]

# Write the scores into the submission template. This relies on `sample`
# having its rows in the same order as `test`.
sample['Prediction'] = prediction
sample.to_csv('output/baseline.csv', index = False)