In [34]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import pymorphy2
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
import ssl

# Replace the default HTTPS context factory with the unverified one when the
# running Python exposes it, then fetch the NLTK stop-word corpus.
# NOTE(review): this turns off TLS certificate verification for every
# default-context HTTPS request made by this process -- presumably a workaround
# for local certificate problems during the download below; confirm it is still needed.
_create_unverified_https_context = getattr(ssl, '_create_unverified_context', None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/egor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Преобразование слов полей title и text к нормальной форме

In [9]:
# Read the training and test tables from CSV files in the working directory.
train_path, test_path = 'train_data.csv', 'test_data.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [10]:
def tolow(df, col):
    """Lower-case every string in column ``col`` of ``df`` (in place) and return ``df``."""
    lowered = df[col].apply(str.lower)
    df[col] = lowered
    return df

def delInfo(df, col):
    """Replace every parenthesised fragment in column ``col`` with a single space.

    A fragment is an opening parenthesis, any run of non-parenthesis
    characters, and the matching closing parenthesis. For nested parentheses
    only the innermost pair is removed in this pass. Unbalanced parentheses
    are left untouched.

    The column is modified in place and ``df`` is returned.
    """
    # The previous pattern r'\(*\)' matched just the closing parenthesis
    # (optionally preceded by opening ones), so the enclosed text survived.
    df[col] = df[col].apply(lambda x: re.sub(r'\([^()]*\)', ' ', x))
    return df

def delShortWords(df, col):
    """Remove whitespace-delimited words of 1-3 characters from column ``col``.

    Each run of one or more consecutive short words, together with the
    whitespace around it, is replaced by a single space. A word is removed
    only when it has whitespace on both sides, so a short word at the very
    start or end of the string is kept.

    The column is modified in place and ``df`` is returned.
    """
    # The run is matched as a whole: with the former pattern r'\s\w{1,3}\s'
    # two adjacent short words shared one delimiter, and since regex matches
    # cannot overlap every second short word in a run was left in the text.
    df[col] = df[col].apply(lambda x: re.sub(r'\s(?:\w{1,3}\s)+', ' ', x))
    return df


def delPunc(df, col):
    """Replace each listed punctuation character in column ``col`` with a space.

    The column is modified in place and ``df`` is returned.
    """
    punctuation = r'[!,\.\?\*\(\)"“”«»:;#№\-@%\+]'

    def blank_out(text):
        return re.sub(punctuation, ' ', text)

    df[col] = df[col].apply(blank_out)
    return df

def delGreat(df, col):
    """Replace a fixed list of Russian greeting phrases in column ``col`` with a space.

    The phrases are substituted one after another, in list order, for every
    row. The column is modified in place and ``df`` is returned.
    """
    greetings = (
        r'здравствуйте',
        r'добрый день',
        r'добрый вечер',
        r'доброе утро',
        r'приветствую',
    )

    def strip_greetings(text):
        for phrase in greetings:
            text = re.sub(phrase, ' ', text)
        return text

    df[col] = df[col].apply(strip_greetings)
    return df

def delWord(df, col, word):
    """Replace every regex match of ``word`` in column ``col`` with a space.

    ``word`` is passed to ``re.sub`` as the pattern. The column is modified
    in place and ``df`` is returned.
    """
    def blank_out(text):
        return re.sub(word, ' ', text)

    df[col] = df[col].apply(blank_out)
    return df

def delE(df, col):
    """Replace lower-case 'ё' with 'е' in column ``col`` (in place) and return ``df``."""
    def unify_yo(text):
        return text.replace('ё', 'е')

    df[col] = df[col].apply(unify_yo)
    return df

def delSpace(df, col):
    """Collapse each run of two or more whitespace characters in ``col`` to one space.

    The column is modified in place and ``df`` is returned.
    """
    def squeeze(text):
        return re.sub(r'\s{2,}', ' ', text)

    df[col] = df[col].apply(squeeze)
    return df

def delDig(df, col):
    """Replace each run of digits in column ``col`` with a single space.

    The column is modified in place and ``df`` is returned.
    """
    def blank_digits(text):
        return re.sub(r'\d+', ' ', text)

    df[col] = df[col].apply(blank_digits)
    return df

# Shared morphological analyser, created once and reused by the helpers below.
morph = pymorphy2.MorphAnalyzer()
def normalize(text):
    """Return ``text`` with every space-separated token replaced by its normal form.

    The text is split on single spaces; for each piece the ``normal_form`` of
    the first parse returned by ``morph`` is taken. Every normal form is
    followed by one space, so the result always ends with a space.
    """
    lemmas = (morph.parse(token)[0].normal_form for token in text.split(' '))
    return ''.join(lemma + ' ' for lemma in lemmas)

def repl(x):
    """Map a category name to its integer code (its position in the fixed list).

    Raises ``ValueError`` when ``x`` is not one of the listed names.
    """
    labels = [
        'Кредит',
        'Ипотека',
        'Реструктуризация',
        'Вклад',
        'Бизнес услуги',
        'Обслуживание физ. и юр. лиц',
        'Дебетовая карта',
        'Денежные переводы',
        'Инвестиционные продукты',
    ]
    code = labels.index(x)
    return code


# Clean and lemmatise the `title` and `text` columns of both tables, then
# encode the training labels as integer codes.
cleaning_steps = [tolow, delInfo, delPunc, delGreat, delE, delDig, delShortWords, delSpace]
targets = [(train, 'title'), (test, 'text'), (train, 'text'), (test, 'title')]

for el, col in targets:
    # Every step mutates the column in place and hands the same frame back.
    for f in cleaning_steps:
        el = f(el, col)
    el[col] = el[col].apply(normalize)

train['type'] = train['type'].apply(repl)

# Нахождение популярных осмысленных слов каждой категории для признаков

In [31]:
# Bind a second name to the `train` DataFrame; no copy is made, so both names
# refer to the same object.
classif_train = train

In [36]:
def tag(string):
    """Return True if the first parse of ``string`` is tagged NOUN, ADJF or INFN.

    Only the tag of the first parse returned by ``morph`` is inspected.
    """
    first_tag = morph.parse(string)[0].tag
    return any(pos in first_tag for pos in ('NOUN', 'ADJF', 'INFN'))

# Build the feature vocabulary: for every class take its most frequent
# meaningful words, then drop the words that are frequent in all classes.

N_CLASSES = 9   # class codes 0..8
TOP_N = 100     # number of most frequent words kept per class

RUSSIAN_STOPWORDS = set(stopwords.words('russian'))


def _class_words(frame, label):
    """Return the filtered tokens from the ``text`` of rows whose ``type`` equals ``label``.

    A token is kept when it is not a Russian stop word, consists of letters
    only, is at least 3 characters long and passes ``tag``. The result is a
    Series of strings; it is empty when the class has no rows or no token
    survives the filters.
    """
    texts = frame.loc[frame['type'] == label, 'text']
    kept = [
        word
        for text in texts
        for word in text.strip().split()
        if word not in RUSSIAN_STOPWORDS
        and word.isalpha()
        and len(word) >= 3
        and tag(word)
    ]
    return pd.Series(kept, dtype=object)


words_by_class = [_class_words(classif_train, label) for label in range(N_CLASSES)]

# Count each class on its own Series. Routing the tokens through the columns
# of one shared DataFrame would align every class to the row index of the
# first one and silently drop tokens from the other classes.
count_by_class = [tokens.value_counts() for tokens in words_by_class]

# One row per distinct word, one column per class; NaN means "the word does
# not occur in that class". The index is sorted so row order is reproducible.
df_count = pd.DataFrame(
    {'count_{}'.format(label): counts for label, counts in enumerate(count_by_class)}
).sort_index()
words_all = set(df_count.index)


def _top_words(column):
    """Return the set of up to ``TOP_N`` most frequent words in ``column`` of ``df_count``.

    Words that never occur in the class (NaN count) are excluded, and a
    stable sort over the alphabetically ordered index makes ties resolve the
    same way on every run.
    """
    freq = df_count[column].dropna().sort_values(ascending=False, kind='mergesort')
    return set(freq.index[:TOP_N])


top_by_class = [_top_words('count_{}'.format(label)) for label in range(N_CLASSES)]
kredit, ipoteka, restruct, vklad, bisnez, obslu, karta, perevod, invest = top_by_class

# Words that are in the top list of every class do not discriminate between
# classes, so they are removed from the vocabulary.
same_words = set.intersection(*top_by_class)
main_words = set().union(*top_by_class).difference(same_words)
main_words = pd.Series(sorted(main_words), dtype=object)  # sorted -> stable feature order
final_words = pd.DataFrame({'words': main_words})

# Добавление новых признаков, обучение модели и предсказание результатов

In [43]:
# `df` is another name for the `train` DataFrame; no copy is made.
df = train
# Self-assignment: `test` keeps referring to the same object.
test = test
# Convert the `final_words` DataFrame to a NumPy array.
words = np.array(final_words)

In [44]:
# Join title and text so the word indicators look at both fields.
df['text'] = df['title'] + ' ' + df['text']
test['text'] = test['title'] + ' ' + test['text']

# One binary column per vocabulary word: 1 when the word occurs in the joined
# text as a substring, 0 otherwise. Each row of `words` holds the word at
# position 0.
for i in words:
    word = i[0]
    df[word] = df['text'].apply(lambda x: int(word in x))
    test[word] = test['text'].apply(lambda x: int(word in x))

# Drop the raw text and the saved row-number column. `columns=` is used
# because pandas 2.0 no longer accepts `axis` as a positional argument of
# DataFrame.drop.
raw_columns = ['title', 'text', 'Unnamed: 0']
df = df.drop(columns=raw_columns)
# NOTE(review): `ids` is not used below -- the submission numbers its rows
# 0..n-1 instead. Confirm which one the expected submission format needs.
ids = test['Unnamed: 0']
test = test.drop(columns=raw_columns)

X_train = df.drop('type', axis=1)
y_train = df['type']

logistic = LogisticRegression(C=0.2)
logistic.fit(X_train, y_train)
# Select the test columns in training order so both matrices agree on the
# feature layout.
answers = logistic.predict(test[X_train.columns])

# Translate the integer codes back to category names (same order as the
# label encoding).
types = ['Кредит', 'Ипотека', 'Реструктуризация','Вклад','Бизнес услуги','Обслуживание физ. и юр. лиц','Дебетовая карта','Денежные переводы','Инвестиционные продукты']
answers = [types[code] for code in answers]

sub = pd.DataFrame({'index': range(0, len(answers)), 'type': answers})
sub.to_csv('classif.csv', index=False)