In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, SGDRegressor, LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from catboost import Pool, CatBoostClassifier

In [2]:
# датасет Доминики
df = pd.read_csv('bib_data_union_v4.csv.zip', compression='zip')

In [34]:
corpus = small_dataset.tokenized_record
vectorizer = CountVectorizer(ngram_range=(1, 2))
vect_df = pd.DataFrame(vectorizer.fit_transform(corpus).toarray())
# vect_df['style_name'] = small_dataset['style_name']

In [36]:
vocabulary = {}
for i, st in enumerate(set(small_dataset.style_name)):
    vocabulary[st] = i

y = pd.Series([vocabulary[st] for st in list(small_dataset.style_name)])

In [38]:
vect_df['style_name'] = y

In [39]:
vect_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,377,378,379,380,381,382,383,384,385,style_name
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,74
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,81
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,77
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,52
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [4]:
big_dataset, small_dataset = train_test_split(df, test_size=0.017, random_state=110)

In [6]:
NGRAM_RANGE = (2, 4)

def select_features_rf(tfidf, response, feature_names, nfeatures):
    '''Select features using feature importance from Random Forest'''

    if nfeatures >= len(feature_names):
        return feature_names
    rf = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=5)
    rf_model = rf.fit(tfidf, response)
    feature_importances = np.argsort(rf_model.feature_importances_)
    feature_names = np.array(feature_names)
    feature_names = feature_names[feature_importances]
    return feature_names[-nfeatures:]


def select_features_chi2(tfidf, response, feature_names, nfeatures):
    '''Select features using Chi-squared correlations'''

    if nfeatures >= len(feature_names):
        return feature_names
    feature_names_sorted = []
    for label in list(set(response)):
        features_chi2 = chi2(tfidf, response == label)[0]
        indices = np.argsort(features_chi2)
        fns = np.array(feature_names)
        fns = fns[indices][::-1]
        feature_names_sorted.append(fns)
    feature_names = set()
    for i in range(nfeatures):
        if len(feature_names) == nfeatures:
            break
        nf = [x[i] for x in feature_names_sorted]
        for n in nf:
            if len(feature_names) == nfeatures:
                break
            feature_names.add(n)
    return feature_names


def get_tfidf_features(strings,
                       response=None,
                       count_vectorizer=None,
                       tfidf_transformer=None,
                       nfeatures=None,
                       ngrams=NGRAM_RANGE,
                       feature_selector=None):
    '''Extract TF-IDF from reference strings'''

    if count_vectorizer is None:
        # fit and calculate features (train set mode)
        freq_nfeatures = None
        if feature_selector is None:
            freq_nfeatures = nfeatures
        count_vectorizer = CountVectorizer(preprocessor=tokens_to_classes,
                                           max_features=freq_nfeatures,
                                           ngram_range=ngrams)
        counts = count_vectorizer.fit_transform(strings)
        tfidf_transformer = TfidfTransformer()
        tfidf = tfidf_transformer.fit_transform(counts)
        if feature_selector is not None and nfeatures is not None \
                and response is not None:
            # feature selection
            feature_names = count_vectorizer.get_feature_names()
            if nfeatures < len(feature_names):
                feature_names = feature_selector(tfidf, response,
                                                 feature_names, nfeatures)
            count_vectorizer = CountVectorizer(preprocessor=tokens_to_classes,
                                               ngram_range=ngrams,
                                               vocabulary=feature_names)
            counts = count_vectorizer.fit_transform(strings)
            tfidf_transformer = TfidfTransformer()
            tfidf = tfidf_transformer.fit_transform(counts)
    else:
        # calculate features (test set mode)
        counts = count_vectorizer.transform(strings)
        tfidf = tfidf_transformer.transform(counts)
    return count_vectorizer, tfidf_transformer, tfidf


def get_features(strings,
                 response=None,
                 count_vectorizer=None,
                 tfidf_transformer=None,
                 nfeatures=None,
                 ngrams=NGRAM_RANGE,
                 feature_selector=None):
    '''Extract full feature vector from reference strings'''

    count_vectorizer, tfidf_transformer, features = \
        get_tfidf_features(strings, response=response, nfeatures=nfeatures,
                           count_vectorizer=count_vectorizer,
                           tfidf_transformer=tfidf_transformer,
                           ngrams=ngrams, feature_selector=feature_selector)
    lengths = [[len(s)] for s in strings]
    features = sp.hstack((features, sp.csr_matrix(lengths)))

    return count_vectorizer, tfidf_transformer, features

In [40]:
vect_df = vect_df.reset_index()
vect_df.columns = vect_df.columns.astype(str)

In [42]:
vect_df = vect_df.drop(['index'], axis=1)

In [43]:
vect_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,377,378,379,380,381,382,383,384,385,style_name
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,74
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,81
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,77
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,52
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [44]:
small_X,small_y = vect_df.drop(['style_name'], axis=1), vect_df.style_name

In [45]:
small_X_train, small_X_test, small_y_train, small_y_test = train_test_split(small_X, small_y, test_size=0.2)

In [None]:
big_X, big_y = big_dataset.drop(['style_name'], axis=1), big_dataset.style_name

In [46]:
import time
from math import ceil

sgd_model = LogisticRegression(solver='liblinear', multi_class='ovr',
            random_state=0, n_jobs=8)
start = time.time()
sgd_model.fit(small_X_train, small_y_train)
finish = time.time()
print(sgd_model.score(small_X_test, small_y_test))
print("Time = ", finish - start, "s =", ceil((finish - start) / 60), "min")



0.7131107412121805
Time =  1798.1568381786346 s = 30 min


In [13]:
np.any(np.isnan(small_X.to_numpy()))

False

In [None]:
print(f1_score(small_y_test, sgd_model.predict(small_X_test), average='macro'))
print(f1_score(big_y, sgd_model.predict(big_X), average='macro'))

In [2]:
df = pd.read_csv('2grams_bib_data.csv')

In [9]:
big_df, small_df = train_test_split(df, test_size=0.017, random_state=45)

In [10]:
small_X, small_y = small_df.drop(['style_name'], axis=1), small_df.style_name
big_X, big_y = big_df.drop(['style_name'], axis=1), big_df.style_name

In [11]:
small_df.to_csv('small_df.csv')

In [5]:
small_X_train, small_X_test, small_y_train, small_y_test = train_test_split(small_X, small_y, test_size=0.2)
small_X_train, small_X_val, small_y_train, small_y_val = train_test_split(small_X_train, small_y_train, test_size=0.25)
train_dataset = Pool(data=small_X_train, label=small_y_train)
test_data = Pool(data=small_X_test, label=small_y_test)

In [None]:
boost_model = CatBoostClassifier(iterations=250,
                           learning_rate=0.05,
                           depth=4,
                           task_type='GPU',                                                
                           loss_function='MultiClass',
                           eval_metric='Accuracy')
boost_model.fit(train_dataset, eval_set=(small_X_val, small_y_val))

In [6]:
boost_model = CatBoostClassifier()
boost_model.load_model('boost_model(2).cbm')

<catboost.core.CatBoostClassifier at 0x7f29fc6079d0>

In [7]:
print(accuracy_score(big_y, boost_model.predict(big_X)))
print(f1_score(big_y, boost_model.predict(big_X), average='macro'))

0.7059910484211966
0.7059910484211966


# Нейронная сеть sklearn

In [8]:
from sklearn.neural_network import MLPClassifier

In [18]:
clf = MLPClassifier(solver='adam', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=1, max_iter=10000, activation='relu')
clf.fit(small_X_train, small_y_train)