In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier, SGDRegressor, LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from catboost import Pool, CatBoostClassifier

In [2]:
# датасет Доминики
df = pd.read_csv('bib_data_union_v4.csv.zip', compression='zip')

In [11]:
# наш датасет
df = pd.read_csv('bib_data_union_v3.csv.zip', compression='zip')

In [9]:
df = small_dataset

corpus = df.tokenized_record
vectorizer = CountVectorizer(tokenizer=lambda txt: txt.split(), ngram_range=(1, 2))
vect_df = pd.DataFrame(vectorizer.fit_transform(corpus).toarray())
vect_df['style_name'] = df['style_name']



In [9]:
vect_df.to_csv('2grams_bib_data.csv')

In [10]:
vect_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,376,377,378,379,380,381,382,383,384,style_name
0,2,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
2,2,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,


In [3]:
big_dataset, small_dataset = train_test_split(df, test_size=0.017, random_state=201)

In [9]:
small_dataset.to_csv('small_vect_df.csv')

In [8]:
NGRAM_RANGE = (2, 4)

def select_features_rf(tfidf, response, feature_names, nfeatures):
    '''Select features using feature importance from Random Forest'''

    if nfeatures >= len(feature_names):
        return feature_names
    rf = RandomForestClassifier(n_estimators=200, max_depth=3, random_state=5)
    rf_model = rf.fit(tfidf, response)
    feature_importances = np.argsort(rf_model.feature_importances_)
    feature_names = np.array(feature_names)
    feature_names = feature_names[feature_importances]
    return feature_names[-nfeatures:]


def select_features_chi2(tfidf, response, feature_names, nfeatures):
    '''Select features using Chi-squared correlations'''

    if nfeatures >= len(feature_names):
        return feature_names
    feature_names_sorted = []
    for label in list(set(response)):
        features_chi2 = chi2(tfidf, response == label)[0]
        indices = np.argsort(features_chi2)
        fns = np.array(feature_names)
        fns = fns[indices][::-1]
        feature_names_sorted.append(fns)
    feature_names = set()
    for i in range(nfeatures):
        if len(feature_names) == nfeatures:
            break
        nf = [x[i] for x in feature_names_sorted]
        for n in nf:
            if len(feature_names) == nfeatures:
                break
            feature_names.add(n)
    return feature_names


def get_tfidf_features(strings,
                       response=None,
                       count_vectorizer=None,
                       tfidf_transformer=None,
                       nfeatures=None,
                       ngrams=NGRAM_RANGE,
                       feature_selector=None):
    '''Extract TF-IDF from reference strings'''

    if count_vectorizer is None:
        # fit and calculate features (train set mode)
        freq_nfeatures = None
        if feature_selector is None:
            freq_nfeatures = nfeatures
        count_vectorizer = CountVectorizer(preprocessor=tokens_to_classes,
                                           max_features=freq_nfeatures,
                                           ngram_range=ngrams)
        counts = count_vectorizer.fit_transform(strings)
        tfidf_transformer = TfidfTransformer()
        tfidf = tfidf_transformer.fit_transform(counts)
        if feature_selector is not None and nfeatures is not None \
                and response is not None:
            # feature selection
            feature_names = count_vectorizer.get_feature_names()
            if nfeatures < len(feature_names):
                feature_names = feature_selector(tfidf, response,
                                                 feature_names, nfeatures)
            count_vectorizer = CountVectorizer(preprocessor=tokens_to_classes,
                                               ngram_range=ngrams,
                                               vocabulary=feature_names)
            counts = count_vectorizer.fit_transform(strings)
            tfidf_transformer = TfidfTransformer()
            tfidf = tfidf_transformer.fit_transform(counts)
    else:
        # calculate features (test set mode)
        counts = count_vectorizer.transform(strings)
        tfidf = tfidf_transformer.transform(counts)
    return count_vectorizer, tfidf_transformer, tfidf


def get_features(strings,
                 response=None,
                 count_vectorizer=None,
                 tfidf_transformer=None,
                 nfeatures=None,
                 ngrams=NGRAM_RANGE,
                 feature_selector=None):
    '''Extract full feature vector from reference strings'''

    count_vectorizer, tfidf_transformer, features = \
        get_tfidf_features(strings, response=response, nfeatures=nfeatures,
                           count_vectorizer=count_vectorizer,
                           tfidf_transformer=tfidf_transformer,
                           ngrams=ngrams, feature_selector=feature_selector)
    lengths = [[len(s)] for s in strings]
    features = sp.hstack((features, sp.csr_matrix(lengths)))

    return count_vectorizer, tfidf_transformer, features

In [11]:
small_dataset = vect_df
small_X,small_y = small_dataset.drop(['style_name'], axis=1), small_dataset.style_name

In [12]:
small_X_train, small_X_test, small_y_train, small_y_test = train_test_split(small_X, small_y, test_size=0.3)

In [6]:
big_X, big_y = big_dataset.drop(['style_name'], axis=1), big_dataset.style_name

In [13]:
import time
from math import ceil

start = time.time()
sgd_model = LogisticRegression(solver='liblinear', multi_class='ovr',
            random_state=0, n_jobs=8)
sgd_model.fit(small_X_train, small_y_train)
finish = time.time()
# joblib.dump(sgd_model, 'dominika_model_our_ds.sav')
print(sgd_model.score(small_X_test, small_y_test))
# print(sgd_model.score(big_X, big_y))
print(finish - start, "s = ", ceil((finish - start) / 60) + "min")

ValueError: Input contains NaN

In [8]:
print(classification_report(small_y_test, sgd_model.predict(small_X_test)))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   IEEEannot       0.00      0.00      0.00       382
    IEEEtran       0.02      0.42      0.04       387
   IEEEtranN       0.02      0.00      0.00       370
   IEEEtranS       0.00      0.00      0.00       349
  IEEEtranSA       0.00      0.00      0.00       381
  IEEEtranSN       0.01      0.10      0.02       399
      JHEP-2       0.00      0.00      0.00       362
  aaai-named       0.00      0.00      0.00       340
    abstract       0.00      0.00      0.00       368
    acmtrans       0.00      0.00      0.00       331
      aichej       0.00      0.00      0.00       327
         aip       0.00      0.00      0.00       368
    alphanum       0.00      0.00      0.00       397
         ama       0.00      0.00      0.00       329
    amsalpha       0.00      0.00      0.00       262
    amsplain       0.00      0.00      0.00       270
    annotate       0.00      0.00      0.00       357
  annotation       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
print(classification_report(big_y, sgd_model.predict(big_X)))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   IEEEannot       0.00      0.00      0.00     68214
    IEEEtran       0.02      0.42      0.04     71885
   IEEEtranN       0.02      0.00      0.01     72192
   IEEEtranS       0.00      0.00      0.00     71399
  IEEEtranSA       0.00      0.00      0.00     73342
  IEEEtranSN       0.01      0.12      0.02     73041
      JHEP-2       0.00      0.00      0.00     69699
  aaai-named       0.00      0.00      0.00     68715
    abstract       0.50      0.00      0.00     68980
    acmtrans       0.00      0.00      0.00     70132
      aichej       0.00      0.00      0.00     67093
         aip       0.00      0.00      0.00     70589
    alphanum       0.00      0.00      0.00     68850
         ama       0.00      0.00      0.00     68383
    amsalpha       0.00      0.00      0.00     52899
    amsplain       0.00      0.00      0.00     53169
    annotate       0.00      0.00      0.00     69499
  annotation       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
df = pd.read_csv('2grams_bib_data.csv')

In [3]:
big_df, small_df = train_test_split(df, test_size=0.017, random_state=52)

In [None]:
small_X, small_y = small_df.drop(['style_name'], axis=1), small_df.style_name
big_X, big_y = big_df.drop(['style_name'], axis=1), big_df.style_name

In [17]:
small_df.to_csv('small_df_2.csv')

In [None]:
# small_X_train, small_X_test, small_y_train, small_y_test = train_test_split(small_X, small_y, test_size=0.3)
small_X_train, small_X_val, small_y_train, small_y_val = train_test_split(small_X_train, small_y_train, test_size=0.25)
train_dataset = Pool(data=small_X_train, label=small_y_train)
test_data = Pool(data=small_X_test, label=small_y_test)

In [None]:
boost_model = CatBoostClassifier(iterations=1000,
                           learning_rate=0.05,
                           depth=4,
                           task_type='GPU',                                                
                           loss_function='MultiClass',
                           eval_metric='Accuracy')
boost_model.fit(train_dataset, eval_set=(small_X_val, small_y_val))

In [None]:
print(classification_report(small_y_test, boost_model.predict(small_X_test)))

In [None]:
print(classification_report(big_y, boost_model.predict(big_X)))

In [13]:
boost_model = CatBoostClassifier()
boost_model.load_model('boost_model_50.cbm')

<catboost.core.CatBoostClassifier at 0x7fe517ad3910>