In [None]:
import datetime as dt
from pprint import pprint
from string import punctuation

import dataiku
from dataiku import pandasutils as pdu
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from pymystem3 import Mystem
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

nltk.download("stopwords")

In [None]:
# Display name for each class code (keyed by the code rendered as a string).
_CATEGORY_NAMES = {
    '0': 'Hospital',
    '1': 'COVID',
    '2': 'Retail',
    '3': 'Reimbursement',
    '4': 'Program',
}

# One fixed colour per category so that separate plots are directly comparable.
_CATEGORY_COLORS = {
    'Hospital': 'red',
    'COVID': 'yellow',
    'Retail': 'royalblue',
    'Reimbursement': 'greenyellow',
    'Program': 'lightseagreen',
}


def _label_to_key(value):
    """Render a class code as the string key used in ``_CATEGORY_NAMES``.

    Integral floats (``2.0``) are rendered as ``'2'`` so that float-typed code
    columns still resolve to a category name.
    """
    if isinstance(value, (float, np.floating)) and float(value).is_integer():
        return str(int(value))
    return str(value)


def plot_dim_red(model, features, labels, n_components=2, random_state=None):
    """Project ``features`` with PCA or t-SNE and scatter-plot the first two components.

    Parameters
    ----------
    model : str
        ``'PCA'`` or ``'TSNE'`` (case-sensitive).
    features : array-like, one row per sample
        Matrix handed to the reducer's ``fit_transform``.
    labels : array-like, one entry per row of ``features``
        Class codes; matched to ``features`` by position, not by index.
    n_components : int, default 2
        Number of components to compute. Only the first two are drawn.
        (scikit-learn's default t-SNE solver accepts at most 3.)
    random_state : int or None, default None
        Forwarded to the reducer; set it to make the t-SNE layout reproducible.

    Returns
    -------
    None on success (the figure is drawn on a new matplotlib figure), or the
    string ``"Error"`` when ``model`` is not recognised.

    Raises
    ------
    ValueError
        If fewer than two components come back, or if ``labels`` and
        ``features`` differ in length.
    """
    if model not in ('PCA', 'TSNE'):
        return "Error"

    if model == 'PCA':
        mod = PCA(n_components=n_components, random_state=random_state)
        title = "PCA decomposition"  # for the plot
    else:
        mod = TSNE(n_components=n_components, random_state=random_state)
        title = "t-SNE decomposition"

    # Fit and transform the features
    components = mod.fit_transform(features)
    if components.shape[1] < 2:
        raise ValueError("at least 2 components are required to draw the scatter plot")

    # Name the columns after however many components actually came back
    component_columns = ['PC{}'.format(i + 1) for i in range(components.shape[1])]
    df_full = pd.DataFrame(data=components, columns=component_columns)

    # Attach labels by position: a pandas Series with a non-default index would
    # otherwise be aligned on index and silently produce NaN rows.
    df_full['label'] = [_label_to_key(v) for v in np.asarray(labels).ravel()]

    # Unknown codes keep their raw string so they are still visible in the legend
    df_full['label_name'] = df_full['label'].map(lambda key: _CATEGORY_NAMES.get(key, key))

    # A dict palette tolerates missing categories; fall back to seaborn's default
    # palette if the data contains a label we have no colour for.
    present = set(df_full['label_name'])
    palette = _CATEGORY_COLORS if present <= set(_CATEGORY_COLORS) else None

    # Plot
    plt.figure(figsize=(10, 10))
    sns.scatterplot(x='PC1',
                    y='PC2',
                    hue="label_name",
                    data=df_full,
                    palette=palette,
                    alpha=.7).set_title(title)

# Lemmatizer and stop-word list shared by preprocess_text
mystem = Mystem()
russian_stopwords = stopwords.words("russian")

# Set versions for O(1) membership tests inside the per-token loop
_russian_stopword_set = frozenset(russian_stopwords)
_punctuation_chars = frozenset(punctuation)


def preprocess_text(text):
    """Lower-case and lemmatize ``text``, dropping stop words and punctuation.

    A token is discarded when it is a Russian stop word, or when — after
    stripping whitespace — it is empty or made up solely of ASCII punctuation
    characters. The original substring test (``token.strip() not in punctuation``)
    let runs such as ``"..."`` or ``"?!"`` through.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    kept = []
    for token in mystem.lemmatize(text.lower()):
        if token in _russian_stopword_set:
            continue
        # all() over an empty string is True, so whitespace-only tokens are dropped too
        if all(ch in _punctuation_chars for ch in token.strip()):
            continue
        kept.append(token)

    return " ".join(kept)

In [None]:
# --- Load -------------------------------------------------------------------
im_Teders = dataiku.Dataset("IM_Teders")
im_Teders_df = im_Teders.get_dataframe()

# --- Dose features ----------------------------------------------------------
# Package size is read from the "№<digits>" fragment of the form name;
# rows without such a fragment are treated as a package of 1.
package_size = im_Teders_df['Prod_RefStd_FormName'].str.extract(r'№(\d+)', expand=False)
im_Teders_df['Prod_Package'] = package_size.astype(float).fillna(1.0)
im_Teders_df['LotSpec_Doses'] = im_Teders_df['LotSpec_Num']
im_Teders_df['Prod_Doses'] = im_Teders_df['Prod_NumPack'] * im_Teders_df['Prod_Package']

# Divisor applied to the dose count to obtain the *_DOT columns.
# NOTE(review): presumably doses per day of therapy — confirm with the data owner.
dose_divisor_by_inn = {
    'Rivaroxaban': 1,
    'Warfarin': 2,
    'Dabigatran etexilate': 2,
    'Apixaban': 2,
}
dose_divisor = im_Teders_df['LotSpec_MNN_InnE'].map(dose_divisor_by_inn)
im_Teders_df['LotSpec_DOT'] = im_Teders_df['LotSpec_Doses'] / dose_divisor
im_Teders_df['Prod_DOT'] = im_Teders_df['Prod_Doses'] / dose_divisor

# --- Scope: contracts signed from 2019-12-01 for the four INNs above ---------
# Parse the date first so the comparison is chronological even if the column
# arrives as text.
im_Teders_df['Contr_SignDate'] = pd.to_datetime(im_Teders_df['Contr_SignDate'])
in_scope = (
    (im_Teders_df['Contr_SignDate'] >= '2019-12-01')
    & im_Teders_df['LotSpec_MNN_InnE'].isin(list(dose_divisor_by_inn))
)
# .copy() so the column assignments below write to an independent frame
im_Teders_df = im_Teders_df.loc[in_scope].copy()

im_Teders_df['Месяц_подписания_контракта'] = im_Teders_df['Contr_SignDate'].dt.month
im_Teders_df['Год_подписания_контракта'] = im_Teders_df['Contr_SignDate'].dt.year

# --- Text feature -----------------------------------------------------------
# All descriptive columns are concatenated into a single '*'-separated string.
text_columns = [
    'Lot_LotStructure', 'Lot_LotType', 'Lot_LotNm', 'Lot_SupplyReglament',
    'Lot_LotRegion', 'Lot_PlanName', 'Lot_PlanTVal', 'Lot_StorageLifeNM',
    'Lot_StorageLifeText', 'Tender_FormT_name',
    'Tender_ChannelFinanceNm', 'Tender_FO_Nm',
    'Tender_Budgets_Name', 'Tender_TenderDocReglament',
    'Tender_TendNm', 'Tender_NSI_Law',
    'Prod_Producer', 'Prod_Form',
    'Contr_BudgetChannelNm', 'Contr_Su_OrganizationName',
    'Contr_SingleCustomerReason',
    'Customer_OrgNmS', 'Customer_RegNm',
    'Customer_OrgTypeName',
    'Customer_ShottypeLPY', 'Customer_typeLPY', 'LotSpec_MNN_InnE',
]
df = im_Teders_df[text_columns]
raw_text = df.apply(lambda row: '*'.join(row.dropna().astype(str).values), axis=1)

# Literal (non-regex) replacements: line breaks and runs of 4 spaces become a
# single space, double quotes are removed.
for old, new in (("\r", " "), ("\n", " "), ("    ", " "), ('"', '')):
    raw_text = raw_text.str.replace(old, new, regex=False)
raw_text = raw_text.str.lower()

punctuation_signs = list("?:!.,;")
for punct_sign in punctuation_signs:
    # regex=False: '?' and '.' must be treated as plain characters
    raw_text = raw_text.str.replace(punct_sign, '', regex=False)

im_Teders_df['data'] = raw_text.apply(preprocess_text)

# --- Labels -----------------------------------------------------------------
category_codes = {
    'Hospital': 0,
    'COVID': 1,
    'Retail': 2,
    'Reimbursement': 3,
    'Program': 4
}

# 'Service' rows are treated as unlabelled: their Mark is blanked out.
im_Teders_df['Mark'] = im_Teders_df['Mark'].where(im_Teders_df['Mark'] != 'Service')
# .map leaves NaN for any mark outside category_codes, so such rows are
# excluded from training instead of leaking string labels into the target.
im_Teders_df['mark_Code'] = im_Teders_df['Mark'].map(category_codes)

df_with_mark_code = im_Teders_df[im_Teders_df['mark_Code'].notnull()].copy()
df_with_mark_code['mark_Code'] = df_with_mark_code['mark_Code'].astype(int)

# Every in-scope row gets a prediction later; mark_Code is a placeholder until then.
df_for_predict = im_Teders_df.assign(mark_Code='')

# Train only on labelled contracts signed from 2020-06-01 onwards.
df_for_training = df_with_mark_code[df_with_mark_code['Contr_SignDate'] >= '2020-06-01']

# Class balance of the training set
df_for_training.groupby('mark_Code').size()

In [None]:
# Hold out 15% of the labelled rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    df_for_training['data'],
    df_for_training['mark_Code'],
    test_size=.15,
    random_state=8,
)
# Text of every row that will receive a prediction later on.
X = df_for_predict['data']

# TF-IDF parameter selection
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

tfidf_settings = dict(
    encoding='utf-8',
    ngram_range=ngram_range,
    stop_words=None,
    lowercase=False,
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    norm='l2',
    sublinear_tf=True,
)
tfidf = TfidfVectorizer(**tfidf_settings)

# The vocabulary is learned on the training split only and reused for the test split.
features_train, labels_train = tfidf.fit_transform(X_train).toarray(), y_train
print(features_train.shape)
features_test, labels_test = tfidf.transform(X_test).toarray(), y_test
print(features_test.shape)

In [None]:
# For each category, list the TF-IDF terms with the highest chi-squared score
# against a one-vs-rest indicator of that category.

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back for older installations.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.asarray(_get_feature_names())

for Product, category_id in sorted(category_codes.items()):
    chi2_scores, _ = chi2(features_train, labels_train == category_id)
    # argsort is ascending, so the strongest terms end up at the tail
    feature_names = all_feature_names[np.argsort(chi2_scores)]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

In [None]:
# GradientBoosting: candidate values for the randomized hyper-parameter search.
# The lists are written inline so this cell does not rebind `max_features`,
# which the TF-IDF cell also uses (as an int).
random_grid = {
    'n_estimators': [200, 800],
    # 'auto' is no longer accepted by current scikit-learn and was a synonym of
    # 'sqrt' for classifiers; 'log2' is offered as a genuinely different option.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 40, None],
    'min_samples_split': [10, 30, 50],
    'min_samples_leaf': [1, 2, 4],
    'learning_rate': [.1, .5],
    'subsample': [.5, 1.],
}

pprint(random_grid)

In [None]:
# Base estimator whose hyper-parameters are tuned
gbc = GradientBoostingClassifier(random_state=8)

# Randomized search: 50 sampled configurations, 3-fold CV, scored on accuracy
random_search = RandomizedSearchCV(
    gbc,
    random_grid,
    n_iter=50,
    scoring='accuracy',
    cv=3,
    verbose=1,
    random_state=8,
)
random_search.fit(features_train, labels_train)

# Report the winning configuration and its mean cross-validated accuracy
print(
    random_search.best_params_,
    "",
    "The mean accuracy of a model with these hyperparameters is:",
    random_search.best_score_,
    sep="\n",
)

In [None]:
# Parameter grid narrowed down from the randomized-search result
# (a single candidate per hyper-parameter).
param_grid = dict(
    max_depth=[10],
    max_features=['sqrt'],
    min_samples_leaf=[2],
    min_samples_split=[50],
    n_estimators=[800],
    learning_rate=[.1],
    subsample=[1.],
)

# Base model
gbc = GradientBoostingClassifier(random_state=8)

# Explicit CV splitter so the splits can be seeded
# (GridSearchCV itself takes no random_state).
cv_sets = ShuffleSplit(n_splits=3, test_size=.33, random_state=8)

# Grid search scored on accuracy
grid_search = GridSearchCV(gbc, param_grid, scoring='accuracy', cv=cv_sets, verbose=1)
grid_search.fit(features_train, labels_train)

print(
    "The best hyperparameters from Grid Search are:",
    grid_search.best_params_,
    "",
    "The mean accuracy of a model with these hyperparameters is:",
    grid_search.best_score_,
    sep="\n",
)

In [None]:
# GridSearchCV refits the winning configuration on the whole training matrix
# (refit=True is its default), so best_estimator_ is already fitted; fitting
# the 800-tree model again here only cost time.
best_gbc = grid_search.best_estimator_
best_gbc

In [None]:
# Predictions of the tuned model on the held-out test features
gbc_pred = best_gbc.predict(features_test)

In [None]:
# Accuracy on the data the model was fitted on
train_accuracy = accuracy_score(labels_train, best_gbc.predict(features_train))
print("The training accuracy is: ", train_accuracy, sep="\n")

In [None]:
# Accuracy on the held-out split
test_accuracy = accuracy_score(labels_test, gbc_pred)
print("The test accuracy is: ", test_accuracy, sep="\n")

In [None]:
# Per-class precision / recall / F1 on the held-out split
report_text = classification_report(labels_test, gbc_pred)
print("Classification report", report_text, sep="\n")

In [None]:
# Confusion matrix on the held-out split.
# One row per class (name + code), ordered by code, used for tick labels.
aux_df = df_for_training[['Mark', 'mark_Code']].drop_duplicates().sort_values('mark_Code')

# Pass the class order explicitly so rows/columns always line up with the tick
# labels, even if some class is missing from the test split.
conf_matrix = confusion_matrix(labels_test, gbc_pred, labels=aux_df['mark_Code'].tolist())

fig, ax = plt.subplots(figsize=(12.8, 6))
sns.heatmap(conf_matrix,
            annot=True,
            xticklabels=aux_df['Mark'].values,
            yticklabels=aux_df['Mark'].values,
            cmap="Blues",
            fmt='g',
            ax=ax)

# scikit-learn puts the true classes on rows (y axis) and the predicted
# classes on columns (x axis); the original labels were swapped.
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion matrix')
plt.show()

In [None]:
# Baseline for comparison: untuned GradientBoosting with the same seed
base_model = GradientBoostingClassifier(random_state=8).fit(features_train, labels_train)
base_pred = base_model.predict(features_test)
accuracy_score(labels_test, base_pred)

In [None]:
# Tuned model on the same held-out split, for comparison with the baseline.
# best_gbc is already fitted on the training matrix (GridSearchCV refits its
# best estimator), so the extra .fit() that used to sit here was dropped.
accuracy_score(labels_test, best_gbc.predict(features_test))

In [None]:
# Stack train and test back together (train rows first) for the projection plot
features = np.vstack((features_train, features_test))
labels = np.hstack((labels_train, labels_test))

In [None]:
# 2-D t-SNE view of the labelled rows (train + test), coloured by their known class
plot_dim_red("TSNE",
             features=features,
             labels=labels,
             n_components=2)

In [None]:
# Vectorise every row selected for prediction (X) with the fitted TF-IDF
# vocabulary and predict its class.
# NOTE(review): this rebinds X_test and gbc_pred, which the evaluation cells
# above use for the held-out split; re-running those cells after this one will
# compare against the wrong data. Consider dedicated names here.
X_test = tfidf.transform(X).toarray()
gbc_pred = best_gbc.predict(X_test)

In [None]:
# 2-D t-SNE view of all predicted rows, coloured by the predicted class
print('GradientBoostingClassifier')
plot_dim_red("TSNE",
             features=X_test,
             labels=gbc_pred,
             n_components=2)

In [None]:
# Class code -> channel name (inverse of category_codes)
codes_category = {
    0:'Hospital',
    1:'COVID',
    2:'Retail',
    3:'Reimbursement',
    4:'Program'
}

# Price applied per unit of Prod_DOT, by predicted channel
dot_price_by_channel = {
    'Program': 52.2,
    'Reimbursement': 52.2,
    'COVID': 52.2,
    'Hospital': 68.7,
    'Retail': 68.7,
}

# Attach the predicted channel name to every row; codes without a name are kept as-is.
predicted_channel = pd.Series(gbc_pred, index=df_for_predict.index).map(
    lambda code: codes_category.get(code, code)
)
im_Teders_df = df_for_predict.assign(mark_Code=predicted_channel)

# NOTE(review): the original filter was `mark_Code != 'Варфарин'`, which can
# never be false because mark_Code holds channel names. The assumed intent is
# to leave Warfarin rows unpriced, so the INN column is tested instead —
# confirm with the data owner.
channel_price = im_Teders_df['mark_Code'].map(dot_price_by_channel)
priced = channel_price.notna() & (im_Teders_df['LotSpec_MNN_InnE'] != 'Warfarin')

if 'DoT_Price' not in im_Teders_df.columns:
    im_Teders_df['DoT_Price'] = np.nan
im_Teders_df.loc[priced, 'DoT_Price'] = channel_price[priced]
im_Teders_df['Summa_with_sale_PRC'] = im_Teders_df['DoT_Price']*im_Teders_df['Prod_DOT']
im_tenders_allmarks_df = im_Teders_df

# Write recipe outputs
im_tenders_allmarks = dataiku.Dataset("IM_tenders_allmarks")
im_tenders_allmarks.write_with_schema(im_tenders_allmarks_df)