# import

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dill as pickle

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform

In [2]:
data = pd.read_csv("dataset_for_classification.csv")

In [3]:
from ast import literal_eval
data.scaler = data.scaler.apply(literal_eval)

In [4]:
for i in range(len(data)):
    data.scaler.iloc[i] = np.asarray(data.scaler.iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [5]:
data

Unnamed: 0,tokens,_id,vector,labels,scaler
0,"['disasters', 'north', 'china', 'plain', 'soil...",53e99784b7602d9701f3e133,[[ 2.68383441e-03]\n [ 1.61278725e-03]\n [ 7.1...,6,"[0.4734036922454834, 0.45486414432525635, 0.48..."
1,"['variations', 'deep', 'submicron', 'shift', '...",53e99784b7602d9701f3e15d,[[ 3.46962526e-03]\n [ 3.37738288e-03]\n [ 1.0...,0,"[0.538163423538208, 0.6303530931472778, 0.6656..."
2,"['constructing', 'cope', 'dynamically', 'chang...",53e99784b7602d9701f3eaf2,[[ 0.0031892 ]\n [ 0.00229762]\n [ 0.01008034]...,2,"[0.5150524973869324, 0.5229702591896057, 0.626..."
3,"['welcome', 'proceedings', 'foreword']",53e99784b7602d9701f3f8c1,[[ 2.34201993e-03]\n [ 1.24823477e-03]\n [ 4.3...,5,"[0.4452335834503174, 0.4186094403266907, 0.347..."
4,"['biomolecular', 'molecular', 'biology', 'cont...",53e99784b7602d9701f3f8c2,[[ 3.22724599e-03]\n [ 2.78100441e-03]\n [ 9.0...,4,"[0.5181881189346313, 0.5710432529449463, 0.576..."
...,...,...,...,...,...
17949,"['witnessed', 'explosive', 'exciting', 'growth...",53e99808b7602d970201b271,[[ 3.89132742e-03]\n [ 2.55690538e-03]\n [ 8.5...,4,"[0.5729172825813293, 0.5487566590309143, 0.551..."
17950,"['reducing', 'ambiguity', 'multimodal', 'conta...",53e99808b7602d970201b980,[[ 2.74068583e-03]\n [ 2.09732424e-03]\n [ 8.6...,4,"[0.47808900475502014, 0.5030513405799866, 0.55..."
17951,"['deal', 'czech', 'mwe', 'containing', 'moment...",53e99808b7602d970201b981,[[ 2.86594988e-03]\n [ 1.62041571e-03]\n [ 6.7...,6,"[0.48841243982315063, 0.4556227922439575, 0.46..."
17952,"['interested', 'layer', 'completely', 'portabl...",53e99808b7602d970201b982,[[ 2.53269635e-03]\n [ 2.72065308e-03]\n [ 7.8...,1,"[0.4609478712081909, 0.5650413632392883, 0.520..."


In [6]:
def random_search_lr(X_train: pd.DataFrame, y_train: np.ndarray, n_jobs: int = -1, verbose: int  = 2):
    logreg = LogisticRegression(multi_class='multinomial')
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    space = {
            "C": loguniform(1e-3, 10),
            "penalty": ['none', 'l1', 'l2', 'elasticnet'],
            "class_weight": ["balanced", 'none'],
            "solver": ["lbfgs", "sag", "saga", "newton-cg"]
            }
    search = RandomizedSearchCV(logreg, space, n_iter=100, n_jobs=n_jobs, cv=cv, verbose=verbose, random_state=1, refit='f1_weighted', scoring='f1_weighted')
    search.fit(X_train, y_train)
    print(f"best score: {search.best_score_}")
    print(f"best params: {search.best_params_}")
    return search.best_params_

In [7]:
def fit_lr_random(X_train: pd.DataFrame, y_train: np.ndarray, param_random: dict, n_jobs: int = -1):
    lr = LogisticRegression(**param_random, n_jobs=n_jobs, random_state=1)
    lr.fit(X_train, y_train)
    return lr

In [8]:
def fit_lr(X_train: pd.DataFrame, y_train: np.ndarray, n_jobs: int = -1):
    lr = LogisticRegression(n_jobs=n_jobs, random_state=1)
    lr.fit(X_train, y_train)
    return lr

In [9]:
def convert_to_array(X):
    l = [X[i] for i in range(X.shape[0])]
    return np.vstack(l)

In [10]:
def save_model(model, name_model: str):
#     logger.info(f"Save model {name_model}")
    pickle.dump(model, open(f"models/{name_model}.pkl", "wb"))

In [11]:
def predict(clf, X: pd.DataFrame):
    return clf.predict(X), clf.predict_proba(X)

In [34]:
def score(y: np.ndarray, y_pred: np.ndarray):
    print("accuracy: ", accuracy_score(y, y_pred))
    print("f1_macro: ", f1_score(y, y_pred, average='macro'))
    print("f1_micro: ", f1_score(y, y_pred, average='micro'))

In [14]:
X_train, X_test, y_train, y_test = train_test_split(data.scaler.values, data.labels.values, test_size=0.3, random_state=42)
X_train = convert_to_array(X_train)
# print(random_search_lr(X_train, y_train))
# rand_params = random_search_lr(X_train, y_train)
# lr_random = fit_lr_random(X_train, y_train, rand_params)
lr = fit_lr(X_train, y_train)

save_model(lr, "lr_optimal")

In [35]:
X_test = convert_to_array(X_test)
y_pred, y_pred_proba = predict(lr, X_test)
score(y_test, y_pred)

accuracy:  0.9606459996287359
f1_macro:  0.9272407230456963
f1_micro:  0.9606459996287359
