In [12]:
#!pip3 install pyforest # for automatic imports
#!pip3 install pandarallel # for parallel processing in pandas
#!pip3 install somajo
#!pip3 install dask
#!python3 -m spacy download de_core_news_md
#!pip3 install dask[dataframe]
#!python3 -m spacy download de_core_news_sm
#!pip3 install gensim
#!pip3 install spacy

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dask.dataframe as dd # extension to pandas for large files
import dask.array as da

import random
from pathlib import Path

import spacy
import spacy
from spacy.lang.de.examples import sentences 

# Load the small German spaCy pipeline with the listed components disabled.
# `nlp` is used by calc_doc_emb(), which only reads `nlp(text).vector`.
nlp = spacy.load("de_core_news_sm",disable=['tagger', 'parser', 'ner', 'lemmatizer', 'textcat'])

from pandarallel import pandarallel
# Initialise pandarallel with a progress bar; generate_features() relies on the
# `parallel_apply` method for its per-row embedding computation.
pandarallel.initialize(progress_bar=True)

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelBinarizer

# Gensim for word vectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

from somajo import SoMaJo # German tokenizer
# Module-level SoMaJo tokenizer (model "de_CMC"); used by tokenize().
tokenizer = SoMaJo("de_CMC")
# Declare paths
# NOTE(review): these are absolute, machine-specific paths and both point to the
# same directory; consider a configurable, relative data directory.
input_path = Path('/Users/christinasanchez/Downloads/court/')
output_path = Path('/Users/christinasanchez/Downloads/court/')

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [1]:
 # Load the sampled training CSV, using its first row as the header.
 # NOTE(review): this absolute path differs from `output_path`, where
 # sample_subset() writes sample_train.csv -- confirm which copy is intended.
 df = pd.read_csv(
             '/Users/christinasanchez/Documents/Cloudera/Public/MinisterioJusticia/court/sample_train.csv',
             header=0)
    

<IPython.core.display.Javascript object>

In [6]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,court_id,text
0,ZH_OG_001,Urteil Gutheissung/Abweisung Beschwerde Oberge...
1,ZH_OG_999,Organisationsmangel GmbH: Auflösung Handelsge...
2,ZH_OG_001,Urteil/Beschluss Berufung/Beschwerde (Grundfor...
3,ZH_OG_002,Beschluss Obergericht des Kantons Zürich III....
4,ZH_OG_004,Beschluss Obergericht des Kantons Zürich Prä...


In [5]:
# Use `html_clean` where it is present, otherwise fall back to `pdf_clean`.
# NOTE(review): the recorded df.head() output (In [6]) shows a `text` column and
# no `html_clean`/`pdf_clean`, so this cell (In [5]) was run out of order or on a
# different frame -- it is not re-runnable on the frame shown above.
df['text'] = np.where(df['html_clean'].notna(), df['html_clean'], df['pdf_clean']) # Combine columns into one easy to use text column
df = df.drop(['Unnamed: 0','html_clean', 'pdf_clean'], axis='columns') # drop old columns,

<IPython.core.display.Javascript object>

In [7]:
# Write the current frame to CSV without the index column.
# NOTE(review): absolute, machine-specific output path.
output_file='/Users/christinasanchez/Documents/Cloudera/Public/MinisterioJusticia/court/vizDataCourt.csv'
df.to_csv(output_file, index=False)

In [14]:
# since the entire dataset is very large: just sample a small random subset which we can explore much more easily

def sample_subset(input_file, output_file, p = 0.01, seed=None):
    """Write a random row sample of a CSV file to a new CSV file.

    The header row is always kept. Every data row is kept independently with
    probability ``p``, so the size of the sample is itself random.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to sample from.
    output_file : str or path-like
        Destination CSV; written without the index column.
    p : float, default 0.01
        Probability of keeping each data row.
    seed : optional
        When given, a private ``random.Random(seed)`` drives the sampling so
        the same rows are selected on every run. When ``None`` (default) the
        module-level ``random`` generator is used, exactly as before.
    """
    # A private generator keeps a seeded run from disturbing (or being
    # disturbed by) other users of the global `random` state.
    rng = random if seed is None else random.Random(seed)

    def _skip_row(i):
        # Row 0 is the header and is never skipped; a data row is skipped
        # when its draw from [0, 1) exceeds p.
        return i > 0 and rng.random() > p

    df = pd.read_csv(input_file, header=0, skiprows=_skip_row)
    df.to_csv(output_file, index=False)

# Create sampled copies of the train and validation CSVs (each row kept with
# probability 0.01) in `output_path`; generate_features() later reads them from there.
sample_subset(input_path/'train.csv', output_path / 'sample_train.csv', p=0.01)
sample_subset(input_path/'val.csv', output_path / 'sample_val.csv', p=0.01)


#!wget https://int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com/vectors.txt

def load_glove_model(file_path, kind='pydict'):
    """Load GloVe word vectors from a text file.

    Parameters
    ----------
    file_path : str or path-like
        GloVe vectors in text format: one word per line followed by its
        whitespace-separated vector components.
    kind : {'pydict', 'gensim'}, default 'pydict'
        'pydict' builds a plain ``dict`` mapping word -> ``np.ndarray``
        (embedding lookup only, no similarity queries).
        'gensim' converts the file to word2vec text format (written to
        ``output_path / "gensim_glove_vectors.txt"``) and loads it as gensim
        ``KeyedVectors``.

    Returns
    -------
    dict or KeyedVectors
        The loaded model, depending on ``kind``.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported values.
    """
    print("Loading Glove Model")
    if kind=='pydict': # faster to load, only provides embedding lookup (no similarity)
        gloveModel = {}
        # Explicit encoding: the vocabulary contains non-ASCII (German) words,
        # so the platform default must not decide how the file is decoded.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                splitLines = line.split()
                if not splitLines: # tolerate blank lines instead of raising IndexError
                    continue
                word = splitLines[0]
                gloveModel[word] = np.array([float(value) for value in splitLines[1:]])
    elif kind=='gensim': # takes longer to load. offers more functionalities
        # converting the glove vectors to word2vec vector format
        word_vectors_file = output_path / "gensim_glove_vectors.txt"
        # Convert the file that was passed in, not the notebook-level global.
        glove2word2vec(glove_input_file=file_path, word2vec_output_file=word_vectors_file)
        # loading the word vectors into gensim
        gloveModel = KeyedVectors.load_word2vec_format(word_vectors_file, binary=False)
        # NOTE(review): the word count below relies on KeyedVectors supporting
        # len() -- confirm for the installed gensim version.
    else:
        raise ValueError(f"Unknown kind {kind!r}; expected 'pydict' or 'gensim'")
    print(len(gloveModel)," words loaded!")
    return gloveModel

# GloVe vectors file, relative to the working directory (see the commented-out
# wget line above for its download URL).
glove_input_file = "vectors.txt"
# Load with the default kind ('pydict'); get_token_emb() looks tokens up in this object.
glove_model = load_glove_model(glove_input_file)


def tokenize(text):
    """Tokenize ``text`` with SoMaJo and return the word tokens of all sentences.

    Only tokens whose ``token_class`` is ``'regular'`` are kept, which removes
    non-words (emoticons, punctuation, etc.).

    Previously only the first sentence produced by the tokenizer was used and
    an input that produced no sentence raised ``IndexError``; now every
    sentence contributes and such input yields an empty list.

    Returns
    -------
    list of str
        Token texts in document order.
    """
    sentences = tokenizer.tokenize_text([text])
    return [
        token.text
        for sentence in sentences
        for token in sentence
        if token.token_class == 'regular' # remove non-words (emoticons, punctuation, etc.)
    ]

def get_token_emb(token):
    """Look up the embedding of ``token`` in the module-level ``glove_model``.

    Returns the stored embedding, or the sentinel ``False`` when the lookup
    raises ``KeyError`` (out-of-vocabulary token). Callers must test the
    result with ``is not False``.
    """
    try:
        vector = glove_model[token]
    except KeyError:
        # Out of vocabulary: signal "no embedding" so the caller can skip it.
        return False
    return vector

def calc_doc_emb(text, kind='spacy'):
    """
    Calculate a document embedding for ``text``.

    kind='spacy' (default)
        Return ``nlp(text).vector`` from the module-level spaCy pipeline.
    kind='glove'
        1. tokenize with somajo (``tokenize``)
        2. retrieve the glove embedding of every token (``get_token_emb``),
           skipping out-of-vocabulary tokens
        3. take the element-wise mean of the token embeddings
        If no token has an embedding the result is a scalar NaN. This is the
        same value as before, but returned explicitly instead of via numpy's
        "Mean of empty slice" warning. Callers that stack the results must
        handle this case.

    Raises
    ------
    ValueError
        If ``kind`` is neither 'spacy' nor 'glove' (such calls previously
        returned ``None`` silently).
    """
    if kind=='spacy':
        return nlp(text).vector
    if kind=='glove':
        looked_up = (get_token_emb(token) for token in tokenize(text))
        # get_token_emb uses the sentinel False for OOV tokens; identity check
        # is required because embeddings are arrays.
        embeddings = [emb for emb in looked_up if emb is not False]
        if not embeddings:
            return np.float64('nan')
        return np.mean(embeddings, axis=0)
    raise ValueError(f"Unknown kind {kind!r}; expected 'spacy' or 'glove'")

--2021-10-30 18:17:33--  https://int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com/vectors.txt
Resolving int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com (int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com)... 52.219.140.85
Connecting to int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com (int-emb-glove-de-wiki.s3.eu-central-1.amazonaws.com)|52.219.140.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3744610526 (3,5G) [text/plain]
Saving to: ‘vectors.txt’


2021-10-30 18:25:12 (7,78 MB/s) - ‘vectors.txt’ saved [3744610526/3744610526]

Loading Glove Model
1309281  words loaded!


In [15]:
def _print_frame_report(df):
    """Print a frame followed by its deep per-column memory usage."""
    print(df)
    print(df.memory_usage(deep=True))


def generate_features(input_path, output_path, split, nrows=None, chunksize=5000):
    """Compute document vectors for one split and pickle them chunk by chunk.

    Reads ``input_path / f'{split}.csv'`` in chunks because the files are too
    large to fit in memory. For every chunk the ``html_clean`` / ``pdf_clean``
    columns are merged into a ``text`` column (``html_clean`` where present,
    otherwise ``pdf_clean``), each text is embedded with
    ``calc_doc_emb(..., kind='spacy')`` via ``parallel_apply``, the text is
    dropped, and the chunk is written to
    ``output_path / f'{split}_{index}.pkl'``.

    Parameters
    ----------
    input_path, output_path : pathlib.Path
        Directories to read the CSV from and write the pickles to.
    split : str
        File stem of the CSV, e.g. ``'sample_train'``.
    nrows : int, optional
        If given, only the first ``nrows`` entries of the CSV are processed.
        This is useful for fast exploration of the data.
    chunksize : int, default 5000
        Rows per chunk; adjust depending on available RAM.
    """
    print("Started computation")

    csv_file_path = input_path / f'{split}.csv'
    #dtype={'court_id': 'category', 'html_clean': 'string', 'pdf_clean': 'string'}
    reader = pd.read_csv(csv_file_path, chunksize=chunksize, nrows=nrows)
    print(f"Reading data from {csv_file_path}")
    try:
        for index, df in enumerate(reader):
            print(f"Processing chunk {index}")
            _print_frame_report(df)

            df['text'] = np.where(df['html_clean'].notna(), df['html_clean'], df['pdf_clean']) # Combine columns into one easy to use text column
            df = df.drop(['html_clean', 'pdf_clean'], axis='columns') # drop old columns
            print('Combined columns into one easy to use text column')
            _print_frame_report(df)

            df['vector'] = df['text'].parallel_apply(lambda x: calc_doc_emb(x, kind='spacy'))
            df = df.drop(['text'], axis='columns') # now we don't need the text column anymore
            print("Calculated the document embeddings (average token embeddings) for each text with spacy")
            _print_frame_report(df)

            print(df.dtypes)

            df.to_pickle(output_path / f'{split}_{index}.pkl')
            print("Generated features and saved them to pickle file.")
    finally:
        # Release the CSV file handle even if a chunk fails half-way.
        reader.close()
      
      
# Build features for both sampled splits. `output_path` is also the input
# directory here because sample_subset() wrote the sample CSVs there.
generate_features(output_path, output_path, 'sample_train')
generate_features(output_path, output_path, 'sample_val')

Started computation
Reading data from /Users/christinasanchez/Downloads/court/sample_train.csv
Processing chunk 0
      Unnamed: 0   court_id  \
0           1864  ZH_OG_001   
1           2149  ZH_OG_999   
2           2405  ZH_OG_001   
3           1344  ZH_OG_002   
4           2127  ZH_OG_004   
...          ...        ...   
2178        2045  ZH_VG_001   
2179        1185  ZH_VG_001   
2180        2470  ZH_VG_001   
2181         937  ZH_VG_001   
2182          30  ZH_VG_001   

                                             html_clean  \
0                                                   NaN   
1                                                   NaN   
2                                                   NaN   
3                                                   NaN   
4                                                   NaN   
...                                                 ...   
2178  Standard Suche | Erweiterte Suche | Hilfe Druc...   
2179  Standard Suche | Erweiterte Suche |

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=182), Label(value='0 / 182'))), HB…

Calculated the document embeddings (average token embeddings) for each text with spacy
      Unnamed: 0   court_id                                             vector
0           1864  ZH_OG_001  [0.45827118, 0.2816123, 0.54730785, -0.4775813...
1           2149  ZH_OG_999  [0.76452434, 0.22215457, 0.72017765, -0.763184...
2           2405  ZH_OG_001  [0.5883306, 0.28356117, 0.7010473, -0.6681037,...
3           1344  ZH_OG_002  [0.7846166, 0.21266416, 0.75216174, -0.9644105...
4           2127  ZH_OG_004  [0.72875327, 0.24049571, 0.6774721, -0.6587245...
...          ...        ...                                                ...
2178        2045  ZH_VG_001  [0.5881519, 0.268213, 0.59488183, -0.63250065,...
2179        1185  ZH_VG_001  [0.4848262, 0.24296758, 0.5766961, -0.5481863,...
2180        2470  ZH_VG_001  [0.52644277, 0.22152312, 0.46814537, -0.592404...
2181         937  ZH_VG_001  [0.43598703, 0.31691647, 0.5527676, -0.5516175...
2182          30  ZH_VG_001  [0.7374618, 0.2

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27), Label(value='0 / 27'))), HBox…

Calculated the document embeddings (average token embeddings) for each text with spacy
     Unnamed: 0   court_id                                             vector
0          1636  ZH_OG_001  [0.7186135, 0.35360232, 0.75004244, -0.7550744...
1          1157  ZH_OG_001  [0.65169185, 0.059517905, 0.75396776, -0.59514...
2          2390  ZH_OG_005  [0.73152983, 0.20796597, 0.68253386, -0.887161...
3          1297  ZH_OG_001  [0.56677413, 0.0873941, 0.7364303, -0.4972157,...
4           766  ZH_OG_005  [0.47398534, 0.11673667, 0.53369254, -0.469188...
..          ...        ...                                                ...
309         160  ZH_VG_001  [0.57516384, 0.31631845, 0.6069203, -0.5810599...
310        1316  ZH_VG_001  [0.578471, 0.30918786, 0.60536075, -0.61299664...
311        1927  ZH_VG_001  [0.59990567, 0.21445571, 0.73326296, -0.686135...
312         137  ZH_VG_001  [0.4124228, 0.11086403, 0.5604477, -0.5717464,...
313         479  ZH_VG_001  [0.5567578, 0.3148673, 0.56

In [17]:
# Load the first feature chunk (index 0) of each split written by generate_features().
# NOTE(review): only chunk 0 is read; further chunks (`*_1.pkl`, ...) would be
# ignored if a split ever exceeds one chunk.
sample_train = pd.read_pickle(output_path / 'sample_train_0.pkl')
sample_val = pd.read_pickle(output_path / 'sample_val_0.pkl')
# Display both frames as the cell output.
sample_train, sample_val

(      Unnamed: 0   court_id                                             vector
 0           1864  ZH_OG_001  [0.45827118, 0.2816123, 0.54730785, -0.4775813...
 1           2149  ZH_OG_999  [0.76452434, 0.22215457, 0.72017765, -0.763184...
 2           2405  ZH_OG_001  [0.5883306, 0.28356117, 0.7010473, -0.6681037,...
 3           1344  ZH_OG_002  [0.7846166, 0.21266416, 0.75216174, -0.9644105...
 4           2127  ZH_OG_004  [0.72875327, 0.24049571, 0.6774721, -0.6587245...
 ...          ...        ...                                                ...
 2178        2045  ZH_VG_001  [0.5881519, 0.268213, 0.59488183, -0.63250065,...
 2179        1185  ZH_VG_001  [0.4848262, 0.24296758, 0.5766961, -0.5481863,...
 2180        2470  ZH_VG_001  [0.52644277, 0.22152312, 0.46814537, -0.592404...
 2181         937  ZH_VG_001  [0.43598703, 0.31691647, 0.5527676, -0.5516175...
 2182          30  ZH_VG_001  [0.7374618, 0.26846302, 0.59275675, -0.6844454...
 
 [2183 rows x 3 columns],
      Unname

In [18]:
# Distinct court labels present in the training and validation samples.
sample_train['court_id'].unique(), sample_val['court_id'].unique()

(array(['ZH_OG_001', 'ZH_OG_999', 'ZH_OG_002', 'ZH_OG_004', 'ZH_OG_005',
        'TG_OG_001', 'CH_BSTG_001', 'BS_APG_001', 'BS_SVG_001',
        'SO_VSG_001', 'AR_KG_005', 'ZG_VG_004', 'SH_OG_001', 'AG_SRG_001',
        'AG_VG_001', 'AR_OG_004', 'GL_VG_001', 'AG_RGAR_001', 'AR_OG_003',
        'AG_OG_005', 'AG_HG_001', 'AR_OG_002', 'GL_OG_001', 'AG_PRG_001',
        'LU_KG_999', 'GR_VG_003', 'LU_KG_001', 'LU_KG_003', 'LU_RR_001',
        'GR_VG_005', 'GR_KG_007', 'GR_VG_002', 'GR_VG_004', 'GR_KG_006',
        'GR_VG_001', 'GR_KG_001', 'GR_KG_003', 'GR_KG_999', 'GR_VG_006',
        'GR_KG_005', 'GR_KG_004', 'GR_KG_002', 'BL_KG_001', 'BL_SG_001',
        'CH_BGE_005', 'CH_BGE_003', 'CH_BGE_002', 'CH_BGE_004',
        'CH_BGE_007', 'CH_BGE_001', 'CH_BGE_006', 'SG_VSG_001',
        'SG_VG_001', 'SG_VWEK_001', 'BE_SRK_001', 'SG_HG_001', 'SG_KG_001',
        'BE_OG_001', 'SG_KG_002', 'BE_OG_008', 'BE_OG_005', 'OW_VB_001',
        'VS_TC_001', 'OW_OG_001', 'OW_VG_001', 'UR_REB_001', 'CH_BVGE_

In [19]:
def get_X_and_y(df):
    """Split a feature frame into a feature matrix and a label array.

    Reads the per-row arrays in ``df['vector']`` and the labels in
    ``df['court_id']``.

    Returns
    -------
    (X, y)
        ``X`` is the row vectors stacked into one array; ``y`` is the
        ``court_id`` column as a numpy array.
    """
    # The column is an object array of arrays; np.stack turns it into a proper
    # 2-D array (see https://stackoverflow.com/questions/50971123/converty-numpy-array-of-arrays-to-2d-array).
    features = np.stack(df['vector'].to_numpy())
    labels = df['court_id'].to_numpy()
    return features, labels

# Feature matrices and label arrays for the training and validation samples.
X_train, y_train = get_X_and_y(sample_train)
X_val, y_val = get_X_and_y(sample_val)

In [20]:
# Fit a label binarizer on the training labels.
# NOTE(review): the variable is named `le` but this is a LabelBinarizer, so the
# transformed targets are indicator rows rather than integer codes; confirm
# this is intended for the classifier fitted below.
le = LabelBinarizer()
# NOTE(review): a random sample of the training data is not guaranteed to
# contain every class that occurs in the validation sample -- verify.
le.fit(y_train) # fit on y_train because these should contain all classes
# Show the classes learned by the binarizer.
list(le.classes_)

['AG_HG_001',
 'AG_OG_005',
 'AG_PRG_001',
 'AG_RGAR_001',
 'AG_SRG_001',
 'AG_VG_001',
 'AR_KG_005',
 'AR_OG_002',
 'AR_OG_003',
 'AR_OG_004',
 'BE_OG_001',
 'BE_OG_005',
 'BE_OG_008',
 'BE_SRK_001',
 'BE_VG_001',
 'BL_KG_001',
 'BL_SG_001',
 'BS_APG_001',
 'BS_SVG_001',
 'CH_BGE_001',
 'CH_BGE_002',
 'CH_BGE_003',
 'CH_BGE_004',
 'CH_BGE_005',
 'CH_BGE_006',
 'CH_BGE_007',
 'CH_BGer_001',
 'CH_BGer_002',
 'CH_BGer_004',
 'CH_BGer_005',
 'CH_BGer_008',
 'CH_BGer_009',
 'CH_BGer_010',
 'CH_BGer_011',
 'CH_BGer_016',
 'CH_BSTG_001',
 'CH_BVGE_001',
 'FR_TC_001',
 'FR_TC_004',
 'FR_TC_005',
 'FR_TC_006',
 'FR_TC_007',
 'FR_TC_011',
 'GL_OG_001',
 'GL_VG_001',
 'GR_KG_001',
 'GR_KG_002',
 'GR_KG_003',
 'GR_KG_004',
 'GR_KG_005',
 'GR_KG_006',
 'GR_KG_007',
 'GR_KG_999',
 'GR_VG_001',
 'GR_VG_002',
 'GR_VG_003',
 'GR_VG_004',
 'GR_VG_005',
 'GR_VG_006',
 'LU_KG_001',
 'LU_KG_003',
 'LU_KG_999',
 'LU_RR_001',
 'OW_OG_001',
 'OW_VB_001',
 'OW_VG_001',
 'SG_HG_001',
 'SG_KG_001',
 'SG_KG_002'

In [21]:
# Replace the string labels with their binarized form.
# NOTE: this rebinds y_train / y_val, so the cell is not idempotent -- re-running
# it would pass already-transformed arrays to `le.transform`.
y_train = le.transform(y_train)
y_val = le.transform(y_val)

In [36]:
# Train an MLP on the document vectors. random_state is fixed, early stopping is
# off, and verbose=True prints the loss of every iteration (see the long log below).
mlp = MLPClassifier(learning_rate='adaptive',max_iter=10000,early_stopping=False, random_state=1, verbose=True)
mlp = mlp.fit(X_train, y_train)
# Display the fitted estimator.
mlp

Iteration 1, loss = 58.00735868
Iteration 2, loss = 40.49199608
Iteration 3, loss = 20.96320258
Iteration 4, loss = 9.43284240
Iteration 5, loss = 5.80721995
Iteration 6, loss = 4.90398219
Iteration 7, loss = 4.60426252
Iteration 8, loss = 4.45369145
Iteration 9, loss = 4.36376417
Iteration 10, loss = 4.30317954
Iteration 11, loss = 4.25584969
Iteration 12, loss = 4.21706055
Iteration 13, loss = 4.18188182
Iteration 14, loss = 4.14809049
Iteration 15, loss = 4.11289536
Iteration 16, loss = 4.07605976
Iteration 17, loss = 4.03470005
Iteration 18, loss = 3.99193432
Iteration 19, loss = 3.94946817
Iteration 20, loss = 3.90627513
Iteration 21, loss = 3.86630393
Iteration 22, loss = 3.82485156
Iteration 23, loss = 3.78621919
Iteration 24, loss = 3.74770770
Iteration 25, loss = 3.71236517
Iteration 26, loss = 3.67702440
Iteration 27, loss = 3.64102490
Iteration 28, loss = 3.60712579
Iteration 29, loss = 3.57526114
Iteration 30, loss = 3.54347842
Iteration 31, loss = 3.51388702
Iteration 32, 

Iteration 265, loss = 1.84862975
Iteration 266, loss = 1.84492317
Iteration 267, loss = 1.84273560
Iteration 268, loss = 1.83840382
Iteration 269, loss = 1.83420828
Iteration 270, loss = 1.83039236
Iteration 271, loss = 1.82539137
Iteration 272, loss = 1.82501911
Iteration 273, loss = 1.81875783
Iteration 274, loss = 1.81733072
Iteration 275, loss = 1.81287324
Iteration 276, loss = 1.80776532
Iteration 277, loss = 1.80379559
Iteration 278, loss = 1.80062189
Iteration 279, loss = 1.79829608
Iteration 280, loss = 1.79502634
Iteration 281, loss = 1.79300358
Iteration 282, loss = 1.78796569
Iteration 283, loss = 1.78427981
Iteration 284, loss = 1.77970030
Iteration 285, loss = 1.77822730
Iteration 286, loss = 1.77496748
Iteration 287, loss = 1.77113820
Iteration 288, loss = 1.76756720
Iteration 289, loss = 1.76428897
Iteration 290, loss = 1.75755283
Iteration 291, loss = 1.75679744
Iteration 292, loss = 1.75364829
Iteration 293, loss = 1.74883952
Iteration 294, loss = 1.74886356
Iteration 

Iteration 525, loss = 1.13776148
Iteration 526, loss = 1.13670957
Iteration 527, loss = 1.13404934
Iteration 528, loss = 1.13077251
Iteration 529, loss = 1.12747131
Iteration 530, loss = 1.12644147
Iteration 531, loss = 1.12693622
Iteration 532, loss = 1.12483894
Iteration 533, loss = 1.11938903
Iteration 534, loss = 1.11971743
Iteration 535, loss = 1.11675457
Iteration 536, loss = 1.11472704
Iteration 537, loss = 1.11434144
Iteration 538, loss = 1.11191298
Iteration 539, loss = 1.11059506
Iteration 540, loss = 1.10740733
Iteration 541, loss = 1.10509867
Iteration 542, loss = 1.10396659
Iteration 543, loss = 1.10300446
Iteration 544, loss = 1.10089454
Iteration 545, loss = 1.09876539
Iteration 546, loss = 1.09544386
Iteration 547, loss = 1.09456133
Iteration 548, loss = 1.09187542
Iteration 549, loss = 1.09078905
Iteration 550, loss = 1.09266500
Iteration 551, loss = 1.08697848
Iteration 552, loss = 1.08695959
Iteration 553, loss = 1.08269660
Iteration 554, loss = 1.08241197
Iteration 

Iteration 780, loss = 0.75677216
Iteration 781, loss = 0.75639698
Iteration 782, loss = 0.75581149
Iteration 783, loss = 0.75287227
Iteration 784, loss = 0.75531244
Iteration 785, loss = 0.75287662
Iteration 786, loss = 0.75200753
Iteration 787, loss = 0.74787417
Iteration 788, loss = 0.74690540
Iteration 789, loss = 0.74493682
Iteration 790, loss = 0.74592018
Iteration 791, loss = 0.74433877
Iteration 792, loss = 0.74120595
Iteration 793, loss = 0.74105137
Iteration 794, loss = 0.74160539
Iteration 795, loss = 0.74182909
Iteration 796, loss = 0.73947494
Iteration 797, loss = 0.73903909
Iteration 798, loss = 0.73625813
Iteration 799, loss = 0.73745677
Iteration 800, loss = 0.73464472
Iteration 801, loss = 0.73533802
Iteration 802, loss = 0.73257344
Iteration 803, loss = 0.73206756
Iteration 804, loss = 0.73433346
Iteration 805, loss = 0.73170138
Iteration 806, loss = 0.72928098
Iteration 807, loss = 0.72958119
Iteration 808, loss = 0.72423621
Iteration 809, loss = 0.72987891
Iteration 

Iteration 1035, loss = 0.52644067
Iteration 1036, loss = 0.52681130
Iteration 1037, loss = 0.52593687
Iteration 1038, loss = 0.52453865
Iteration 1039, loss = 0.52487632
Iteration 1040, loss = 0.52441323
Iteration 1041, loss = 0.52365207
Iteration 1042, loss = 0.52194034
Iteration 1043, loss = 0.52057260
Iteration 1044, loss = 0.51961914
Iteration 1045, loss = 0.52092944
Iteration 1046, loss = 0.51917686
Iteration 1047, loss = 0.51860345
Iteration 1048, loss = 0.51637489
Iteration 1049, loss = 0.51751086
Iteration 1050, loss = 0.51694367
Iteration 1051, loss = 0.51720341
Iteration 1052, loss = 0.51576002
Iteration 1053, loss = 0.51567558
Iteration 1054, loss = 0.51233970
Iteration 1055, loss = 0.51175091
Iteration 1056, loss = 0.51340452
Iteration 1057, loss = 0.51257447
Iteration 1058, loss = 0.51416038
Iteration 1059, loss = 0.51139281
Iteration 1060, loss = 0.51003458
Iteration 1061, loss = 0.51027262
Iteration 1062, loss = 0.50811429
Iteration 1063, loss = 0.50747103
Iteration 1064

Iteration 1285, loss = 0.38425686
Iteration 1286, loss = 0.38160810
Iteration 1287, loss = 0.37801873
Iteration 1288, loss = 0.37851311
Iteration 1289, loss = 0.37788361
Iteration 1290, loss = 0.37643153
Iteration 1291, loss = 0.37746286
Iteration 1292, loss = 0.37765970
Iteration 1293, loss = 0.37597228
Iteration 1294, loss = 0.37397873
Iteration 1295, loss = 0.37458308
Iteration 1296, loss = 0.37442648
Iteration 1297, loss = 0.37374872
Iteration 1298, loss = 0.37235899
Iteration 1299, loss = 0.37203177
Iteration 1300, loss = 0.37177592
Iteration 1301, loss = 0.37351906
Iteration 1302, loss = 0.37116414
Iteration 1303, loss = 0.36902422
Iteration 1304, loss = 0.37104132
Iteration 1305, loss = 0.37013454
Iteration 1306, loss = 0.36906820
Iteration 1307, loss = 0.37397155
Iteration 1308, loss = 0.37024460
Iteration 1309, loss = 0.36812406
Iteration 1310, loss = 0.36778147
Iteration 1311, loss = 0.37026030
Iteration 1312, loss = 0.36844448
Iteration 1313, loss = 0.36615152
Iteration 1314

Iteration 1535, loss = 0.28044093
Iteration 1536, loss = 0.28081348
Iteration 1537, loss = 0.28284947
Iteration 1538, loss = 0.28128147
Iteration 1539, loss = 0.27796634
Iteration 1540, loss = 0.27979006
Iteration 1541, loss = 0.27858524
Iteration 1542, loss = 0.27848079
Iteration 1543, loss = 0.27910315
Iteration 1544, loss = 0.27786434
Iteration 1545, loss = 0.27782255
Iteration 1546, loss = 0.27672039
Iteration 1547, loss = 0.27690756
Iteration 1548, loss = 0.27880188
Iteration 1549, loss = 0.27596026
Iteration 1550, loss = 0.27515797
Iteration 1551, loss = 0.27285032
Iteration 1552, loss = 0.27325215
Iteration 1553, loss = 0.27414923
Iteration 1554, loss = 0.27543244
Iteration 1555, loss = 0.27523178
Iteration 1556, loss = 0.27317009
Iteration 1557, loss = 0.27264372
Iteration 1558, loss = 0.27391619
Iteration 1559, loss = 0.27096422
Iteration 1560, loss = 0.27204007
Iteration 1561, loss = 0.27288616
Iteration 1562, loss = 0.27082121
Iteration 1563, loss = 0.27192928
Iteration 1564

Iteration 1777, loss = 0.21084102
Iteration 1778, loss = 0.21014643
Iteration 1779, loss = 0.20804945
Iteration 1780, loss = 0.20876664
Iteration 1781, loss = 0.20837879
Iteration 1782, loss = 0.20790456
Iteration 1783, loss = 0.20842724
Iteration 1784, loss = 0.20731378
Iteration 1785, loss = 0.20785444
Iteration 1786, loss = 0.20914704
Iteration 1787, loss = 0.20535566
Iteration 1788, loss = 0.20763057
Iteration 1789, loss = 0.20532388
Iteration 1790, loss = 0.20679624
Iteration 1791, loss = 0.20996934
Iteration 1792, loss = 0.20860897
Iteration 1793, loss = 0.20581439
Iteration 1794, loss = 0.20546625
Iteration 1795, loss = 0.20490807
Iteration 1796, loss = 0.20460780
Iteration 1797, loss = 0.20264082
Iteration 1798, loss = 0.20409483
Iteration 1799, loss = 0.20517488
Iteration 1800, loss = 0.20309480
Iteration 1801, loss = 0.20399849
Iteration 1802, loss = 0.20734189
Iteration 1803, loss = 0.20260084
Iteration 1804, loss = 0.20180408
Iteration 1805, loss = 0.20196504
Iteration 1806

MLPClassifier(learning_rate='adaptive', max_iter=10000, random_state=1,
              verbose=True)

In [37]:
# Score of the fitted model on the validation sample, as defined by MLPClassifier.score.
evaluation = mlp.score(X_val, y_val)
evaluation

0.5191082802547771

In [39]:
# Spot check on a single validation row (index 21): predicted vs. true label,
# both mapped back to court ids.
print(le.inverse_transform(mlp.predict(X_val[21:22])))
print(le.inverse_transform(y_val[21:22]))

['ZH_OG_001']
['ZH_OG_001']


In [44]:
# Inspect the validation feature matrix.
X_val

array([[ 0.7186135 ,  0.35360232,  0.75004244, ..., -0.3740786 ,
         0.31135887,  0.07832725],
       [ 0.65169185,  0.05951791,  0.75396776, ..., -0.4885568 ,
         0.26252186,  0.14312756],
       [ 0.73152983,  0.20796597,  0.68253386, ..., -0.6015135 ,
         0.36704814,  0.23357576],
       ...,
       [ 0.59990567,  0.21445571,  0.73326296, ..., -0.432823  ,
         0.07673719,  0.02185293],
       [ 0.4124228 ,  0.11086403,  0.5604477 , ..., -0.3952022 ,
         0.19104524,  0.1784601 ],
       [ 0.5567578 ,  0.3148673 ,  0.5639209 , ..., -0.30253956,
         0.07803287,  0.3014823 ]], dtype=float32)

In [27]:
# Inspect the feature vector of the first validation row.
X_val[0]

array([ 0.7186135 ,  0.35360232,  0.75004244, -0.75507444,  0.5146347 ,
       -0.05929965, -0.48733556,  0.716396  ,  0.5585358 ,  0.03363845,
       -0.35495117,  0.52262396, -0.20538792, -0.29927608,  0.6596312 ,
       -0.16774978,  0.34819102,  0.10610703, -0.50939536, -0.8556565 ,
        0.19907139,  0.19906104, -0.08626774,  0.22503075, -0.795866  ,
        0.20554847,  0.12883528, -0.43877086,  0.64598906,  0.20402071,
       -0.17360894,  0.06908178,  0.10764643, -0.0537234 ,  1.004636  ,
       -0.07894274,  0.6849739 ,  0.4513347 ,  0.4727789 ,  0.4769564 ,
       -0.8889142 ,  0.13964882, -0.410465  , -0.36611572,  0.2543563 ,
       -0.8581458 ,  0.09685345,  0.1478914 ,  0.46103993, -0.5249954 ,
        0.09527186, -0.6426263 , -0.79466105, -0.22370137, -0.1492239 ,
        0.3134825 ,  0.13484208,  0.2063926 , -0.62592036,  0.24027076,
       -0.23078752, -0.4737579 ,  0.22236241, -1.1879982 , -0.10915256,
        0.727845  , -0.5935705 , -0.36906448,  0.5339456 ,  0.81

In [1]:
# Persist the label binarizer to `le.pickle` in the working directory.
# NOTE(review): the recorded output is a NameError -- this cell was executed
# (In [1]) in a kernel where `le` did not exist; it must run after `le` is fitted.
import pickle
with open('le.pickle', 'wb') as f:
    pickle.dump(le, f)

NameError: name 'le' is not defined

In [42]:
# Persist the trained classifier to `mlp.pickle` in the working directory.
import pickle
with open('mlp.pickle', 'wb') as f:
    pickle.dump(mlp, f)