### Перенесем данные в pandas DF для удобства работы

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import f1_score

In [2]:
train_dataset = pd.read_csv('../Train-dataset.csv')
train_dataset.head()

Unnamed: 0,WELL,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT,LITH_NAME,LITH_CODE
0,Well-1,0.8179,2.9814,1602.0,83.939,3.166,0.25,2.344,Marine,Marly sandstone,1200
1,Well-1,0.8179,2.9814,1602.1,84.166,3.135,0.246,2.352,Marine,Marly sandstone,1200
2,Well-1,0.8179,2.9814,1602.2,85.055,3.089,0.244,2.352,Marine,Marly sandstone,1200
3,Well-1,0.8179,2.9814,1602.3,86.352,3.042,0.242,2.355,Marine,Sandy marl,1300
4,Well-1,0.8179,2.9814,1602.4,87.614,3.003,0.241,2.369,Marine,Sandy marl,1300


In [3]:
validation_dataset = pd.read_csv('../Validation-dataset.csv')
validation_dataset.head()

Unnamed: 0,WELL,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT
0,Well-X,2.9956,0.0061,897.065,92.7094,4.51768,0.36366,2.2106,Continental
1,Well-X,2.9956,0.0061,897.165,93.9007,4.30683,0.36951,2.2036,Continental
2,Well-X,2.9956,0.0061,897.265,94.8446,3.98902,0.37914,2.1969,Continental
3,Well-X,2.9956,0.0061,897.365,95.6321,3.67028,0.3902,2.1923,Continental
4,Well-X,2.9956,0.0061,897.465,96.47,3.63431,0.40098,2.1926,Continental


### Очевидно, что название колонн не информативный признак т.к. никак не относится к значениям приборов или географическим положениям, тем более на валидационном ДС он совсем другой. Уберем его из ДС. Аналогично рассуждая уберем и LITH_NAME

In [4]:
train_dataset = train_dataset.drop(['WELL', 'LITH_NAME'], axis=1)
train_dataset.head()

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT,LITH_CODE
0,0.8179,2.9814,1602.0,83.939,3.166,0.25,2.344,Marine,1200
1,0.8179,2.9814,1602.1,84.166,3.135,0.246,2.352,Marine,1200
2,0.8179,2.9814,1602.2,85.055,3.089,0.244,2.352,Marine,1200
3,0.8179,2.9814,1602.3,86.352,3.042,0.242,2.355,Marine,1300
4,0.8179,2.9814,1602.4,87.614,3.003,0.241,2.369,Marine,1300


In [5]:
validation_dataset = validation_dataset.drop(['WELL'], axis=1)
validation_dataset.head()

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT
0,2.9956,0.0061,897.065,92.7094,4.51768,0.36366,2.2106,Continental
1,2.9956,0.0061,897.165,93.9007,4.30683,0.36951,2.2036,Continental
2,2.9956,0.0061,897.265,94.8446,3.98902,0.37914,2.1969,Continental
3,2.9956,0.0061,897.365,95.6321,3.67028,0.3902,2.1923,Continental
4,2.9956,0.0061,897.465,96.47,3.63431,0.40098,2.1926,Continental


###  Очевидно, что DEPOSITIONAL_ENVIRONMENT категориальный признак и его нужно закодировать. Т.к. диспозиции всего 3, то можно применить one-hot-кодирование при этом матрица всё еще не будет сильно разреженной

In [6]:
ohe = OneHotEncoder(sparse=False)
new_ohe_features = ohe.fit_transform(train_dataset.DEPOSITIONAL_ENVIRONMENT.values.reshape(-1,1))
tmp = pd.DataFrame(new_ohe_features, columns=['ENV' + 
                                             str(i) for i in range(new_ohe_features.shape[1])])
train_dataset = pd.concat([train_dataset, tmp], axis=1)
train_dataset = train_dataset.drop(['DEPOSITIONAL_ENVIRONMENT'], axis=1)
cols = train_dataset.columns.tolist()
cols = cols[0:7] + cols[8:11] + cols[7:8]
train_dataset = train_dataset[cols]
train_dataset

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
0,0.8179,2.9814,1602.0,83.9390,3.1660,0.25000,2.3440,0.0,1.0,0.0,1200
1,0.8179,2.9814,1602.1,84.1660,3.1350,0.24600,2.3520,0.0,1.0,0.0,1200
2,0.8179,2.9814,1602.2,85.0550,3.0890,0.24400,2.3520,0.0,1.0,0.0,1200
3,0.8179,2.9814,1602.3,86.3520,3.0420,0.24200,2.3550,0.0,1.0,0.0,1300
4,0.8179,2.9814,1602.4,87.6140,3.0030,0.24100,2.3690,0.0,1.0,0.0,1300
...,...,...,...,...,...,...,...,...,...,...,...
45744,3.0000,0.0000,2275.2,103.6016,1.0789,0.29558,2.3783,0.0,1.0,0.0,400
45745,3.0000,0.0000,2275.3,102.8472,1.0683,0.29264,2.3651,0.0,1.0,0.0,400
45746,3.0000,0.0000,2275.4,102.5699,1.0790,0.29425,2.3531,0.0,1.0,0.0,400
45747,3.0000,0.0000,2275.5,102.7901,1.1045,0.30096,2.3430,0.0,1.0,0.0,400


In [7]:
new_ohe_features = ohe.fit_transform(validation_dataset.DEPOSITIONAL_ENVIRONMENT.values.reshape(-1,1))
tmp = pd.DataFrame(new_ohe_features, columns=['ENV' + 
                                             str(i) for i in range(new_ohe_features.shape[1])])
validation_dataset = pd.concat([validation_dataset, tmp], axis=1)
validation_dataset = validation_dataset.drop(['DEPOSITIONAL_ENVIRONMENT'], axis=1)
validation_dataset

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2
0,2.9956,0.0061,897.065,92.7094,4.51768,0.36366,2.2106,1.0,0.0,0.0
1,2.9956,0.0061,897.165,93.9007,4.30683,0.36951,2.2036,1.0,0.0,0.0
2,2.9956,0.0061,897.265,94.8446,3.98902,0.37914,2.1969,1.0,0.0,0.0
3,2.9956,0.0061,897.365,95.6321,3.67028,0.39020,2.1923,1.0,0.0,0.0
4,2.9956,0.0061,897.465,96.4700,3.63431,0.40098,2.1926,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
28992,1.1475,2.6893,1717.800,108.7532,2.18830,0.31751,2.4094,0.0,1.0,0.0
28993,1.1475,2.6893,1717.900,108.0650,2.21620,0.31582,2.4125,0.0,1.0,0.0
28994,1.1475,2.6893,1718.000,108.0439,2.25220,0.31304,2.4150,0.0,1.0,0.0
28995,1.1475,2.6893,1718.100,108.6799,2.30580,0.30963,2.4175,0.0,1.0,0.0


### С помощью кода со Stackoverflow проверим все ли признаки у нас присутвуют и нет ли у нас отсутсвующих значений
##### https://stackoverflow.com/questions/26266362/how-to-count-the-nan-values-in-a-column-in-pandas-dataframe/39734251#39734251

In [8]:
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
            " columns that have missing values.")
    return mis_val_table_ren_columns
missing_values_table(train_dataset)

Your selected dataframe has 11 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


### Посмотрим на корреляцию столбцов между собой

In [9]:
correlations_data = train_dataset.corr()
correlations_data

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
X,1.0,-0.948814,0.323526,0.013002,-0.007674,-0.152726,0.081157,-0.075523,0.296784,-0.160692,-0.051763
Y,-0.948814,1.0,-0.354612,-0.021383,0.011371,0.220553,-0.097019,0.103626,-0.295333,0.131841,0.028731
MD,0.323526,-0.354612,1.0,0.095127,-0.035173,-0.585465,0.642244,-0.768646,0.744005,0.168198,0.126418
GR,0.013002,-0.021383,0.095127,1.0,-0.026661,0.321995,0.196266,0.018632,0.2426,-0.210572,-0.181713
RT,-0.007674,0.011371,-0.035173,-0.026661,1.0,0.055892,-0.043238,0.025907,-0.014688,-0.013899,0.005351
CN,-0.152726,0.220553,-0.585465,0.321995,0.055892,1.0,-0.598572,0.457451,-0.402441,-0.132066,-0.161738
DEN,0.081157,-0.097019,0.642244,0.196266,-0.043238,-0.598572,1.0,-0.472726,0.5631,0.019835,0.078136
ENV0,-0.075523,0.103626,-0.768646,0.018632,0.025907,0.457451,-0.472726,1.0,-0.383709,-0.681699,-0.200552
ENV1,0.296784,-0.295333,0.744005,0.2426,-0.014688,-0.402441,0.5631,-0.383709,1.0,-0.414055,0.010254
ENV2,-0.160692,0.131841,0.168198,-0.210572,-0.013899,-0.132066,0.019835,-0.681699,-0.414055,1.0,0.18956


### Заметим, что у нас есть довольно сильно коррелирующие между собой признаки. Уберем их для понижения размерности матрицы. Попробуем обучать и без понижения размерности и с понижением размерности, затем выберем наилучший
#### Взяли реализацию 
#### https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on/43104383#43104383

In [10]:
def corr_df(x, corr_val):
    '''
    Obj: Drops features that are strongly correlated to other features.
          This lowers model complexity, and aids in generalizing the model.
    Inputs:
          df: features df (x)
          corr_val: Columns are dropped relative to the corr_val input (e.g. 0.8)
    Output: df that only includes uncorrelated features
    '''

    # Creates Correlation Matrix and Instantiates
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterates through Correlation Matrix Table to find correlated columns
    for i in iters:
        for j in range(i):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = item.values
            if val >= corr_val:
                # Prints the correlated feature set and the corr val
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(i)

    drops = sorted(set(drop_cols))[::-1]

    # Drops the correlated columns
    for i in drops:
        col = x.iloc[:, (i+1):(i+2)].columns.values
        df = x.drop(col, axis=1)

    return df

corr_train_dataset = corr_df(train_dataset, 0.6)
corr_train_dataset

DEN | MD | 0.64
ENV1 | MD | 0.74


Unnamed: 0,X,Y,MD,GR,RT,CN,ENV0,ENV1,ENV2,LITH_CODE
0,0.8179,2.9814,1602.0,83.9390,3.1660,0.25000,0.0,1.0,0.0,1200
1,0.8179,2.9814,1602.1,84.1660,3.1350,0.24600,0.0,1.0,0.0,1200
2,0.8179,2.9814,1602.2,85.0550,3.0890,0.24400,0.0,1.0,0.0,1200
3,0.8179,2.9814,1602.3,86.3520,3.0420,0.24200,0.0,1.0,0.0,1300
4,0.8179,2.9814,1602.4,87.6140,3.0030,0.24100,0.0,1.0,0.0,1300
...,...,...,...,...,...,...,...,...,...,...
45744,3.0000,0.0000,2275.2,103.6016,1.0789,0.29558,0.0,1.0,0.0,400
45745,3.0000,0.0000,2275.3,102.8472,1.0683,0.29264,0.0,1.0,0.0,400
45746,3.0000,0.0000,2275.4,102.5699,1.0790,0.29425,0.0,1.0,0.0,400
45747,3.0000,0.0000,2275.5,102.7901,1.1045,0.30096,0.0,1.0,0.0,400


### разобъём обучающий датасет на тренировочную и тестовую выборки с сохранением классов по LITH_CODE, отмасштабируем обучающую выборку на нулевое мат.ожидание и единичную дисперсию, затем применим это масштабирование к тестовой

In [11]:
#Разбиваем в первый раз на тест и обучение для масштабирования признаков только по обучающей выборке
X_corr = corr_train_dataset.iloc[:,0:-1]
y_corr = corr_train_dataset.iloc[:,-1]

X_corr_train, X_corr_test, y_corr_train, y_corr_test = train_test_split(X_corr, y_corr,
                                                                       train_size=0.67,
                                                                       random_state=6,
                                                                       stratify=y_corr)
X = train_dataset.iloc[:,0:-1]
y = train_dataset.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.67,
                                                    random_state=6,
                                                    stratify=y)

In [12]:
X_corr_train

Unnamed: 0,X,Y,MD,GR,RT,CN,ENV0,ENV1,ENV2
45566,3.0000,0.0000,2257.400,114.3310,1.49526,0.35375,0.0,1.0,0.0
43776,3.0000,0.0000,2078.400,95.1834,4.20314,0.25954,0.0,1.0,0.0
12212,0.7564,1.9833,1112.100,82.0926,6.03830,0.29113,1.0,0.0,0.0
39472,3.0000,0.0000,1648.000,83.1299,3.61452,0.29175,0.0,0.0,1.0
29800,3.0000,0.0000,680.800,74.3726,5.81049,0.39157,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
7300,0.6137,3.0000,1176.700,91.0963,3.23050,0.34849,0.0,0.0,1.0
34800,3.0000,0.0000,1180.800,98.1963,3.37287,0.33273,1.0,0.0,0.0
14313,0.7564,1.9833,1322.200,88.6570,3.49080,0.31260,0.0,0.0,1.0
9884,0.7564,1.9833,879.300,100.4485,3.17670,0.38264,1.0,0.0,0.0


In [13]:
X_train

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2
45566,3.0000,0.0000,2257.400,114.3310,1.49526,0.35375,2.312700,0.0,1.0,0.0
43776,3.0000,0.0000,2078.400,95.1834,4.20314,0.25954,2.445300,0.0,1.0,0.0
12212,0.7564,1.9833,1112.100,82.0926,6.03830,0.29113,2.138300,1.0,0.0,0.0
39472,3.0000,0.0000,1648.000,83.1299,3.61452,0.29175,2.294100,0.0,0.0,1.0
29800,3.0000,0.0000,680.800,74.3726,5.81049,0.39157,2.075900,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7300,0.6137,3.0000,1176.700,91.0963,3.23050,0.34849,2.302448,0.0,0.0,1.0
34800,3.0000,0.0000,1180.800,98.1963,3.37287,0.33273,2.215000,1.0,0.0,0.0
14313,0.7564,1.9833,1322.200,88.6570,3.49080,0.31260,2.243700,0.0,0.0,1.0
9884,0.7564,1.9833,879.300,100.4485,3.17670,0.38264,2.119400,1.0,0.0,0.0


In [14]:
scaler_corr = preprocessing.StandardScaler().fit(X_corr_train)
corr_train_dataset.iloc[:,0:-1] = scaler_corr.transform(corr_train_dataset.iloc[:,0:-1])
corr_train_dataset

Unnamed: 0,X,Y,MD,GR,RT,CN,ENV0,ENV1,ENV2,LITH_CODE
0,-0.837657,1.409368,0.808620,-0.058873,-0.029915,-1.251057,-0.794124,2.082126,-0.861181,1200
1,-0.837657,1.409368,0.808879,-0.044573,-0.030352,-1.303434,-0.794124,2.082126,-0.861181,1200
2,-0.837657,1.409368,0.809139,0.011430,-0.031001,-1.329622,-0.794124,2.082126,-0.861181,1200
3,-0.837657,1.409368,0.809398,0.093134,-0.031663,-1.355811,-0.794124,2.082126,-0.861181,1300
4,-0.837657,1.409368,0.809657,0.172634,-0.032213,-1.368905,-0.794124,2.082126,-0.861181,1300
...,...,...,...,...,...,...,...,...,...,...
45744,1.311023,-1.301154,2.553478,1.179770,-0.059345,-0.654222,-0.794124,2.082126,-0.861181,400
45745,1.311023,-1.301154,2.553737,1.132247,-0.059495,-0.692719,-0.794124,2.082126,-0.861181,400
45746,1.311023,-1.301154,2.553996,1.114778,-0.059344,-0.671637,-0.794124,2.082126,-0.861181,400
45747,1.311023,-1.301154,2.554255,1.128650,-0.058984,-0.583775,-0.794124,2.082126,-0.861181,400


In [15]:
scaler = preprocessing.StandardScaler().fit(X_train)
train_dataset.iloc[:,0:-1] = scaler.transform(train_dataset.iloc[:,0:-1])
train_dataset

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
0,-0.837657,1.409368,0.808620,-0.058873,-0.029915,-1.251057,0.933553,-0.794124,2.082126,-0.861181,1200
1,-0.837657,1.409368,0.808879,-0.044573,-0.030352,-1.303434,0.986150,-0.794124,2.082126,-0.861181,1200
2,-0.837657,1.409368,0.809139,0.011430,-0.031001,-1.329622,0.986150,-0.794124,2.082126,-0.861181,1200
3,-0.837657,1.409368,0.809398,0.093134,-0.031663,-1.355811,1.005873,-0.794124,2.082126,-0.861181,1300
4,-0.837657,1.409368,0.809657,0.172634,-0.032213,-1.368905,1.097918,-0.794124,2.082126,-0.861181,1300
...,...,...,...,...,...,...,...,...,...,...,...
45744,1.311023,-1.301154,2.553478,1.179770,-0.059345,-0.654222,1.159061,-0.794124,2.082126,-0.861181,400
45745,1.311023,-1.301154,2.553737,1.132247,-0.059495,-0.692719,1.072277,-0.794124,2.082126,-0.861181,400
45746,1.311023,-1.301154,2.553996,1.114778,-0.059344,-0.671637,0.993382,-0.794124,2.082126,-0.861181,400
45747,1.311023,-1.301154,2.554255,1.128650,-0.058984,-0.583775,0.926978,-0.794124,2.082126,-0.861181,400


In [16]:
# Разбиваем второй раз, чтобы получить правильно отмасштабированные датасеты
X_corr = corr_train_dataset.iloc[:,0:-1]
y_corr = corr_train_dataset.iloc[:,-1]

X_corr_train, X_corr_test, y_corr_train, y_corr_test = train_test_split(X_corr, y_corr,
                                                                       train_size=0.67,
                                                                       random_state=6,
                                                                       stratify=y_corr)
X = train_dataset.iloc[:,0:-1]
y = train_dataset.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.67,
                                                    random_state=6,
                                                    stratify=y)

### Объединим всё в один DF для упрощенной подачи в сеть

In [17]:
train_corr = pd.concat([X_corr_train, y_corr_train], axis=1)
train_corr

Unnamed: 0,X,Y,MD,GR,RT,CN,ENV0,ENV1,ENV2,LITH_CODE
45566,1.311023,-1.301154,2.507342,1.855667,-0.053474,0.107469,-0.794124,2.082126,-0.861181,300
43776,1.311023,-1.301154,2.043395,0.649467,-0.015290,-1.126138,-0.794124,2.082126,-0.861181,400
12212,-0.898215,0.501951,-0.461145,-0.175186,0.010588,-0.712491,1.259250,-0.480278,-0.861181,300
39472,1.311023,-1.301154,0.927847,-0.109842,-0.023590,-0.704373,-0.794124,-0.480278,1.161196,1300
29800,1.311023,-1.301154,-1.579025,-0.661507,0.007375,0.602693,1.259250,-0.480278,-0.861181,600
...,...,...,...,...,...,...,...,...,...,...
7300,-1.038729,1.426278,-0.293709,0.392001,-0.029005,0.038597,-0.794124,-0.480278,1.161196,300
34800,1.311023,-1.301154,-0.283082,0.839264,-0.026998,-0.167772,1.259250,-0.480278,-0.861181,100
14313,-0.898215,0.501951,0.083411,0.238337,-0.025335,-0.431358,-0.794124,-0.480278,1.161196,1300
9884,-0.898215,0.501951,-1.064536,0.981141,-0.029764,0.485761,1.259250,-0.480278,-0.861181,100


In [18]:
test_corr = pd.concat([X_corr_test, y_corr_test], axis=1)
test_corr

Unnamed: 0,X,Y,MD,GR,RT,CN,ENV0,ENV1,ENV2,LITH_CODE
29062,-0.352207,0.600048,0.875965,0.961902,-0.030282,-0.725193,-0.794124,2.082126,-0.861181,400
4386,-1.038729,1.426278,-1.049762,-0.598329,-0.014223,0.440802,1.259250,-0.480278,-0.861181,500
20596,-0.352207,0.600048,-1.318325,0.885488,-0.030683,2.195082,1.259250,-0.480278,-0.861181,100
11424,-0.898215,0.501951,-0.665385,-0.839883,0.037464,-1.379380,1.259250,-0.480278,-0.861181,600
10682,-0.898215,0.501951,-0.857703,-0.602732,0.022184,-0.959841,1.259250,-0.480278,-0.861181,500
...,...,...,...,...,...,...,...,...,...,...
25615,-0.352207,0.600048,-0.017458,-2.291862,-0.014963,0.474107,-0.794124,-0.480278,1.161196,600
39221,1.311023,-1.301154,0.862791,0.458970,-0.038515,0.114409,-0.794124,-0.480278,1.161196,100
43077,1.311023,-1.301154,1.862222,1.172186,-0.058252,0.578076,-0.794124,2.082126,-0.861181,300
9505,-0.898215,0.501951,-1.162768,-0.893120,0.018092,-0.177723,1.259250,-0.480278,-0.861181,600


In [19]:
train = pd.concat([X_train, y_train], axis=1)
train

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
45566,1.311023,-1.301154,2.507342,1.855667,-0.053474,0.107469,0.727768,-0.794124,2.082126,-0.861181,300
43776,1.311023,-1.301154,2.043395,0.649467,-0.015290,-1.126138,1.599559,-0.794124,2.082126,-0.861181,400
12212,-0.898215,0.501951,-0.461145,-0.175186,0.010588,-0.712491,-0.418840,1.259250,-0.480278,-0.861181,300
39472,1.311023,-1.301154,0.927847,-0.109842,-0.023590,-0.704373,0.605481,-0.794124,-0.480278,1.161196,1300
29800,1.311023,-1.301154,-1.579025,-0.661507,0.007375,0.602693,-0.829094,1.259250,-0.480278,-0.861181,600
...,...,...,...,...,...,...,...,...,...,...,...
7300,-1.038729,1.426278,-0.293709,0.392001,-0.029005,0.038597,0.660366,-0.794124,-0.480278,1.161196,300
34800,1.311023,-1.301154,-0.283082,0.839264,-0.026998,-0.167772,0.085431,1.259250,-0.480278,-0.861181,100
14313,-0.898215,0.501951,0.083411,0.238337,-0.025335,-0.431358,0.274122,-0.794124,-0.480278,1.161196,1300
9884,-0.898215,0.501951,-1.064536,0.981141,-0.029764,0.485761,-0.543099,1.259250,-0.480278,-0.861181,100


In [20]:
test = pd.concat([X_test, y_test], axis=1)
test

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
29062,-0.352207,0.600048,0.875965,0.961902,-0.030282,-0.725193,1.456035,-0.794124,2.082126,-0.861181,400
4386,-1.038729,1.426278,-1.049762,-0.598329,-0.014223,0.440802,-0.381082,1.259250,-0.480278,-0.861181,500
20596,-0.352207,0.600048,-1.318325,0.885488,-0.030683,2.195082,-1.360649,1.259250,-0.480278,-0.861181,100
11424,-0.898215,0.501951,-0.665385,-0.839883,0.037464,-1.379380,-1.820542,1.259250,-0.480278,-0.861181,600
10682,-0.898215,0.501951,-0.857703,-0.602732,0.022184,-0.959841,-0.499050,1.259250,-0.480278,-0.861181,500
...,...,...,...,...,...,...,...,...,...,...,...
25615,-0.352207,0.600048,-0.017458,-2.291862,-0.014963,0.474107,-1.075707,-0.794124,-0.480278,1.161196,600
39221,1.311023,-1.301154,0.862791,0.458970,-0.038515,0.114409,0.585100,-0.794124,-0.480278,1.161196,100
43077,1.311023,-1.301154,1.862222,1.172186,-0.058252,0.578076,0.802061,-0.794124,2.082126,-0.861181,300
9505,-0.898215,0.501951,-1.162768,-0.893120,0.018092,-0.177723,-0.309701,1.259250,-0.480278,-0.861181,600


# Все испробованные нейросетевые подходы к решению задачи не удались. Весь код ниже закомментирован, но код в ячейках рабочий. При желании - восстановить

### В планах использование softmax, поэтому применим one_hot еще и к LITH_CODE

In [21]:
# tmp = pd.get_dummies(test.LITH_CODE)
# test_cl = pd.concat([test, tmp], axis=1)
# test_cl = test_cl.drop(['LITH_CODE'], axis=1)

# tmp = pd.get_dummies(train.LITH_CODE)
# train_cl = pd.concat([train, tmp], axis=1)
# train_cl = train_cl.drop(['LITH_CODE'], axis=1)

# tmp = pd.get_dummies(test_corr.LITH_CODE)
# test_corr_cl = pd.concat([test_corr, tmp], axis=1)
# test_corr_cl = test_corr_cl.drop(['LITH_CODE'], axis=1)

# tmp = pd.get_dummies(train_corr.LITH_CODE)
# # train_corr_cl = pd.concat([train_corr, tmp], axis=1)
# train_corr_cl = train_corr_cl.drop(['LITH_CODE'], axis=1)


In [22]:
# train_corr_cl.iloc[:,0:9]

Unnamed: 0,X,Y,MD,GR,RT,CN,ENV0,ENV1,ENV2
45566,1.311023,-1.301154,2.507342,1.855667,-0.053474,0.107469,-0.794124,2.082126,-0.861181
43776,1.311023,-1.301154,2.043395,0.649467,-0.015290,-1.126138,-0.794124,2.082126,-0.861181
12212,-0.898215,0.501951,-0.461145,-0.175186,0.010588,-0.712491,1.259250,-0.480278,-0.861181
39472,1.311023,-1.301154,0.927847,-0.109842,-0.023590,-0.704373,-0.794124,-0.480278,1.161196
29800,1.311023,-1.301154,-1.579025,-0.661507,0.007375,0.602693,1.259250,-0.480278,-0.861181
...,...,...,...,...,...,...,...,...,...
7300,-1.038729,1.426278,-0.293709,0.392001,-0.029005,0.038597,-0.794124,-0.480278,1.161196
34800,1.311023,-1.301154,-0.283082,0.839264,-0.026998,-0.167772,1.259250,-0.480278,-0.861181
14313,-0.898215,0.501951,0.083411,0.238337,-0.025335,-0.431358,-0.794124,-0.480278,1.161196
9884,-0.898215,0.501951,-1.064536,0.981141,-0.029764,0.485761,1.259250,-0.480278,-0.861181


In [23]:
# train_corr_cl.iloc[:,9:]

Unnamed: 0,100,200,300,400,500,600,800,1000,1100,1200,1300,1400,1500
45566,0,0,1,0,0,0,0,0,0,0,0,0,0
43776,0,0,0,1,0,0,0,0,0,0,0,0,0
12212,0,0,1,0,0,0,0,0,0,0,0,0,0
39472,0,0,0,0,0,0,0,0,0,0,1,0,0
29800,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7300,0,0,1,0,0,0,0,0,0,0,0,0,0
34800,1,0,0,0,0,0,0,0,0,0,0,0,0
14313,0,0,0,0,0,0,0,0,0,0,1,0,0
9884,1,0,0,0,0,0,0,0,0,0,0,0,0


## Пробуем нейросетевой подход для задачи классификации

In [24]:
# import torch.nn as nn
# import torch.nn.functional as F
# import torch
# import torch.optim as optim
# from torch.utils.tensorboard import SummaryWriter
# from torch.utils.data import Dataset
# from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
# import sys
# import torchvision
# torch.backends.cudnn.enabled = False

In [25]:
# def get_device():
#     if torch.cuda.is_available():
#         device = torch.device('cuda:0')
#     else:
#         device = torch.device('cpu') # don't have GPU 
#     return device

# device = get_device()
# print(device)   
# print(torch.cuda.is_available())
# print(torch.cuda.device_count())

cuda:0
True
1


In [26]:
# train_corr_cl = torch.tensor(train_corr_cl.values).to(device)
# test_corr_cl = torch.tensor(test_corr_cl.values).to(device)

# train_corr = torch.tensor(train_corr.values).to(device)
# test_corr = torch.tensor(test_corr.values).to(device)

### Пробуем разные архитектуры полносвязных сетей

### Для данной архитектуры https://tensorboard.dev/experiment/p6JNLHmAQ3uAnOj2Kn43Kg/#scalars

In [27]:
# class Net1_corr(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.fc1 = nn.Linear(9, 64)
#         self.fc2 = nn.Linear(64,13)
    
#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.softmax(self.fc2(x), dim=-1)
#         return x
# net1_corr = Net1_corr()
# torch.save(net1_corr, 'net1.pth')
# net1_corr = net1_corr.double().to(device)
# loss_function = nn.MSELoss()

In [143]:
# writer = SummaryWriter('./tensorboard/net1_corr/')
# running_test_loss = 0.0
# running_train_loss = 0.0
# lr = 0.0008
# optimizer = optim.Adam(net1_corr.parameters(), lr=lr)
# batch = 0
# min_loss = 1000
# was_loss = 100


# for batch in tqdm(range(1000)):
#     trainset = torch.utils.data.DataLoader(train_corr_cl,
#                                           batch_size=256,
#                                           shuffle=True)
#     testset = torch.utils.data.DataLoader(test_corr_cl,
#                                          batch_size=256, 
#                                          shuffle=True)
#     for data in trainset:
#         X = data[:, 0:9]
#         y = data[:, 9:]
#         net1_corr.zero_grad()
#         output = net1_corr(X.double())
#         loss = loss_function(output, y)
#         loss.backward()
#         optimizer.step()
#     train_loss = (loss.cpu().detach().numpy())
#     for data in testset:
#         X = data[:, 0:9]
#         y = data[:, 9:]
#         output = net1_corr(X.double())
#         loss = loss_function(output, y)
#     test_loss = (loss.cpu().detach().numpy())
#     running_test_loss += test_loss
#     running_train_loss += train_loss
#     if ((batch % 50) == 49):
#         writer.add_scalars(f'loss',{
#             'training':running_train_loss / 50,
#             'test': running_test_loss / 50,
#         }, batch+1)
#         if ((running_test_loss / 50) <  min_loss):#сохранение весов по среднему из 100 батчей
#             min_loss = running_test_loss / 50
#             torch.save(net1_corr.state_dict(), 'model_weights/net1_corr.pth') # saving model
#     running_train_loss = 0
#     running_test_loss = 0

100%|██████████| 1000/1000 [10:20<00:00,  1.61it/s]


## Пробуем применить задачу регрессии оставив несколько выходов
#### https://tensorboard.dev/experiment/9oy9n7bLTkmZddabQI5A2Q/#scalars

In [194]:
# class Net2_corr(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.fc1 = nn.Linear(9, 64)
#         self.fc2 = nn.Linear(64,13)
    
#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         return x
# net2_corr = Net2_corr()
# torch.save(net2_corr, 'net2.pth')
# net2_corr = net2_corr.double().to(device)
# loss_function = nn.MSELoss()

In [195]:
# writer = SummaryWriter('./tensorboard/net2_corr/')
# running_test_loss = 0.0
# running_train_loss = 0.0
# lr = 0.0008
# optimizer = optim.Adam(net2_corr.parameters(), lr=lr)
# batch = 0
# min_loss = 1000
# was_loss = 100


# for batch in tqdm(range(1000)):
#     trainset = torch.utils.data.DataLoader(train_corr_cl,
#                                           batch_size=256,
#                                           shuffle=True)
#     testset = torch.utils.data.DataLoader(test_corr_cl,
#                                          batch_size=256, 
#                                          shuffle=True)
#     for data in trainset:
#         X = data[:, 0:9]
#         y = data[:, 9:]
#         net1_corr.zero_grad()
#         output = net2_corr(X.double())
#         loss = loss_function(output, y)
#         loss.backward()
#         optimizer.step()
#     train_loss = (loss.cpu().detach().numpy())
#     for data in testset:
#         X = data[:, 0:9]
#         y = data[:, 9:]
#         output = net2_corr(X.double())
#         loss = loss_function(output, y)
#     test_loss = (loss.cpu().detach().numpy())
#     running_test_loss += test_loss
#     running_train_loss += train_loss
#     if ((batch % 50) == 49):
#         writer.add_scalars(f'loss',{
#             'training':running_train_loss / 50,
#             'test': running_test_loss / 50,
#         }, batch+1)
#         if ((running_test_loss / 50) <  min_loss):#сохранение весов по среднему из 100 батчей
#             min_loss = running_test_loss / 50
#             torch.save(net2_corr.state_dict(), 'model_weights/net2_corr.pth') # saving model
#     running_train_loss = 0
#     running_test_loss = 0

100%|██████████| 1000/1000 [10:34<00:00,  1.58it/s]


### Задача регрессии с одним выходом c Huber Loss
#### https://tensorboard.dev/experiment/DGcWc4vuRhy2J73FlC29Jw/#scalars

In [196]:
# class Net3_corr(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.fc1 = nn.Linear(9, 64)
#         self.fc2 = nn.Linear(64,1)
    
#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         return x
# net3_corr = Net3_corr()
# torch.save(net3_corr, 'net3.pth')
# net3_corr = net3_corr.double().to(device)
# loss_function = nn.SmoothL1Loss()

In [197]:
# writer = SummaryWriter('./tensorboard/net3_corr/')
# running_test_loss = 0.0
# running_train_loss = 0.0
# lr = 0.0008
# optimizer = optim.Adam(net3_corr.parameters(), lr=lr)
# batch = 0
# min_loss = 1000
# was_loss = 100


# for batch in tqdm(range(1000)):
#     trainset = torch.utils.data.DataLoader(train_corr,
#                                           batch_size=256,
#                                           shuffle=True)
#     testset = torch.utils.data.DataLoader(test_corr,
#                                          batch_size=256, 
#                                          shuffle=True)
#     for data in trainset:
#         X = data[:, 0:9]
#         y = data[:, 9:]
#         net1_corr.zero_grad()
#         output = net3_corr(X.double())
#         loss = loss_function(output, y)
#         loss.backward()
#         optimizer.step()
#     train_loss = (loss.cpu().detach().numpy())
#     for data in testset:
#         X = data[:, 0:9]
#         y = data[:, 9:]
#         output = net3_corr(X.double())
#         loss = loss_function(output, y)
#     test_loss = (loss.cpu().detach().numpy())
#     running_test_loss += test_loss
#     running_train_loss += train_loss
#     if ((batch % 50) == 49):
#         writer.add_scalars(f'loss',{
#             'training':running_train_loss / 50,
#             'test': running_test_loss / 50,
#         }, batch+1)
#         if ((running_test_loss / 50) <  min_loss):#сохранение весов по среднему из 100 батчей
#             min_loss = running_test_loss / 50
#             torch.save(net3_corr.state_dict(), 'model_weights/net3_corr.pth') # saving model
#             print('weights saved. Batch ', batch)
#     running_train_loss = 0
#     running_test_loss = 0

100%|██████████| 1000/1000 [10:18<00:00,  1.62it/s]


### Эта же архитектура, изменяем конфигурацию
https://tensorboard.dev/experiment/lIkLlKD6Tn2am6Pm9iJ2xQ/#scalars

In [30]:
# class Net4_corr(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.fc1 = nn.Linear(9, 16)
#         self.fc2 = nn.Linear(16,8)
#         self.fc3 = nn.Linear(8,1)
    
#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = self.fc3(x)
#         return x
# net4_corr = Net4_corr()
# torch.save(net4_corr, 'net4.pth')
# net4_corr = net4_corr.double().to(device)
# loss_function = nn.SmoothL1Loss()

In [34]:
# writer = SummaryWriter('./tensorboard/net4_corr/')
# running_test_loss = 0.0
# running_train_loss = 0.0
# lr = 0.0032
# optimizer = optim.Adam(net4_corr.parameters(), lr=lr)
# batch = 0
# min_loss = 1000
# was_loss = 100


# for batch in tqdm(range(3000)):
#     trainset = torch.utils.data.DataLoader(train_corr,
#                                           batch_size=128,
#                                           shuffle=True)
#     testset = torch.utils.data.DataLoader(test_corr,
#                                          batch_size=128, 
#                                          shuffle=True)
#     for data in trainset:
#         X = data[:, 0:9]
#         y = data[:, 9:]
#         net1_corr.zero_grad()
#         output = net4_corr(X.double())
#         output = torch.round(output/100)*100
#         loss = loss_function(output, y)
#         loss.backward()
#         optimizer.step()
#     train_loss = (loss.cpu().detach().numpy())
#     for data in testset:
#         X = data[:, 0:9]
#         y = data[:, 9:]
#         output = net4_corr(X.double())
#         loss = loss_function(output, y)
#     test_loss = (loss.cpu().detach().numpy())
#     running_test_loss += test_loss
#     running_train_loss += train_loss
#     if ((batch % 50) == 49):
#         writer.add_scalars(f'loss',{
#             'training':running_train_loss / 50,
#             'test': running_test_loss / 50,
#         }, batch+1)
#         if ((running_test_loss / 50) <  min_loss):#сохранение весов по среднему из 100 батчей
#             min_loss = running_test_loss / 50
#             torch.save(net4_corr.state_dict(), 'model_weights/net4_corr.pth') # saving model
#     running_train_loss = 0
#     running_test_loss = 0

100%|██████████| 3000/3000 [44:28<00:00,  1.12it/s]


### Пробуем через кросс-энтропию

In [26]:
# class2idx = {
#     100:0,
#     200:1,
#     300:2,
#     400:3,
#     500:4,
#     600:5,
#     800:6,
#     1000:7,
#     1100:8,
#     1200:9,
#     1300:10,
#     1400:11,
#     1500:12
# }

# idx2class = {v: k for k, v in class2idx.items()}
# train['LITH_CODE'].replace(class2idx, inplace=True)
# train


Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
45566,1.311023,-1.301154,2.507342,1.855667,-0.053474,0.107469,0.727768,-0.794124,2.082126,-0.861181,2
43776,1.311023,-1.301154,2.043395,0.649467,-0.015290,-1.126138,1.599559,-0.794124,2.082126,-0.861181,3
12212,-0.898215,0.501951,-0.461145,-0.175186,0.010588,-0.712491,-0.418840,1.259250,-0.480278,-0.861181,2
39472,1.311023,-1.301154,0.927847,-0.109842,-0.023590,-0.704373,0.605481,-0.794124,-0.480278,1.161196,10
29800,1.311023,-1.301154,-1.579025,-0.661507,0.007375,0.602693,-0.829094,1.259250,-0.480278,-0.861181,5
...,...,...,...,...,...,...,...,...,...,...,...
7300,-1.038729,1.426278,-0.293709,0.392001,-0.029005,0.038597,0.660366,-0.794124,-0.480278,1.161196,2
34800,1.311023,-1.301154,-0.283082,0.839264,-0.026998,-0.167772,0.085431,1.259250,-0.480278,-0.861181,0
14313,-0.898215,0.501951,0.083411,0.238337,-0.025335,-0.431358,0.274122,-0.794124,-0.480278,1.161196,10
9884,-0.898215,0.501951,-1.064536,0.981141,-0.029764,0.485761,-0.543099,1.259250,-0.480278,-0.861181,0


In [27]:
# test['LITH_CODE'].replace(class2idx, inplace=True)
# test

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
29062,-0.352207,0.600048,0.875965,0.961902,-0.030282,-0.725193,1.456035,-0.794124,2.082126,-0.861181,3
4386,-1.038729,1.426278,-1.049762,-0.598329,-0.014223,0.440802,-0.381082,1.259250,-0.480278,-0.861181,4
20596,-0.352207,0.600048,-1.318325,0.885488,-0.030683,2.195082,-1.360649,1.259250,-0.480278,-0.861181,0
11424,-0.898215,0.501951,-0.665385,-0.839883,0.037464,-1.379380,-1.820542,1.259250,-0.480278,-0.861181,5
10682,-0.898215,0.501951,-0.857703,-0.602732,0.022184,-0.959841,-0.499050,1.259250,-0.480278,-0.861181,4
...,...,...,...,...,...,...,...,...,...,...,...
25615,-0.352207,0.600048,-0.017458,-2.291862,-0.014963,0.474107,-1.075707,-0.794124,-0.480278,1.161196,5
39221,1.311023,-1.301154,0.862791,0.458970,-0.038515,0.114409,0.585100,-0.794124,-0.480278,1.161196,0
43077,1.311023,-1.301154,1.862222,1.172186,-0.058252,0.578076,0.802061,-0.794124,2.082126,-0.861181,2
9505,-0.898215,0.501951,-1.162768,-0.893120,0.018092,-0.177723,-0.309701,1.259250,-0.480278,-0.861181,5


In [121]:
# train_tensor = torch.tensor(train.values).to(device)
# test_tensor = torch.tensor(test.values).to(device)

In [122]:
# class MulticlassClassification(nn.Module):
#     def __init__(self, num_feature, num_class):
#         super(MulticlassClassification, self).__init__()
        
#         self.layer_1 = nn.Linear(num_feature, 128)
#         self.layer_2 = nn.Linear(128, 128)
#         self.layer_3 = nn.Linear(128, 64)
#         self.layer_out = nn.Linear(64, num_class) 
        
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(p=0.2)
#         self.batchnorm1 = nn.BatchNorm1d(128)
#         self.batchnorm2 = nn.BatchNorm1d(128)
#         self.batchnorm3 = nn.BatchNorm1d(64)
        
#     def forward(self, x):
#         x = self.layer_1(x)
#         x = self.batchnorm1(x)
#         x = self.relu(x)
        
#         x = self.layer_2(x)
#         x = self.batchnorm2(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        
        
#         x = self.layer_3(x)
#         x = self.batchnorm3(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        
#         x = self.layer_out(x)
        
#         return x

In [123]:
# BATCH_SIZE = 16
# LEARNING_RATE = 0.0016
# NUM_FEATURES = len(train.columns)-1
# NUM_CLASSES = 13
# model = MulticlassClassification(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
# model.double().to(device)
# torch.save(model, 'multiclass.pth')
# loss_function = nn.CrossEntropyLoss()
# #optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)


In [124]:
# train_loader = DataLoader(dataset=train_tensor,
#                           batch_size=BATCH_SIZE)
# test_loader = DataLoader(dataset=test_tensor, batch_size=BATCH_SIZE)

In [None]:
# writer = SummaryWriter('./tensorboard/MulticlassClassification/')
# running_test_loss = 0.0
# running_train_loss = 0.0
# #lr = 0.0032
# #optimizer = optim.Adam(model.parameters(), lr=lr)
# batch = 0
# min_loss = 1000
# was_loss = 100

# trainset = torch.utils.data.DataLoader(train_tensor,
#                                        batch_size=256,
#                                        shuffle=True)
# testset = torch.utils.data.DataLoader(test_tensor,
#                                       batch_size=256,
#                                       shuffle=True)

# train_loader = DataLoader(dataset=train_tensor,
#                           batch_size=BATCH_SIZE)
# test_loader = DataLoader(dataset=test_tensor, batch_size=BATCH_SIZE)

# for batch in tqdm(range(3000)):
#     for data in trainset:
#         X = data[:, 0:NUM_FEATURES]
#         y = data[:, NUM_FEATURES:]
#         optimizer.zero_grad()
#         output = model(X.double())
#         y = y.type(torch.LongTensor).to(device)
#         loss = loss_function(output, y.squeeze())
#         loss.backward()
#         optimizer.step()
#     train_loss = (loss.cpu().detach().numpy())
#     for data in testset:
#         X = data[:, 0:NUM_FEATURES]
#         y = data[:, NUM_FEATURES:]
#         output = model(X.double())
#         y = y.type(torch.LongTensor).to(device)
#         loss = loss_function(output, y.squeeze())
#     test_loss = (loss.cpu().detach().numpy())
#     running_test_loss += test_loss
#     running_train_loss += train_loss
#     if ((batch % 50) == 49):
#         writer.add_scalars(f'loss',{
#             'training':running_train_loss / 50,
#             'test': running_test_loss / 50,
#         }, batch+1)
        
#         if ((running_test_loss / 50) <  min_loss):#сохранение весов по среднему из 50 батчей
#             min_loss = running_test_loss / 50
#             torch.save(model.state_dict(), 'model_weights/MulticlassClassification.pth') # saving model
#         running_train_loss = 0
#         running_test_loss = 0

### Пытаемся уменьшить лернинг рейт

In [147]:
# class MulticlassClassification1(nn.Module):
#     def __init__(self, num_feature, num_class):
#         super(MulticlassClassification1, self).__init__()
        
#         self.layer_1 = nn.Linear(num_feature, 128)
#         self.layer_2 = nn.Linear(128, 128)
#         self.layer_3 = nn.Linear(128, 64)
#         self.layer_out = nn.Linear(64, num_class) 
        
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(p=0.2)
#         self.batchnorm1 = nn.BatchNorm1d(128)
#         self.batchnorm2 = nn.BatchNorm1d(128)
#         self.batchnorm3 = nn.BatchNorm1d(64)
        
#     def forward(self, x):
#         x = self.layer_1(x)
#         x = self.batchnorm1(x)
#         x = self.relu(x)
        
#         x = self.layer_2(x)
#         x = self.batchnorm2(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        
        
#         x = self.layer_3(x)
#         x = self.batchnorm3(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        
#         x = self.layer_out(x)
        
#         return x

In [148]:
# #model_upgrade = torch.load('./finish_worse/multiclass.pth')
# model_upgrade =  MulticlassClassification1(num_feature = NUM_FEATURES, num_class=NUM_CLASSES)
# model_upgrade.load_state_dict(torch.load('./finish_worse/MulticlassClassification.pth'))
# model_upgrade.eval()

MulticlassClassification1(
  (layer_1): Linear(in_features=10, out_features=128, bias=True)
  (layer_2): Linear(in_features=128, out_features=128, bias=True)
  (layer_3): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=13, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (batchnorm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [149]:
# BATCH_SIZE = 256
# LEARNING_RATE = 0.0004
# NUM_FEATURES = len(train.columns)-1
# NUM_CLASSES = 13
# model_upgrade.double().to(device)
# torch.save(model_upgrade, 'multiclass_upgrade.pth')
# loss_function = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model_upgrade.parameters(), lr=LEARNING_RATE)

In [None]:
# writer = SummaryWriter('./tensorboard/MulticlassClassification_upgrade/')
# running_test_loss = 0.0
# running_train_loss = 0.0
# #lr = 0.0032
# #optimizer = optim.Adam(model.parameters(), lr=lr)
# batch = 0
# min_loss = 1000
# was_loss = 100

# trainset = torch.utils.data.DataLoader(train_tensor,
#                                        batch_size=256,
#                                        shuffle=True)
# testset = torch.utils.data.DataLoader(test_tensor,
#                                       batch_size=256,
#                                       shuffle=True)

# train_loader = DataLoader(dataset=train_tensor,
#                           batch_size=BATCH_SIZE)
# test_loader = DataLoader(dataset=test_tensor, batch_size=BATCH_SIZE)

# for batch in tqdm(range(3000)):
#     for data in trainset:
#         X = data[:, 0:NUM_FEATURES]
#         y = data[:, NUM_FEATURES:]
#         optimizer.zero_grad()
#         output = model_upgrade(X.double())
#         y = y.type(torch.LongTensor).to(device)
#         loss = loss_function(output, y.squeeze())
#         loss.backward()
#         optimizer.step()
#     train_loss = (loss.cpu().detach().numpy())
#     for data in testset:
#         X = data[:, 0:NUM_FEATURES]
#         y = data[:, NUM_FEATURES:]
#         output = model_upgrade(X.double())
#         y = y.type(torch.LongTensor).to(device)
#         loss = loss_function(output, y.squeeze())
#     test_loss = (loss.cpu().detach().numpy())
#     running_test_loss += test_loss
#     running_train_loss += train_loss
#     if ((batch % 50) == 49):
#         writer.add_scalars(f'loss',{
#             'training':running_train_loss / 50,
#             'test': running_test_loss / 50,
#         }, batch+1)
        
#         if ((running_test_loss / 50) <  min_loss):#сохранение весов по среднему из 50 батчей
#             min_loss = running_test_loss / 50
#             torch.save(model_upgrade.state_dict(), 'model_weights/MulticlassClassification_upgrade.pth') # saving model
#         running_train_loss = 0
#         running_test_loss = 0

# В конце концов попробуем SK-Learn

In [202]:
X_corr = corr_train_dataset.iloc[:,0:-1]
y_corr = corr_train_dataset.iloc[:,-1]

X_corr_train, X_corr_test, y_corr_train, y_corr_test = train_test_split(X_corr, y_corr,
                                                                       train_size=0.8,
                                                                       random_state=6,
                                                                       stratify=y_corr)

In [203]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
rfc = RandomForestClassifier(n_jobs=-1)
rfc.fit(X_corr_train, y_corr_train)
rfc.predict(X_corr_test)

array([100, 400, 400, ..., 500, 600, 400])

In [204]:
f1_score(y_corr_test,rfc.predict(X_corr_test), average='macro' )

0.8414961782177298

In [196]:
rfc1 = RandomForestClassifier()
rfc1.fit(X_train, y_train)
rfc1.predict(X_test)

array([400, 500, 100, ..., 300, 600, 600])

In [197]:
f1_score(y_test,rfc1.predict(X_test), average='macro' )

0.7900500409642159

In [200]:
param_grid = { 
    'n_estimators': [100,200,80],
    'max_features': ['sqrt', 'auto'],
    'min_samples_split':[2,5,10,23,12],
    'min_samples_leaf':[1,2],
    'max_depth': [13,14,15],
    'bootstrap': ['False','True'],
    'criterion' :['gini', 'entropy']
}

## Не хватило времени на поиск по сетке. В итоге решено воспользоваться базовым классификатором

In [201]:
from sklearn.model_selection import GridSearchCV
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 3, n_jobs=4)
CV_rfc.fit(X_corr_train, y_corr_train)



KeyboardInterrupt: 

In [192]:
CV_rfc.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
rfc_3 = CV_rfc.best_estimator_

In [178]:
rfc_3.fit(X_corr_train, y_corr_train)

RandomForestClassifier(criterion='entropy', max_depth=16, max_features='sqrt',
                       n_estimators=500, n_jobs=-1, random_state=42)

In [179]:
f1_score(y_corr_test,rfc_3.predict(X_corr_test), average='macro' )

0.7986153067712408

In [205]:
validation_dataset = validation_dataset.drop(['DEN'], axis=1)
validation_dataset.iloc[:,:] = scaler_corr.transform(validation_dataset.iloc[:,:])


KeyError: "['DEN'] not found in axis"

In [206]:
validation_dataset 

Unnamed: 0,X,Y,MD,GR,RT,CN,ENV0,ENV1,ENV2
0,1.306690,-1.295608,-1.018491,0.493618,-0.010855,0.237233,1.259250,-0.480278,-0.861181
1,1.306690,-1.295608,-1.018231,0.568663,-0.013828,0.313834,1.259250,-0.480278,-0.861181
2,1.306690,-1.295608,-1.017972,0.628124,-0.018309,0.439932,1.259250,-0.480278,-0.861181
3,1.306690,-1.295608,-1.017713,0.677733,-0.022804,0.584754,1.259250,-0.480278,-0.861181
4,1.306690,-1.295608,-1.017454,0.730516,-0.023311,0.725909,1.259250,-0.480278,-0.861181
...,...,...,...,...,...,...,...,...,...
28992,-0.513105,1.143807,1.108761,1.504295,-0.043702,-0.367066,-0.794124,2.082126,-0.861181
28993,-0.513105,1.143807,1.109020,1.460941,-0.043308,-0.389195,-0.794124,2.082126,-0.861181
28994,-0.513105,1.143807,1.109279,1.459612,-0.042800,-0.425597,-0.794124,2.082126,-0.861181
28995,-0.513105,1.143807,1.109538,1.499677,-0.042045,-0.470248,-0.794124,2.082126,-0.861181


In [207]:
answer = rfc.predict(validation_dataset)

In [208]:
np.savetxt("LITH_CODE.csv", answer, delimiter=",",fmt='%1.f',encoding='utf-8')