### Перенесем данные в pandas DF для удобства работы

In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [40]:
train_dataset = pd.read_csv('../Train-dataset.csv')
train_dataset.head()

Unnamed: 0,WELL,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT,LITH_NAME,LITH_CODE
0,Well-1,0.8179,2.9814,1602.0,83.939,3.166,0.25,2.344,Marine,Marly sandstone,1200
1,Well-1,0.8179,2.9814,1602.1,84.166,3.135,0.246,2.352,Marine,Marly sandstone,1200
2,Well-1,0.8179,2.9814,1602.2,85.055,3.089,0.244,2.352,Marine,Marly sandstone,1200
3,Well-1,0.8179,2.9814,1602.3,86.352,3.042,0.242,2.355,Marine,Sandy marl,1300
4,Well-1,0.8179,2.9814,1602.4,87.614,3.003,0.241,2.369,Marine,Sandy marl,1300


In [41]:
validation_dataset = pd.read_csv('../Validation-dataset.csv')
validation_dataset.head()

Unnamed: 0,WELL,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT
0,Well-X,2.9956,0.0061,897.065,92.7094,4.51768,0.36366,2.2106,Continental
1,Well-X,2.9956,0.0061,897.165,93.9007,4.30683,0.36951,2.2036,Continental
2,Well-X,2.9956,0.0061,897.265,94.8446,3.98902,0.37914,2.1969,Continental
3,Well-X,2.9956,0.0061,897.365,95.6321,3.67028,0.3902,2.1923,Continental
4,Well-X,2.9956,0.0061,897.465,96.47,3.63431,0.40098,2.1926,Continental


### Очевидно, что название колонн не информативный признак т.к. никак не относится к значениям приборов или географическим положениям, тем более на валидационном ДС он совсем другой. Уберем его из ДС. Аналогично рассуждая уберем и LITH_NAME

In [42]:
train_dataset = train_dataset.drop(['WELL', 'LITH_NAME'], axis=1)
train_dataset.head()

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT,LITH_CODE
0,0.8179,2.9814,1602.0,83.939,3.166,0.25,2.344,Marine,1200
1,0.8179,2.9814,1602.1,84.166,3.135,0.246,2.352,Marine,1200
2,0.8179,2.9814,1602.2,85.055,3.089,0.244,2.352,Marine,1200
3,0.8179,2.9814,1602.3,86.352,3.042,0.242,2.355,Marine,1300
4,0.8179,2.9814,1602.4,87.614,3.003,0.241,2.369,Marine,1300


In [43]:
validation_dataset = validation_dataset.drop(['WELL'], axis=1)
validation_dataset.head()

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT
0,2.9956,0.0061,897.065,92.7094,4.51768,0.36366,2.2106,Continental
1,2.9956,0.0061,897.165,93.9007,4.30683,0.36951,2.2036,Continental
2,2.9956,0.0061,897.265,94.8446,3.98902,0.37914,2.1969,Continental
3,2.9956,0.0061,897.365,95.6321,3.67028,0.3902,2.1923,Continental
4,2.9956,0.0061,897.465,96.47,3.63431,0.40098,2.1926,Continental


###  Очевидно, что DEPOSITIONAL_ENVIRONMENT категориальный признак и его нужно закодировать. Т.к. диспозиции всего 3, то можно применить one-hot-кодирование при этом матрица всё еще не будет сильно разреженной

In [44]:
ohe = OneHotEncoder(sparse=False)
new_ohe_features = ohe.fit_transform(train_dataset.DEPOSITIONAL_ENVIRONMENT.values.reshape(-1,1))
tmp = pd.DataFrame(new_ohe_features, columns=['ENV=' + 
                                             str(i) for i in range(new_ohe_features.shape[1])])
train_dataset = pd.concat([train_dataset, tmp], axis=1)
train_dataset = train_dataset.drop(['DEPOSITIONAL_ENVIRONMENT'], axis=1)
cols = train_dataset.columns.tolist()
cols = cols[1:7] + cols[8:11] + cols[7:8]
train_dataset = train_dataset[cols]
train_dataset.head()

Unnamed: 0,Y,MD,GR,RT,CN,DEN,ENV=0,ENV=1,ENV=2,LITH_CODE
0,2.9814,1602.0,83.939,3.166,0.25,2.344,0.0,1.0,0.0,1200
1,2.9814,1602.1,84.166,3.135,0.246,2.352,0.0,1.0,0.0,1200
2,2.9814,1602.2,85.055,3.089,0.244,2.352,0.0,1.0,0.0,1200
3,2.9814,1602.3,86.352,3.042,0.242,2.355,0.0,1.0,0.0,1300
4,2.9814,1602.4,87.614,3.003,0.241,2.369,0.0,1.0,0.0,1300


### С помощью кода со Stackoverflow проверим все ли признаки у нас присутвуют и нет ли у нас отсутсвующих значений
##### https://stackoverflow.com/questions/26266362/how-to-count-the-nan-values-in-a-column-in-pandas-dataframe/39734251#39734251

In [47]:
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
            " columns that have missing values.")
    return mis_val_table_ren_columns
missing_values_table(train_dataset)

Your selected dataframe has 10 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


### Посмотрим на корреляцию столбцов между собой

In [45]:
correlations_data = train_dataset.corr()
correlations_data

Unnamed: 0,Y,MD,GR,RT,CN,DEN,ENV=0,ENV=1,ENV=2,LITH_CODE
Y,1.0,-0.354612,-0.021383,0.011371,0.220553,-0.097019,0.103626,-0.295333,0.131841,0.028731
MD,-0.354612,1.0,0.095127,-0.035173,-0.585465,0.642244,-0.768646,0.744005,0.168198,0.126418
GR,-0.021383,0.095127,1.0,-0.026661,0.321995,0.196266,0.018632,0.2426,-0.210572,-0.181713
RT,0.011371,-0.035173,-0.026661,1.0,0.055892,-0.043238,0.025907,-0.014688,-0.013899,0.005351
CN,0.220553,-0.585465,0.321995,0.055892,1.0,-0.598572,0.457451,-0.402441,-0.132066,-0.161738
DEN,-0.097019,0.642244,0.196266,-0.043238,-0.598572,1.0,-0.472726,0.5631,0.019835,0.078136
ENV=0,0.103626,-0.768646,0.018632,0.025907,0.457451,-0.472726,1.0,-0.383709,-0.681699,-0.200552
ENV=1,-0.295333,0.744005,0.2426,-0.014688,-0.402441,0.5631,-0.383709,1.0,-0.414055,0.010254
ENV=2,0.131841,0.168198,-0.210572,-0.013899,-0.132066,0.019835,-0.681699,-0.414055,1.0,0.18956
LITH_CODE,0.028731,0.126418,-0.181713,0.005351,-0.161738,0.078136,-0.200552,0.010254,0.18956,1.0


### Заметим, что у нас есть довольно сильно коррелирующие между собой признаки. Уберем их для понижения размерности матрицы. Попробуем обучать и без понижения размерности и с понижением размерности, затем выберем наилучший
#### Взяли реализацию 
#### https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on/43104383#43104383

In [50]:
def corr_df(x, corr_val):
    '''
    Obj: Drops features that are strongly correlated to other features.
          This lowers model complexity, and aids in generalizing the model.
    Inputs:
          df: features df (x)
          corr_val: Columns are dropped relative to the corr_val input (e.g. 0.8)
    Output: df that only includes uncorrelated features
    '''

    # Creates Correlation Matrix and Instantiates
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterates through Correlation Matrix Table to find correlated columns
    for i in iters:
        for j in range(i):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = item.values
            if val >= corr_val:
                # Prints the correlated feature set and the corr val
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(i)

    drops = sorted(set(drop_cols))[::-1]

    # Drops the correlated columns
    for i in drops:
        col = x.iloc[:, (i+1):(i+2)].columns.values
        df = x.drop(col, axis=1)

    return df

corr_train_dataset = corr_df(train_dataset, 0.6)
corr_train_dataset

DEN | MD | 0.64
ENV=1 | MD | 0.74


Unnamed: 0,Y,MD,GR,RT,CN,ENV=0,ENV=1,ENV=2,LITH_CODE
0,2.9814,1602.0,83.9390,3.1660,0.25000,0.0,1.0,0.0,1200
1,2.9814,1602.1,84.1660,3.1350,0.24600,0.0,1.0,0.0,1200
2,2.9814,1602.2,85.0550,3.0890,0.24400,0.0,1.0,0.0,1200
3,2.9814,1602.3,86.3520,3.0420,0.24200,0.0,1.0,0.0,1300
4,2.9814,1602.4,87.6140,3.0030,0.24100,0.0,1.0,0.0,1300
...,...,...,...,...,...,...,...,...,...
45744,0.0000,2275.2,103.6016,1.0789,0.29558,0.0,1.0,0.0,400
45745,0.0000,2275.3,102.8472,1.0683,0.29264,0.0,1.0,0.0,400
45746,0.0000,2275.4,102.5699,1.0790,0.29425,0.0,1.0,0.0,400
45747,0.0000,2275.5,102.7901,1.1045,0.30096,0.0,1.0,0.0,400


### Разобъём обучающий датасет на тренировочную и тестовую выборки с сохранением классов по LITH_CODE 

In [68]:
X_corr = corr_train_dataset.iloc[:,0:-1]
y_corr = corr_train_dataset.iloc[:,-1]
X_corr_train, X_corr_test, y_corr_train, y_corr_test = train_test_split(X_corr, y_corr,
                                                                       train_size=0.67,
                                                                       random_state=6,
                                                                       stratify=y_corr)
X = train_dataset.iloc[:,0:-1]
y = train_dataset.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.67,
                                                    random_state=6,
                                                    stratify=y)