# Есть набор данных о месторождениях и местах добычи нефти, необходимо определить тип месторождения.

Что мы будем делать?
1. Провести исследование данных (какие есть столбцы, какие типы данных в этих столбцах, есть ли пропуски в данных, понять, какие у нас есть признаки и какая переменная является целевой, есть ли категориальные признаки и т. д.)
2. Подготовка данных: заполнение/удаление пропусков, кодирование категориальных признаков, выделение вектора признаков и вектора ответов
3. Оценим соотношение классов и посмотрим сбалансированная ли у нас выборка
4. Подумаем, какая модель нам лучше всего подойдет (логистическая регрессия, дерево решений, градиентный бустинг, метод опорных векторов, случайный лес, нейронная сеть и т. д.)
5. На данных train обучим и оценим нашу модель (по заданию: измерять качество модели с помощью accuracy)
6. Аналогичным образом готовим набор данных test и получаем для него ответы (оценить эти ответы не сможем)

In [None]:
# Open Colab's file-upload dialog so data files can be copied into the runtime
# (presumably train.csv and test.csv, which the next cell reads — confirm).
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd

# Load the training and test tables from CSV files in the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 1. Исследование данных

In [None]:
# Display the training DataFrame (a notebook cell renders its last expression).
train

Unnamed: 0,Tectonic regime,Onshore/Offshore,Hydrocarbon type,Reservoir status,Structural setting,Depth,Period,Lithology,Gross,Netpay,Porosity,Permeability
0,STRIKE-SLIP/TRANSPRESSION/BASEMENT-I,OFFSHORE,OIL,DEVELOPING,INVERSION/WRENCH,3520,NEOGENE,SANDSTONE,2460.0,220.0,20.0,45.0
1,GRAVITY/EXTENSION/EVAPORITE,OFFSHORE,OIL,MATURE PRODUCTION,SALT/PASSIVE MARGIN,9967,CRETACEOUS,LIMESTONE,427.0,160.0,19.0,175.0
2,GRAVITY/EXTENSION/EVAPORITE,ONSHORE,OIL,MATURE PRODUCTION,PASSIVE MARGIN,8700,CRETACEOUS,LIMESTONE,95.0,15.0,12.0,20.0
3,COMPRESSION,ONSHORE,OIL,DECLINING PRODUCTION,THRUST,5084,CRETACEOUS,SANDSTONE,328.0,300.0,13.0,600.0
4,INVERSION/COMPRESSION/EXTENSION,ONSHORE,OIL,DECLINING PRODUCTION,INVERSION/RIFT,1030,CRETACEOUS,SANDSTONE,260.0,33.0,24.0,182.0
...,...,...,...,...,...,...,...,...,...,...,...,...
304,GRAVITY/EXTENSION/EVAPORITE/SYNSEDIMENTATION,OFFSHORE,OIL,DECLINING PRODUCTION,DELTA/SUB-SALT/PASSIVE MARGIN,13265,NEOGENE,LOW-RESISTIVITY SANDSTONE,1500.0,295.0,29.0,1500.0
305,INVERSION/COMPRESSION/EXTENSION,OFFSHORE,OIL,DECLINING PRODUCTION,RIFT/PASSIVE MARGIN,1657,CRETACEOUS,LOW-RESISTIVITY SANDSTONE,164.0,98.0,32.0,7500.0
306,COMPRESSION/EVAPORITE,ONSHORE,OIL,CONTINUING DEVELOPMENT,FORELAND,10211,CRETACEOUS,CHALKY LIMESTONE,328.0,213.0,13.0,0.8
307,INVERSION/COMPRESSION/EXTENSION/EVAPORITE,OFFSHORE,GAS-CONDENSATE,PLATEAU PRODUCTION,SALT/RIFT,16360,JURASSIC,SANDSTONE,980.0,490.0,16.0,10.0


In [None]:
train['Onshore/Offshore'].value_counts() # target variable: number of rows per class

ONSHORE             211
OFFSHORE             93
ONSHORE-OFFSHORE      5
Name: Onshore/Offshore, dtype: int64

In [None]:
train.isna().sum() # check whether the data has missing values (count per column)

Tectonic regime       0
Onshore/Offshore      0
Hydrocarbon type      0
Reservoir status      0
Structural setting    0
Depth                 0
Period                0
Lithology             0
Gross                 0
Netpay                0
Porosity              0
Permeability          0
dtype: int64

In [None]:
train.columns
# Tectonic regime, Onshore/Offshore, Hydrocarbon type, Reservoir status, Structural setting, Period, Lithology are categorical variables

Index(['Tectonic regime', 'Onshore/Offshore', 'Hydrocarbon type',
       'Reservoir status', 'Structural setting', 'Depth', 'Period',
       'Lithology', 'Gross', 'Netpay', 'Porosity', 'Permeability'],
      dtype='object')

In [None]:
# Frequency of each hydrocarbon type in the training data.
train['Hydrocarbon type'].value_counts()

OIL                235
GAS                 47
GAS-CONDENSATE      25
CARBON DIOXIDE       1
METHANE HYDRATE      1
Name: Hydrocarbon type, dtype: int64

In [None]:
# (number of rows, number of columns) of the training table.
train.shape

(309, 12)

# Подготовка данных

Обработаем категориальные признаки:
Tectonic regime, Onshore/Offshore, Hydrocarbon type, Reservoir status, Structural setting, Period, Lithology - категориальные переменные!!

Есть разные методы как это можно сделать:
- LabelEncoder()
- pd.get_dummies()
- OneHotEncoder()



In [None]:
train['Tectonic regime']
# a single row can hold several regimes, separated by '/'

0              STRIKE-SLIP/TRANSPRESSION/BASEMENT-I
1                       GRAVITY/EXTENSION/EVAPORITE
2                       GRAVITY/EXTENSION/EVAPORITE
3                                       COMPRESSION
4                   INVERSION/COMPRESSION/EXTENSION
                           ...                     
304    GRAVITY/EXTENSION/EVAPORITE/SYNSEDIMENTATION
305                 INVERSION/COMPRESSION/EXTENSION
306                           COMPRESSION/EVAPORITE
307       INVERSION/COMPRESSION/EXTENSION/EVAPORITE
308                                       EXTENSION
Name: Tectonic regime, Length: 309, dtype: object

In [None]:
# Multi-hot encode 'Tectonic regime': each '/'-separated token becomes its own
# 0/1 column. Column labels are shortened to their first whitespace-separated
# word and prefixed with 'regime_'.
regime_flags = train['Tectonic regime'].str.get_dummies(sep='/')
regime_flags.columns = regime_flags.columns.str.split().str[0]
regime_flags = regime_flags.add_prefix('regime_').reset_index(drop=True)

# Attach the indicator columns to the untouched source table and display the result.
train_full = pd.concat([train, regime_flags], axis=1)
train_full

Unnamed: 0,Tectonic regime,Onshore/Offshore,Hydrocarbon type,Reservoir status,Structural setting,Depth,Period,Lithology,Gross,Netpay,Porosity,Permeability,regime_BASEMENT-I,regime_COMPRESSION,regime_DIAPIR,regime_EROSION,regime_EVAPORITE,regime_EXTENSION,regime_GRAVITY,regime_INVERSION,regime_LINKED,regime_REACTIVATION,regime_SHALE,regime_STRIKE-SLIP,regime_SYNSEDIMENTATION,regime_TRANSPRESSION,regime_TRANSTENSION,regime_UPLIFT
0,STRIKE-SLIP/TRANSPRESSION/BASEMENT-I,OFFSHORE,OIL,DEVELOPING,INVERSION/WRENCH,3520,NEOGENE,SANDSTONE,2460.0,220.0,20.0,45.0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,GRAVITY/EXTENSION/EVAPORITE,OFFSHORE,OIL,MATURE PRODUCTION,SALT/PASSIVE MARGIN,9967,CRETACEOUS,LIMESTONE,427.0,160.0,19.0,175.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0
2,GRAVITY/EXTENSION/EVAPORITE,ONSHORE,OIL,MATURE PRODUCTION,PASSIVE MARGIN,8700,CRETACEOUS,LIMESTONE,95.0,15.0,12.0,20.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0
3,COMPRESSION,ONSHORE,OIL,DECLINING PRODUCTION,THRUST,5084,CRETACEOUS,SANDSTONE,328.0,300.0,13.0,600.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,INVERSION/COMPRESSION/EXTENSION,ONSHORE,OIL,DECLINING PRODUCTION,INVERSION/RIFT,1030,CRETACEOUS,SANDSTONE,260.0,33.0,24.0,182.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,GRAVITY/EXTENSION/EVAPORITE/SYNSEDIMENTATION,OFFSHORE,OIL,DECLINING PRODUCTION,DELTA/SUB-SALT/PASSIVE MARGIN,13265,NEOGENE,LOW-RESISTIVITY SANDSTONE,1500.0,295.0,29.0,1500.0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0
305,INVERSION/COMPRESSION/EXTENSION,OFFSHORE,OIL,DECLINING PRODUCTION,RIFT/PASSIVE MARGIN,1657,CRETACEOUS,LOW-RESISTIVITY SANDSTONE,164.0,98.0,32.0,7500.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0
306,COMPRESSION/EVAPORITE,ONSHORE,OIL,CONTINUING DEVELOPMENT,FORELAND,10211,CRETACEOUS,CHALKY LIMESTONE,328.0,213.0,13.0,0.8,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
307,INVERSION/COMPRESSION/EXTENSION/EVAPORITE,OFFSHORE,GAS-CONDENSATE,PLATEAU PRODUCTION,SALT/RIFT,16360,JURASSIC,SANDSTONE,980.0,490.0,16.0,10.0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0


In [None]:
# Multi-hot encode 'Structural setting': each '/'-separated token becomes its
# own 0/1 column. Labels are shortened to their first whitespace-separated word
# (e.g. 'PASSIVE MARGIN' -> 'PASSIVE') and prefixed with 'Structural setting_'.
tmp = train_full['Structural setting'].str.get_dummies('/')
tmp.columns = tmp.columns.str.split().str[0]
tmp = tmp.add_prefix('Structural setting_').reset_index(drop=True)

# This cell extends train_full in place, so re-running it used to append a
# second (third, ...) copy of the same indicator columns; the recorded output
# of this cell shows the group three times. Remove any copies left by a
# previous run before concatenating so the cell is idempotent.
is_stale = train_full.columns.str.startswith('Structural setting_')
train_full = pd.concat([train_full.loc[:, ~is_stale], tmp], axis=1)
# NOTE(review): the recorded output also contains a stray 'index' column
# (row numbers), presumably left by an earlier revision of this cell. It must
# not reach the model as a feature; a clean "Run all" does not produce it.
train_full

Unnamed: 0,Tectonic regime,Onshore/Offshore,Hydrocarbon type,Reservoir status,Structural setting,Depth,Period,Lithology,Gross,Netpay,Porosity,Permeability,regime_BASEMENT-I,regime_COMPRESSION,regime_DIAPIR,regime_EROSION,regime_EVAPORITE,regime_EXTENSION,regime_GRAVITY,regime_INVERSION,regime_LINKED,regime_REACTIVATION,regime_SHALE,regime_STRIKE-SLIP,regime_SYNSEDIMENTATION,regime_TRANSPRESSION,regime_TRANSTENSION,regime_UPLIFT,index,Structural setting_BACKARC,Structural setting_DELTA,Structural setting_FOREARC,Structural setting_FORELAND,Structural setting_INTRACRATONIC,Structural setting_INVERSION,Structural setting_PASSIVE,Structural setting_RIFT,Structural setting_SALT,Structural setting_SUB-SALT,Structural setting_SUB-THRUST,Structural setting_THRUST,Structural setting_WRENCH,Structural setting_BACKARC.1,Structural setting_DELTA.1,Structural setting_FOREARC.1,Structural setting_FORELAND.1,Structural setting_INTRACRATONIC.1,Structural setting_INVERSION.1,Structural setting_PASSIVE.1,Structural setting_RIFT.1,Structural setting_SALT.1,Structural setting_SUB-SALT.1,Structural setting_SUB-THRUST.1,Structural setting_THRUST.1,Structural setting_WRENCH.1,Structural setting_BACKARC.2,Structural setting_DELTA.2,Structural setting_FOREARC.2,Structural setting_FORELAND.2,Structural setting_INTRACRATONIC.2,Structural setting_INVERSION.2,Structural setting_PASSIVE.2,Structural setting_RIFT.2,Structural setting_SALT.2,Structural setting_SUB-SALT.2,Structural setting_SUB-THRUST.2,Structural setting_THRUST.2,Structural setting_WRENCH.2
0,STRIKE-SLIP/TRANSPRESSION/BASEMENT-I,OFFSHORE,OIL,DEVELOPING,INVERSION/WRENCH,3520,NEOGENE,SANDSTONE,2460.0,220.0,20.0,45.0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1
1,GRAVITY/EXTENSION/EVAPORITE,OFFSHORE,OIL,MATURE PRODUCTION,SALT/PASSIVE MARGIN,9967,CRETACEOUS,LIMESTONE,427.0,160.0,19.0,175.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
2,GRAVITY/EXTENSION/EVAPORITE,ONSHORE,OIL,MATURE PRODUCTION,PASSIVE MARGIN,8700,CRETACEOUS,LIMESTONE,95.0,15.0,12.0,20.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,COMPRESSION,ONSHORE,OIL,DECLINING PRODUCTION,THRUST,5084,CRETACEOUS,SANDSTONE,328.0,300.0,13.0,600.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,INVERSION/COMPRESSION/EXTENSION,ONSHORE,OIL,DECLINING PRODUCTION,INVERSION/RIFT,1030,CRETACEOUS,SANDSTONE,260.0,33.0,24.0,182.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,GRAVITY/EXTENSION/EVAPORITE/SYNSEDIMENTATION,OFFSHORE,OIL,DECLINING PRODUCTION,DELTA/SUB-SALT/PASSIVE MARGIN,13265,NEOGENE,LOW-RESISTIVITY SANDSTONE,1500.0,295.0,29.0,1500.0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,304,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0
305,INVERSION/COMPRESSION/EXTENSION,OFFSHORE,OIL,DECLINING PRODUCTION,RIFT/PASSIVE MARGIN,1657,CRETACEOUS,LOW-RESISTIVITY SANDSTONE,164.0,98.0,32.0,7500.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,305,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
306,COMPRESSION/EVAPORITE,ONSHORE,OIL,CONTINUING DEVELOPMENT,FORELAND,10211,CRETACEOUS,CHALKY LIMESTONE,328.0,213.0,13.0,0.8,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,306,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
307,INVERSION/COMPRESSION/EXTENSION/EVAPORITE,OFFSHORE,GAS-CONDENSATE,PLATEAU PRODUCTION,SALT/RIFT,16360,JURASSIC,SANDSTONE,980.0,490.0,16.0,10.0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,307,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0


In [None]:
# Print the class frequencies of the remaining single-valued categorical
# columns, with a separator line between consecutive columns.
categorical_columns = ['Hydrocarbon type', 'Reservoir status', 'Period', 'Lithology']
for position, column_name in enumerate(categorical_columns):
    if position > 0:
        print('===============================================')
    print(train_full[column_name].value_counts())

OIL                235
GAS                 47
GAS-CONDENSATE      25
CARBON DIOXIDE       1
METHANE HYDRATE      1
Name: Hydrocarbon type, dtype: int64
DECLINING PRODUCTION        93
MATURE PRODUCTION           55
NEARLY DEPLETED             52
PLATEAU PRODUCTION          32
DEVELOPING                  21
REJUVENATING                21
UNKNOWN                     12
UNDEVELOPED                  7
CONTINUING DEVELOPMENT       6
SECOND PLATEAU PRODUTION     5
ABANDONED                    4
DEPLETED                     1
Name: Reservoir status, dtype: int64
CRETACEOUS                  83
NEOGENE                     45
JURASSIC                    41
PALEOGENE                   34
CARBONIFEROUS               25
PERMIAN                     22
DEVONIAN                    16
TRIASSIC                    10
CRETACEOUS-PALEOGENE         8
PROTEROZOIC                  5
CARBONIFEROUS-PERMIAN        4
PALEOGENE-NEOGENE            3
TRIASSIC-JURASSIC            2
ORDOVICIAN                   2
JURAS

In [None]:
# One-hot encode the single-valued categorical columns; pd.get_dummies replaces
# each listed column with '<column>_<value>' indicator columns.
one_hot_columns = ['Hydrocarbon type', 'Reservoir status', 'Period', 'Lithology']
train_full = pd.get_dummies(data=train_full, columns=one_hot_columns)
train_full

Unnamed: 0,Tectonic regime,Onshore/Offshore,Structural setting,Depth,Gross,Netpay,Porosity,Permeability,regime_BASEMENT-I,regime_COMPRESSION,regime_DIAPIR,regime_EROSION,regime_EVAPORITE,regime_EXTENSION,regime_GRAVITY,regime_INVERSION,regime_LINKED,regime_REACTIVATION,regime_SHALE,regime_STRIKE-SLIP,regime_SYNSEDIMENTATION,regime_TRANSPRESSION,regime_TRANSTENSION,regime_UPLIFT,index,Structural setting_BACKARC,Structural setting_DELTA,Structural setting_FOREARC,Structural setting_FORELAND,Structural setting_INTRACRATONIC,Structural setting_INVERSION,Structural setting_PASSIVE,Structural setting_RIFT,Structural setting_SALT,Structural setting_SUB-SALT,Structural setting_SUB-THRUST,Structural setting_THRUST,Structural setting_WRENCH,Structural setting_BACKARC.1,Structural setting_DELTA.1,...,Reservoir status_UNDEVELOPED,Reservoir status_UNKNOWN,Period_ARCHEAN,Period_CAMBRIAN,Period_CAMBRIAN-ORDOVICIAN,Period_CARBONIFEROUS,Period_CARBONIFEROUS-CRETACEOUS,Period_CARBONIFEROUS-PERMIAN,Period_CRETACEOUS,Period_CRETACEOUS-PALEOGENE,Period_DEVONIAN,Period_JURASSIC,Period_JURASSIC-CRETACEOUS,Period_MESOZOIC,Period_NEOGENE,Period_ORDOVICIAN,Period_PALEOGENE,Period_PALEOGENE-NEOGENE,Period_PALEOZOIC,Period_PERMIAN,Period_PROTEROZOIC,Period_PROTEROZOIC-CAMBRIAN,Period_TRIASSIC,Period_TRIASSIC-JURASSIC,Lithology_BASEMENT,Lithology_CHALK,Lithology_CHALKY LIMESTONE,Lithology_CHERT,Lithology_CONGLOMERATE,Lithology_DIATOMITE,Lithology_DOLOMITE,Lithology_DOLOMITIC LIMESTONE,Lithology_LIMESTONE,Lithology_LOW-RESISTIVITY SANDSTONE,Lithology_SANDSTONE,Lithology_SHALE,Lithology_SHALY SANDSTONE,Lithology_SILTSTONE,Lithology_THINLY-BEDDED SANDSTONE,Lithology_VOLCANICS
0,STRIKE-SLIP/TRANSPRESSION/BASEMENT-I,OFFSHORE,INVERSION/WRENCH,3520,2460.0,220.0,20.0,45.0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,GRAVITY/EXTENSION/EVAPORITE,OFFSHORE,SALT/PASSIVE MARGIN,9967,427.0,160.0,19.0,175.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,GRAVITY/EXTENSION/EVAPORITE,ONSHORE,PASSIVE MARGIN,8700,95.0,15.0,12.0,20.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,COMPRESSION,ONSHORE,THRUST,5084,328.0,300.0,13.0,600.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,INVERSION/COMPRESSION/EXTENSION,ONSHORE,INVERSION/RIFT,1030,260.0,33.0,24.0,182.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,GRAVITY/EXTENSION/EVAPORITE/SYNSEDIMENTATION,OFFSHORE,DELTA/SUB-SALT/PASSIVE MARGIN,13265,1500.0,295.0,29.0,1500.0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,304,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
305,INVERSION/COMPRESSION/EXTENSION,OFFSHORE,RIFT/PASSIVE MARGIN,1657,164.0,98.0,32.0,7500.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,305,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
306,COMPRESSION/EVAPORITE,ONSHORE,FORELAND,10211,328.0,213.0,13.0,0.8,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,306,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
307,INVERSION/COMPRESSION/EXTENSION/EVAPORITE,OFFSHORE,SALT/RIFT,16360,980.0,490.0,16.0,10.0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,307,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [None]:
# The raw multi-valued text columns are no longer needed once their
# indicator columns exist, so remove them.
train_full = train_full.drop(columns=['Tectonic regime', 'Structural setting'])
train_full

Unnamed: 0,Onshore/Offshore,Depth,Gross,Netpay,Porosity,Permeability,regime_BASEMENT-I,regime_COMPRESSION,regime_DIAPIR,regime_EROSION,regime_EVAPORITE,regime_EXTENSION,regime_GRAVITY,regime_INVERSION,regime_LINKED,regime_REACTIVATION,regime_SHALE,regime_STRIKE-SLIP,regime_SYNSEDIMENTATION,regime_TRANSPRESSION,regime_TRANSTENSION,regime_UPLIFT,index,Structural setting_BACKARC,Structural setting_DELTA,Structural setting_FOREARC,Structural setting_FORELAND,Structural setting_INTRACRATONIC,Structural setting_INVERSION,Structural setting_PASSIVE,Structural setting_RIFT,Structural setting_SALT,Structural setting_SUB-SALT,Structural setting_SUB-THRUST,Structural setting_THRUST,Structural setting_WRENCH,Structural setting_BACKARC.1,Structural setting_DELTA.1,Structural setting_FOREARC.1,Structural setting_FORELAND.1,...,Reservoir status_UNDEVELOPED,Reservoir status_UNKNOWN,Period_ARCHEAN,Period_CAMBRIAN,Period_CAMBRIAN-ORDOVICIAN,Period_CARBONIFEROUS,Period_CARBONIFEROUS-CRETACEOUS,Period_CARBONIFEROUS-PERMIAN,Period_CRETACEOUS,Period_CRETACEOUS-PALEOGENE,Period_DEVONIAN,Period_JURASSIC,Period_JURASSIC-CRETACEOUS,Period_MESOZOIC,Period_NEOGENE,Period_ORDOVICIAN,Period_PALEOGENE,Period_PALEOGENE-NEOGENE,Period_PALEOZOIC,Period_PERMIAN,Period_PROTEROZOIC,Period_PROTEROZOIC-CAMBRIAN,Period_TRIASSIC,Period_TRIASSIC-JURASSIC,Lithology_BASEMENT,Lithology_CHALK,Lithology_CHALKY LIMESTONE,Lithology_CHERT,Lithology_CONGLOMERATE,Lithology_DIATOMITE,Lithology_DOLOMITE,Lithology_DOLOMITIC LIMESTONE,Lithology_LIMESTONE,Lithology_LOW-RESISTIVITY SANDSTONE,Lithology_SANDSTONE,Lithology_SHALE,Lithology_SHALY SANDSTONE,Lithology_SILTSTONE,Lithology_THINLY-BEDDED SANDSTONE,Lithology_VOLCANICS
0,OFFSHORE,3520,2460.0,220.0,20.0,45.0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,OFFSHORE,9967,427.0,160.0,19.0,175.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,ONSHORE,8700,95.0,15.0,12.0,20.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,ONSHORE,5084,328.0,300.0,13.0,600.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,ONSHORE,1030,260.0,33.0,24.0,182.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,OFFSHORE,13265,1500.0,295.0,29.0,1500.0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,304,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
305,OFFSHORE,1657,164.0,98.0,32.0,7500.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,305,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
306,ONSHORE,10211,328.0,213.0,13.0,0.8,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,306,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
307,OFFSHORE,16360,980.0,490.0,16.0,10.0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,307,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the textual target classes with integer codes. The fitted encoder is
# kept in `labelencoder` so the codes can be mapped back to class names.
labelencoder = LabelEncoder()
target_labels = train_full['Onshore/Offshore']
labelencoder.fit(target_labels)
train_full['Onshore/Offshore'] = labelencoder.transform(target_labels)
train_full

Unnamed: 0,Onshore/Offshore,Depth,Gross,Netpay,Porosity,Permeability,regime_BASEMENT-I,regime_COMPRESSION,regime_DIAPIR,regime_EROSION,regime_EVAPORITE,regime_EXTENSION,regime_GRAVITY,regime_INVERSION,regime_LINKED,regime_REACTIVATION,regime_SHALE,regime_STRIKE-SLIP,regime_SYNSEDIMENTATION,regime_TRANSPRESSION,regime_TRANSTENSION,regime_UPLIFT,index,Structural setting_BACKARC,Structural setting_DELTA,Structural setting_FOREARC,Structural setting_FORELAND,Structural setting_INTRACRATONIC,Structural setting_INVERSION,Structural setting_PASSIVE,Structural setting_RIFT,Structural setting_SALT,Structural setting_SUB-SALT,Structural setting_SUB-THRUST,Structural setting_THRUST,Structural setting_WRENCH,Structural setting_BACKARC.1,Structural setting_DELTA.1,Structural setting_FOREARC.1,Structural setting_FORELAND.1,...,Reservoir status_UNDEVELOPED,Reservoir status_UNKNOWN,Period_ARCHEAN,Period_CAMBRIAN,Period_CAMBRIAN-ORDOVICIAN,Period_CARBONIFEROUS,Period_CARBONIFEROUS-CRETACEOUS,Period_CARBONIFEROUS-PERMIAN,Period_CRETACEOUS,Period_CRETACEOUS-PALEOGENE,Period_DEVONIAN,Period_JURASSIC,Period_JURASSIC-CRETACEOUS,Period_MESOZOIC,Period_NEOGENE,Period_ORDOVICIAN,Period_PALEOGENE,Period_PALEOGENE-NEOGENE,Period_PALEOZOIC,Period_PERMIAN,Period_PROTEROZOIC,Period_PROTEROZOIC-CAMBRIAN,Period_TRIASSIC,Period_TRIASSIC-JURASSIC,Lithology_BASEMENT,Lithology_CHALK,Lithology_CHALKY LIMESTONE,Lithology_CHERT,Lithology_CONGLOMERATE,Lithology_DIATOMITE,Lithology_DOLOMITE,Lithology_DOLOMITIC LIMESTONE,Lithology_LIMESTONE,Lithology_LOW-RESISTIVITY SANDSTONE,Lithology_SANDSTONE,Lithology_SHALE,Lithology_SHALY SANDSTONE,Lithology_SILTSTONE,Lithology_THINLY-BEDDED SANDSTONE,Lithology_VOLCANICS
0,0,3520,2460.0,220.0,20.0,45.0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,9967,427.0,160.0,19.0,175.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,1,8700,95.0,15.0,12.0,20.0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,1,5084,328.0,300.0,13.0,600.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,1,1030,260.0,33.0,24.0,182.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,0,13265,1500.0,295.0,29.0,1500.0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,0,0,304,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
305,0,1657,164.0,98.0,32.0,7500.0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,305,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
306,1,10211,328.0,213.0,13.0,0.8,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,306,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
307,0,16360,980.0,490.0,16.0,10.0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,307,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


# **Балансировка данных**

В данной работе использовали метод oversampling c помощью SMOTE.

In [None]:
# Class balance of the integer-encoded target before oversampling.
train_full['Onshore/Offshore'].value_counts()

1    211
0     93
2      5
Name: Onshore/Offshore, dtype: int64

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes with SMOTE so every class has as many rows
# as the largest one.
# NOTE: the name `os` shadows the standard-library module of the same name;
# it is kept unchanged because other cells may refer to it.
os = SMOTE(random_state=0, k_neighbors=3)

# Split the prepared table into the feature matrix and the target vector.
X_train_full = train_full.drop(['Onshore/Offshore'], axis=1)
y_train_full = train_full['Onshore/Offshore']

column = X_train_full.columns

print("До балансировки")
print(X_train_full.shape)
print(y_train_full.value_counts())

# `fit_resample` is the supported entry point; the old `fit_sample` alias was
# deprecated and is no longer available in recent imbalanced-learn releases.
# NOTE(review): resampling happens before the train/test split made later in
# the notebook, so synthetic rows can be interpolated from rows that end up in
# the hold-out part — the reported accuracy is likely optimistic.
os_data_X, os_data_y = os.fit_resample(X_train_full, y_train_full)
# Wrap the results in DataFrames with the original column names (works whether
# the sampler returned NumPy arrays or pandas objects).
os_data_X = pd.DataFrame(data=os_data_X, columns=column)
os_data_y = pd.DataFrame(data=os_data_y, columns=['Onshore/Offshore'])

print("После балансировки")
print(os_data_X.shape)
print(os_data_y.value_counts())

До балансировки
(309, 116)
1    211
0     93
2      5
Name: Onshore/Offshore, dtype: int64
После балансировки
(633, 116)
Onshore/Offshore
2                   211
1                   211
0                   211
dtype: int64




In [None]:
# Display the oversampled feature table (original rows followed by synthetic ones,
# judging by the recorded output — the trailing rows hold fractional values).
os_data_X

Unnamed: 0,Depth,Gross,Netpay,Porosity,Permeability,regime_BASEMENT-I,regime_COMPRESSION,regime_DIAPIR,regime_EROSION,regime_EVAPORITE,regime_EXTENSION,regime_GRAVITY,regime_INVERSION,regime_LINKED,regime_REACTIVATION,regime_SHALE,regime_STRIKE-SLIP,regime_SYNSEDIMENTATION,regime_TRANSPRESSION,regime_TRANSTENSION,regime_UPLIFT,index,Structural setting_BACKARC,Structural setting_DELTA,Structural setting_FOREARC,Structural setting_FORELAND,Structural setting_INTRACRATONIC,Structural setting_INVERSION,Structural setting_PASSIVE,Structural setting_RIFT,Structural setting_SALT,Structural setting_SUB-SALT,Structural setting_SUB-THRUST,Structural setting_THRUST,Structural setting_WRENCH,Structural setting_BACKARC.1,Structural setting_DELTA.1,Structural setting_FOREARC.1,Structural setting_FORELAND.1,Structural setting_INTRACRATONIC.1,...,Reservoir status_UNDEVELOPED,Reservoir status_UNKNOWN,Period_ARCHEAN,Period_CAMBRIAN,Period_CAMBRIAN-ORDOVICIAN,Period_CARBONIFEROUS,Period_CARBONIFEROUS-CRETACEOUS,Period_CARBONIFEROUS-PERMIAN,Period_CRETACEOUS,Period_CRETACEOUS-PALEOGENE,Period_DEVONIAN,Period_JURASSIC,Period_JURASSIC-CRETACEOUS,Period_MESOZOIC,Period_NEOGENE,Period_ORDOVICIAN,Period_PALEOGENE,Period_PALEOGENE-NEOGENE,Period_PALEOZOIC,Period_PERMIAN,Period_PROTEROZOIC,Period_PROTEROZOIC-CAMBRIAN,Period_TRIASSIC,Period_TRIASSIC-JURASSIC,Lithology_BASEMENT,Lithology_CHALK,Lithology_CHALKY LIMESTONE,Lithology_CHERT,Lithology_CONGLOMERATE,Lithology_DIATOMITE,Lithology_DOLOMITE,Lithology_DOLOMITIC LIMESTONE,Lithology_LIMESTONE,Lithology_LOW-RESISTIVITY SANDSTONE,Lithology_SANDSTONE,Lithology_SHALE,Lithology_SHALY SANDSTONE,Lithology_SILTSTONE,Lithology_THINLY-BEDDED SANDSTONE,Lithology_VOLCANICS
0,3520.000000,2460.000000,220.000000,20.000000,45.000000,1.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.000000,0.0,1.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.000000,0.0
1,9967.000000,427.000000,160.000000,19.000000,175.000000,0.000000,0.000000,0.0,0.000000,1.0,1.000000,1.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,1.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,1.0,0.000000,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
2,8700.000000,95.000000,15.000000,12.000000,20.000000,0.000000,0.000000,0.0,0.000000,1.0,1.000000,1.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,2.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,1.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
3,5084.000000,328.000000,300.000000,13.000000,600.000000,0.000000,1.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,3.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.000000,0.0
4,1030.000000,260.000000,33.000000,24.000000,182.000000,0.000000,1.000000,0.0,0.000000,0.0,1.000000,0.0,1.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,4.000000,0.0,0.0,0.0,0.000000,0.0,1.000000,0.0,1.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,2820.799368,770.284001,354.500500,26.008620,354.315399,0.882860,0.117140,0.0,0.117140,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.882860,0.0,0.882860,0.0,0.0,184.160240,0.0,0.0,0.0,0.117140,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.882860,0.0,0.0,0.0,0.117140,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117140,0.0,0.0,0.0,0.0,0.0,0.882860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117140,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.882860,0.0
629,6150.006139,1105.767392,187.468864,12.754359,32.556045,0.000000,1.000000,0.0,0.749377,0.0,0.250623,0.0,0.250623,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,139.573481,0.0,0.0,0.0,0.749377,0.0,0.250623,0.0,0.250623,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.749377,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.749377,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.749377,0.0,0.0,0.0,0.250623,0.0,0.0,0.0,0.000000,0.0
630,6035.149280,1049.994168,183.870591,13.258117,39.032935,0.000000,1.000000,0.0,0.677412,0.0,0.322588,0.0,0.322588,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,148.065404,0.0,0.0,0.0,0.677412,0.0,0.322588,0.0,0.322588,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.677412,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.677412,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.322588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.677412,0.0,0.0,0.0,0.322588,0.0,0.0,0.0,0.000000,0.0
631,2824.205488,666.824216,332.345421,26.104241,343.127227,0.810424,0.189576,0.0,0.000000,0.0,0.189576,0.0,0.189576,0.0,0.0,0.0,0.810424,0.0,0.810424,0.0,0.0,200.445581,0.0,0.0,0.0,0.000000,0.0,0.189576,0.0,0.189576,0.0,0.0,0.0,0.0,0.810424,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.810424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.189576,0.0,0.0,0.0,0.810424,0.0


# Еще одна обработка признаков

Нормализуем наши данные
- MinMaxScaler
- StandardScaler
и т. д.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature of os_data_X into the [0, 1] interval.
# The fitted scaler is kept in `scaler`; the result is a NumPy array.
scaler = MinMaxScaler(feature_range=(0, 1))
os_data_X_norm = scaler.fit_transform(os_data_X)

# Show the normalized array as the cell output.
os_data_X_norm

array([[0.18508132, 0.2334827 , 0.07392473, ..., 0.        , 0.        ,
        0.        ],
       [0.54666293, 0.0396606 , 0.05376344, ..., 0.        , 0.        ,
        0.        ],
       [0.47560292, 0.00800839, 0.00504032, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.3261441 , 0.0990556 , 0.06178447, ..., 0.        , 0.        ,
        0.        ],
       [0.14605751, 0.06252495, 0.11167521, ..., 0.        , 0.81042409,
        0.        ],
       [0.17936881, 0.04797372, 0.04604689, ..., 0.        , 0.        ,
        0.        ]])

# Строим нашу модель

Будем использовать метод опорных векторов

In [None]:
# os_data_X_norm - feature matrix (balanced, normalized and preprocessed)
# os_data_y - target vector

import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    os_data_X_norm, os_data_y, test_size=0.3, random_state=1
)

SVC_model = SVC()
# Flatten the target to 1-D before fitting: passing it as a column vector makes
# scikit-learn emit a DataConversionWarning (the "y = column_or_1d(y, warn=True)"
# message in this cell's output).
SVC_model.fit(X_train, np.ravel(y_train))
# NOTE: the misspelled name `SCV_predict` is kept in case later cells use it.
SCV_predict = SVC_model.predict(X_test)

# Quality on the held-out part: confusion matrix and accuracy.
print(confusion_matrix(y_test, SCV_predict))
print(accuracy_score(y_test, SCV_predict))

# GridSearch, RandomSearch, RFE, KFold and cross_val_score

[[58  5  0]
 [ 9 51  0]
 [ 0  0 67]]
0.9263157894736842


  y = column_or_1d(y, warn=True)


# Тестовый набор данных.

In [None]:
# One-hot encode the '/'-separated 'Tectonic regime' labels of the test set.
regime_flags = test['Tectonic regime'].str.get_dummies(sep='/')
# Keep only the first whitespace-separated word of every label.
regime_flags.columns = regime_flags.columns.str.split().str[0]
regime_flags = regime_flags.add_prefix('regime_').reset_index(drop=True)
# Append the indicator columns to the original test columns.
test_full = pd.concat([test, regime_flags], axis=1)

In [None]:
# One-hot encode the '/'-separated 'Structural setting' labels of the test set.
# Fix: the first statement used to assign to `эtmp` (stray Cyrillic letter),
# so the following line read an undefined or stale `tmp`.
tmp = test_full['Structural setting'].str.get_dummies('/')
# Keep only the first word of each label (e.g. 'PASSIVE MARGIN' -> 'PASSIVE').
tmp.columns = tmp.columns.str.split().str[0]
tmp = tmp.add_prefix('Structural setting_').reset_index(drop=True)
# Append the indicator columns to the frame built so far.
test_full = pd.concat([test_full, tmp], axis=1)

In [None]:
# One-hot encode the remaining single-valued categorical columns.
categorical_cols = ['Hydrocarbon type', 'Reservoir status', 'Period', 'Lithology']
test_full = pd.get_dummies(test_full, columns=categorical_cols)
# The two multi-label source columns are no longer needed.
test_full = test_full.drop(columns=['Tectonic regime', 'Structural setting'])

# Show the resulting frame as the cell output.
test_full

Unnamed: 0,Depth,Gross,Netpay,Porosity,Permeability,regime_BASEMENT-I,regime_COMPRESSION,regime_DIAPIR,regime_EROSION,regime_EVAPORITE,regime_EXTENSION,regime_GRAVITY,regime_INVERSION,regime_LINKED,regime_SHALE,regime_STRIKE-SLIP,regime_SYNSEDIMENTATION,regime_TRANSPRESSION,regime_TRANSTENSION,Structural setting_BACKARC,Structural setting_DELTA,Structural setting_FOREARC,Structural setting_FORELAND,Structural setting_INTRACRATONIC,Structural setting_INVERSION,Structural setting_PASSIVE,Structural setting_RIFT,Structural setting_SALT,Structural setting_SUB-SALT,Structural setting_SUB-THRUST,Structural setting_THRUST,Structural setting_WRENCH,Hydrocarbon type_BITUMEN,Hydrocarbon type_CARBON DIOXIDE,Hydrocarbon type_GAS,Hydrocarbon type_GAS-CONDENSATE,Hydrocarbon type_OIL,Reservoir status_ABANDONED,Reservoir status_CONTINUING DEVELOPMENT,Reservoir status_DECLINING PRODUCTION,...,Reservoir status_MATURE PRODUCTION,Reservoir status_NEARLY DEPLETED,Reservoir status_PLATEAU PRODUCTION,Reservoir status_REJUVENATING,Reservoir status_SECOND PLATEAU PRODUTION,Reservoir status_TEMPORARILY SHUT-IN,Reservoir status_UNDEVELOPED,Reservoir status_UNKNOWN,Period_CAMBRIAN-ORDOVICIAN,Period_CAMBRIAN-ORDOVICIAN/CARBONIFEROUS,Period_CARBONIFEROUS,Period_CARBONIFEROUS-PERMIAN,Period_CRETACEOUS,Period_CRETACEOUS-PALEOGENE,Period_DEVONIAN,Period_DEVONIAN-CARBONIFEROUS,Period_DEVONIAN-PERMIAN,Period_JURASSIC,Period_NEOGENE,Period_ORDOVICIAN,Period_PALEOGENE,Period_PALEOGENE-NEOGENE,Period_PALEOZOIC-CRETACEOUS,Period_PERMIAN,Period_PERMIAN-TRIASSIC,Period_PROTEROZOIC,Period_SILURIAN,Period_TRIASSIC,Period_TRIASSIC-JURASSIC,Lithology_CHALK,Lithology_CHALKY LIMESTONE,Lithology_DOLOMITE,Lithology_DOLOMITIC LIMESTONE,Lithology_GLAUCONITIC SANDSTONE,Lithology_LIMESTONE,Lithology_LOW-RESISTIVITY SANDSTONE,Lithology_SANDSTONE,Lithology_SHALY SANDSTONE,Lithology_SILTSTONE,Lithology_THINLY-BEDDED SANDSTONE
0,2275,325.0,30.0,13.0,0.04,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,500,40.0,18.0,28.0,240.00,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,5548,200.0,20.0,13.0,7.30,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,10100,8200.0,260.0,18.0,100.00,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,8750,140.0,70.0,12.0,125.00,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,5520,630.0,394.0,26.0,1000.00,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
129,1500,100.0,82.0,28.0,440.00,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
130,11100,200.0,150.0,20.0,75.00,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
131,3939,410.0,20.0,28.0,1000.00,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


Для набора данных train после всех преобразований у нас получилось 116 признаков.
Для набора данных test после всех преобразований у нас получился 81 признак.

In [None]:
train_cols = train_full.columns
test_cols = test_full.columns

print(len(train_cols))
print(len(test_cols))

# cols - the features that are present in both train and test.
# Built as a list in train column order rather than a set: pandas >= 2.0
# rejects a set as a DataFrame indexer, and a set has no stable order, which
# would make the column order of any frame selected with it arbitrary.
test_col_names = set(test_cols)
cols = [name for name in train_cols.unique() if name in test_col_names]
print(len(cols))

117
81
72


In [None]:
# Keep only the features shared by train and test.
# `cols` is converted to a list because pandas >= 2.0 does not accept a set
# as a DataFrame indexer (a list passes through unchanged).
train_full_72 = X_train_full[list(cols)]
train_full_72.shape

(309, 98)

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Features shared by train and test, in one fixed order that is used for both
# frames so the model sees the same column layout at fit and predict time.
# NOTE(review): assumes os_data_X is a DataFrame that carries the train feature
# names - confirm against the cell that builds the balanced data.
shared_cols = [name for name in os_data_X.columns if name in cols]

# Take the shared columns from the balanced data. The labels for os_data_X_72
# are in os_data_y.
# Fix: this used to select from train_full, whose row count differs from the
# balanced os_data_y, so fit() failed with a ValueError.
os_data_X_72 = os_data_X[shared_cols]
test_full_72 = test_full[shared_cols]

# Normalize train and test with one scaler fitted on the training features only
# (this step was missing before).
scaler_72 = MinMaxScaler(feature_range=(0, 1))
os_data_X_72_norm = scaler_72.fit_transform(os_data_X_72)
test_full_72_norm = scaler_72.transform(test_full_72)

SVC_model = SVC()
# Flatten the target to 1-D to avoid scikit-learn's DataConversionWarning.
SVC_model.fit(os_data_X_72_norm, np.ravel(os_data_y))
answers = SVC_model.predict(test_full_72_norm)
# The original misspelled name is kept as an alias in case later cells use it.
ansewrs = answers

ansewrs

  y = column_or_1d(y, warn=True)


ValueError: ignored