In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.preprocessing import MinMaxScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [232]:
class MulticlassClassification(nn.Module):
    """Fully connected classifier with three hidden layers and a linear output head.

    Every hidden layer is followed by batch normalisation and ReLU. Dropout
    (p=0.7) is applied after the second and third hidden layers only.
    ``forward`` returns the raw output of ``layer_out``; no softmax is applied.

    The attribute names double as state-dict keys for saved checkpoints, so
    they must stay as they are.
    """

    def __init__(self, num_feature, num_class):
        """Create the layers.

        Args:
            num_feature: size of the last dimension of the input.
            num_class: number of output units of ``layer_out``.
        """
        super().__init__()

        # Linear stack: num_feature, 528, 256, 64, num_class.
        self.layer_1 = nn.Linear(in_features=num_feature, out_features=528)
        self.layer_2 = nn.Linear(in_features=528, out_features=256)
        self.layer_3 = nn.Linear(in_features=256, out_features=64)
        self.layer_out = nn.Linear(in_features=64, out_features=num_class)

        # Shared activation / regularisation modules.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.7)

        # One batch-norm per hidden layer, sized to that layer's width.
        self.batchnorm1 = nn.BatchNorm1d(num_features=528)
        self.batchnorm2 = nn.BatchNorm1d(num_features=256)
        self.batchnorm3 = nn.BatchNorm1d(num_features=64)

    def forward(self, x):
        """Run ``x`` through the three hidden stages and return ``layer_out``'s output."""
        # Stage 1: linear, batch-norm, ReLU (no dropout here).
        hidden = self.relu(self.batchnorm1(self.layer_1(x)))
        # Stages 2 and 3: linear, batch-norm, ReLU, dropout.
        hidden = self.dropout(self.relu(self.batchnorm2(self.layer_2(hidden))))
        hidden = self.dropout(self.relu(self.batchnorm3(self.layer_3(hidden))))
        return self.layer_out(hidden)


# GOOGLE
# class MulticlassClassification(nn.Module):
#     def __init__(self, num_feature, num_class):
#         super(MulticlassClassification, self).__init__()
        
#         self.layer_1 = nn.Linear(num_feature, 256)
#         self.layer_2 = nn.Linear(256, 128)
#         self.layer_3 = nn.Linear(128, 64)
#         self.layer_out = nn.Linear(64, num_class) 
        
#         self.relu = nn.ReLU()
#         self.dropout = nn.Dropout(p=0.5)
#         self.batchnorm1 = nn.BatchNorm1d(256)
#         self.batchnorm2 = nn.BatchNorm1d(128)
#         self.batchnorm3 = nn.BatchNorm1d(64)
        
#     def forward(self, x):
#         x = self.layer_1(x)
#         x = self.batchnorm1(x)
#         x = self.relu(x)
        
#         x = self.layer_2(x)
#         x = self.batchnorm2(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        
        
#         x = self.layer_3(x)
#         x = self.batchnorm3(x)
#         x = self.relu(x)
#         x = self.dropout(x)
        
#         x = self.layer_out(x)
        
#         return x

In [233]:
# Read the training table from a CSV located one directory above the notebook.
train_dataset = pd.read_csv('../Train-dataset.csv')
# Show the first rows as the cell output.
train_dataset.head()

Unnamed: 0,WELL,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT,LITH_NAME,LITH_CODE
0,Well-1,0.8179,2.9814,1602.0,83.939,3.166,0.25,2.344,Marine,Marly sandstone,1200
1,Well-1,0.8179,2.9814,1602.1,84.166,3.135,0.246,2.352,Marine,Marly sandstone,1200
2,Well-1,0.8179,2.9814,1602.2,85.055,3.089,0.244,2.352,Marine,Marly sandstone,1200
3,Well-1,0.8179,2.9814,1602.3,86.352,3.042,0.242,2.355,Marine,Sandy marl,1300
4,Well-1,0.8179,2.9814,1602.4,87.614,3.003,0.241,2.369,Marine,Sandy marl,1300


In [234]:
# Discard the well identifier and the textual lithology name; the numeric
# LITH_CODE column is kept.
train_dataset = train_dataset.drop(columns=['WELL', 'LITH_NAME'])
train_dataset.head()

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,DEPOSITIONAL_ENVIRONMENT,LITH_CODE
0,0.8179,2.9814,1602.0,83.939,3.166,0.25,2.344,Marine,1200
1,0.8179,2.9814,1602.1,84.166,3.135,0.246,2.352,Marine,1200
2,0.8179,2.9814,1602.2,85.055,3.089,0.244,2.352,Marine,1200
3,0.8179,2.9814,1602.3,86.352,3.042,0.242,2.355,Marine,1300
4,0.8179,2.9814,1602.4,87.614,3.003,0.241,2.369,Marine,1300


In [235]:
# One-hot encode DEPOSITIONAL_ENVIRONMENT into ENV0..ENV{k-1} columns.
# scikit-learn renamed the dense-output keyword from `sparse` to
# `sparse_output` and later removed the old spelling, so try the new name
# first and fall back for older releases.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:  # older scikit-learn only understands `sparse`
    ohe = OneHotEncoder(sparse=False)
new_ohe_features = ohe.fit_transform(train_dataset.DEPOSITIONAL_ENVIRONMENT.values.reshape(-1,1))

# Reuse the source index so the column-wise concat pairs rows by label even if
# train_dataset does not carry a default 0..n-1 index.
tmp = pd.DataFrame(new_ohe_features,
                   columns=['ENV' + str(i) for i in range(new_ohe_features.shape[1])],
                   index=train_dataset.index)
train_dataset = pd.concat([train_dataset, tmp], axis=1)
train_dataset = train_dataset.drop(['DEPOSITIONAL_ENVIRONMENT'], axis=1)

# Move the label column to the end *by name*. The earlier positional slicing
# assumed exactly 7 numeric features and 3 environment categories and would
# silently drop columns otherwise.
cols = [col for col in train_dataset.columns.tolist() if col != 'LITH_CODE'] + ['LITH_CODE']
train_dataset = train_dataset[cols]
train_dataset

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
0,0.8179,2.9814,1602.0,83.9390,3.1660,0.25000,2.3440,0.0,1.0,0.0,1200
1,0.8179,2.9814,1602.1,84.1660,3.1350,0.24600,2.3520,0.0,1.0,0.0,1200
2,0.8179,2.9814,1602.2,85.0550,3.0890,0.24400,2.3520,0.0,1.0,0.0,1200
3,0.8179,2.9814,1602.3,86.3520,3.0420,0.24200,2.3550,0.0,1.0,0.0,1300
4,0.8179,2.9814,1602.4,87.6140,3.0030,0.24100,2.3690,0.0,1.0,0.0,1300
...,...,...,...,...,...,...,...,...,...,...,...
45744,3.0000,0.0000,2275.2,103.6016,1.0789,0.29558,2.3783,0.0,1.0,0.0,400
45745,3.0000,0.0000,2275.3,102.8472,1.0683,0.29264,2.3651,0.0,1.0,0.0,400
45746,3.0000,0.0000,2275.4,102.5699,1.0790,0.29425,2.3531,0.0,1.0,0.0,400
45747,3.0000,0.0000,2275.5,102.7901,1.1045,0.30096,2.3430,0.0,1.0,0.0,400


In [236]:
# Preliminary stratified split: features are every column but the last, the
# label is the last column. X_train from this split is what the scaler in the
# next cell is fitted on.
X = train_dataset.iloc[:, :-1]
y = train_dataset.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.67, random_state=6, stratify=y
)

In [237]:
# Fit the standardiser on the training split only, so its statistics do not
# include the held-out rows.
scaler = preprocessing.StandardScaler().fit(X_train)
# Apply it to every row and overwrite the feature columns of train_dataset in
# place (the last column, the label, is left untouched). This includes the
# one-hot ENV columns, as the scaled values in the cell output show.
train_dataset.iloc[:,0:-1] = scaler.transform(train_dataset.iloc[:,0:-1])
train_dataset

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
0,-0.837657,1.409368,0.808620,-0.058873,-0.029915,-1.251057,0.933553,-0.794124,2.082126,-0.861181,1200
1,-0.837657,1.409368,0.808879,-0.044573,-0.030352,-1.303434,0.986150,-0.794124,2.082126,-0.861181,1200
2,-0.837657,1.409368,0.809139,0.011430,-0.031001,-1.329622,0.986150,-0.794124,2.082126,-0.861181,1200
3,-0.837657,1.409368,0.809398,0.093134,-0.031663,-1.355811,1.005873,-0.794124,2.082126,-0.861181,1300
4,-0.837657,1.409368,0.809657,0.172634,-0.032213,-1.368905,1.097918,-0.794124,2.082126,-0.861181,1300
...,...,...,...,...,...,...,...,...,...,...,...
45744,1.311023,-1.301154,2.553478,1.179770,-0.059345,-0.654222,1.159061,-0.794124,2.082126,-0.861181,400
45745,1.311023,-1.301154,2.553737,1.132247,-0.059495,-0.692719,1.072277,-0.794124,2.082126,-0.861181,400
45746,1.311023,-1.301154,2.553996,1.114778,-0.059344,-0.671637,0.993382,-0.794124,2.082126,-0.861181,400
45747,1.311023,-1.301154,2.554255,1.128650,-0.058984,-0.583775,0.926978,-0.794124,2.082126,-0.861181,400


In [238]:
# Split again, now on the standardised table. The seed, train fraction and
# stratification labels match the preliminary split, so the partition should
# contain the same rows as the one the scaler was fitted on.
X = train_dataset.iloc[:, :-1]
y = train_dataset.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.67, random_state=6, stratify=y
)

In [239]:
# Put the label column back next to the training features (label stays last).
train = pd.concat((X_train, y_train), axis="columns")
train

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
45566,1.311023,-1.301154,2.507342,1.855667,-0.053474,0.107469,0.727768,-0.794124,2.082126,-0.861181,300
43776,1.311023,-1.301154,2.043395,0.649467,-0.015290,-1.126138,1.599559,-0.794124,2.082126,-0.861181,400
12212,-0.898215,0.501951,-0.461145,-0.175186,0.010588,-0.712491,-0.418840,1.259250,-0.480278,-0.861181,300
39472,1.311023,-1.301154,0.927847,-0.109842,-0.023590,-0.704373,0.605481,-0.794124,-0.480278,1.161196,1300
29800,1.311023,-1.301154,-1.579025,-0.661507,0.007375,0.602693,-0.829094,1.259250,-0.480278,-0.861181,600
...,...,...,...,...,...,...,...,...,...,...,...
7300,-1.038729,1.426278,-0.293709,0.392001,-0.029005,0.038597,0.660366,-0.794124,-0.480278,1.161196,300
34800,1.311023,-1.301154,-0.283082,0.839264,-0.026998,-0.167772,0.085431,1.259250,-0.480278,-0.861181,100
14313,-0.898215,0.501951,0.083411,0.238337,-0.025335,-0.431358,0.274122,-0.794124,-0.480278,1.161196,1300
9884,-0.898215,0.501951,-1.064536,0.981141,-0.029764,0.485761,-0.543099,1.259250,-0.480278,-0.861181,100


In [240]:
# Put the label column back next to the held-out features (label stays last).
test = pd.concat((X_test, y_test), axis="columns")
test

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
29062,-0.352207,0.600048,0.875965,0.961902,-0.030282,-0.725193,1.456035,-0.794124,2.082126,-0.861181,400
4386,-1.038729,1.426278,-1.049762,-0.598329,-0.014223,0.440802,-0.381082,1.259250,-0.480278,-0.861181,500
20596,-0.352207,0.600048,-1.318325,0.885488,-0.030683,2.195082,-1.360649,1.259250,-0.480278,-0.861181,100
11424,-0.898215,0.501951,-0.665385,-0.839883,0.037464,-1.379380,-1.820542,1.259250,-0.480278,-0.861181,600
10682,-0.898215,0.501951,-0.857703,-0.602732,0.022184,-0.959841,-0.499050,1.259250,-0.480278,-0.861181,500
...,...,...,...,...,...,...,...,...,...,...,...
25615,-0.352207,0.600048,-0.017458,-2.291862,-0.014963,0.474107,-1.075707,-0.794124,-0.480278,1.161196,600
39221,1.311023,-1.301154,0.862791,0.458970,-0.038515,0.114409,0.585100,-0.794124,-0.480278,1.161196,100
43077,1.311023,-1.301154,1.862222,1.172186,-0.058252,0.578076,0.802061,-0.794124,2.082126,-0.861181,300
9505,-0.898215,0.501951,-1.162768,-0.893120,0.018092,-0.177723,-0.309701,1.259250,-0.480278,-0.861181,600


In [241]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import sys
import torchvision
# Turn off the cuDNN backend for this session.
# NOTE(review): the notebook does not record why this is needed; confirm it is still required.
torch.backends.cudnn.enabled = False

In [242]:
def get_device():
    """Return the first CUDA device when CUDA is available, otherwise the CPU device."""
    target = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return torch.device(target)

# Pick the compute device once; later cells refer to this `device`.
device = get_device()
# Report the chosen device, CUDA availability and the CUDA device count, one per line.
print(device, torch.cuda.is_available(), torch.cuda.device_count(), sep='\n')

cuda:0
True
1


In [243]:
# Raw LITH_CODE value mapped to a contiguous class index (0..12).
class2idx = {
    100:0,
    200:1,
    300:2,
    400:3,
    500:4,
    600:5,
    800:6,
    1000:7,
    1100:8,
    1200:9,
    1300:10,
    1400:11,
    1500:12
}

# Inverse lookup: class index back to the raw LITH_CODE value.
idx2class = {v: k for k, v in class2idx.items()}

# Assign the mapped column back instead of calling replace(..., inplace=True)
# on the column selection: that chained in-place call operates on a temporary
# Series and leaves `train` unchanged when pandas Copy-on-Write is active.
train['LITH_CODE'] = train['LITH_CODE'].replace(class2idx)
train


Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
45566,1.311023,-1.301154,2.507342,1.855667,-0.053474,0.107469,0.727768,-0.794124,2.082126,-0.861181,2
43776,1.311023,-1.301154,2.043395,0.649467,-0.015290,-1.126138,1.599559,-0.794124,2.082126,-0.861181,3
12212,-0.898215,0.501951,-0.461145,-0.175186,0.010588,-0.712491,-0.418840,1.259250,-0.480278,-0.861181,2
39472,1.311023,-1.301154,0.927847,-0.109842,-0.023590,-0.704373,0.605481,-0.794124,-0.480278,1.161196,10
29800,1.311023,-1.301154,-1.579025,-0.661507,0.007375,0.602693,-0.829094,1.259250,-0.480278,-0.861181,5
...,...,...,...,...,...,...,...,...,...,...,...
7300,-1.038729,1.426278,-0.293709,0.392001,-0.029005,0.038597,0.660366,-0.794124,-0.480278,1.161196,2
34800,1.311023,-1.301154,-0.283082,0.839264,-0.026998,-0.167772,0.085431,1.259250,-0.480278,-0.861181,0
14313,-0.898215,0.501951,0.083411,0.238337,-0.025335,-0.431358,0.274122,-0.794124,-0.480278,1.161196,10
9884,-0.898215,0.501951,-1.064536,0.981141,-0.029764,0.485761,-0.543099,1.259250,-0.480278,-0.861181,0


In [244]:
# Map the raw LITH_CODE values of the held-out split to class indices.
# Assign the result back rather than using a chained replace(..., inplace=True),
# which operates on a temporary Series and leaves `test` unchanged when pandas
# Copy-on-Write is active.
test['LITH_CODE'] = test['LITH_CODE'].replace(class2idx)
test

Unnamed: 0,X,Y,MD,GR,RT,CN,DEN,ENV0,ENV1,ENV2,LITH_CODE
29062,-0.352207,0.600048,0.875965,0.961902,-0.030282,-0.725193,1.456035,-0.794124,2.082126,-0.861181,3
4386,-1.038729,1.426278,-1.049762,-0.598329,-0.014223,0.440802,-0.381082,1.259250,-0.480278,-0.861181,4
20596,-0.352207,0.600048,-1.318325,0.885488,-0.030683,2.195082,-1.360649,1.259250,-0.480278,-0.861181,0
11424,-0.898215,0.501951,-0.665385,-0.839883,0.037464,-1.379380,-1.820542,1.259250,-0.480278,-0.861181,5
10682,-0.898215,0.501951,-0.857703,-0.602732,0.022184,-0.959841,-0.499050,1.259250,-0.480278,-0.861181,4
...,...,...,...,...,...,...,...,...,...,...,...
25615,-0.352207,0.600048,-0.017458,-2.291862,-0.014963,0.474107,-1.075707,-0.794124,-0.480278,1.161196,5
39221,1.311023,-1.301154,0.862791,0.458970,-0.038515,0.114409,0.585100,-0.794124,-0.480278,1.161196,0
43077,1.311023,-1.301154,1.862222,1.172186,-0.058252,0.578076,0.802061,-0.794124,2.082126,-0.861181,2
9505,-0.898215,0.501951,-1.162768,-0.893120,0.018092,-0.177723,-0.309701,1.259250,-0.480278,-0.861181,5


In [245]:
# Copy both splits (features plus label column) into tensors on the selected device.
train_tensor = torch.tensor(train.values, device=device)
test_tensor = torch.tensor(test.values, device=device)

In [246]:
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects a fully pickled module such as multiclass.pth. If this cell
# fails there, pass weights_only=False (trusted files only) or rebuild the
# model from the class and load just the state dict.
#
# map_location remaps the stored tensors onto the device chosen earlier, so a
# checkpoint saved on a GPU also loads on a CPU-only machine.
model_upgrade = torch.load('../solution/finish/multiclass.pth', map_location=device)
model_upgrade.load_state_dict(
    torch.load('../solution/finish/MulticlassClassification.pth', map_location=device)
)
# Switch dropout and batch-norm to inference behaviour; eval() returns the
# module, so the cell also displays its structure.
model_upgrade.eval()

MulticlassClassification(
  (layer_1): Linear(in_features=10, out_features=528, bias=True)
  (layer_2): Linear(in_features=528, out_features=256, bias=True)
  (layer_3): Linear(in_features=256, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=13, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.7, inplace=False)
  (batchnorm1): BatchNorm1d(528, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm3): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [247]:
# Move the loaded model to the selected device.
model_upgrade = model_upgrade.to(device)

In [248]:
# Feature count: every column of `test` except the trailing label column.
# (Not referenced by the other cells shown in this notebook.)
NUM_FEATURES = len(test.columns)-1
def net_prediction(test_str):
    """Predict the class index for a single sample.

    Uses the notebook-level ``model_upgrade`` and ``device``.

    Args:
        test_str: one row of feature values exposing ``.values`` (for example
            the pandas Series yielded by ``DataFrame.iterrows``).

    Returns:
        The index of the largest model output for that row, as a NumPy integer.
    """
    # Build the (1, n_features) float64 batch from one contiguous array.
    # Wrapping an ndarray in a Python list before torch.tensor() takes a slow
    # element-by-element path and triggers a warning.
    features = np.asarray(test_str.values, dtype=np.float64).reshape(1, -1)
    tensor_params = torch.tensor(features).to(device)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model_upgrade(tensor_params.double())
    # torch.max(..., 1) returns (values, indices); take the index of the only row.
    return torch.max(output, 1)[1].to('cpu').numpy()[0]

In [249]:
# Collect the ground-truth label and the model prediction for every held-out row.
true = []
predict = []
for str_name, data in test.iterrows():
    # Use .iloc for positional access. Plain data[-1] / data[0:-1] on a Series
    # indexed by column names relies on an integer-position fallback that
    # pandas has deprecated and newer versions no longer provide.
    data_target = data.iloc[-1]     # last column: class index
    data_features = data.iloc[:-1]  # everything before it: model inputs
    true.append(data_target)
    predict.append(net_prediction(data_features))


In [250]:
from sklearn.metrics import f1_score
# Macro-averaged F1 of the predictions against the held-out labels.
f1_score(true, predict, average = 'macro')

0.6961311177292983

In [118]:
# Wrap the list of predicted class indices in a single-column DataFrame.
lith_code = pd.DataFrame(predict)
lith_code

Unnamed: 0,0
0,3
1,4
2,0
3,5
4,4
...,...
15093,5
15094,0
15095,2
15096,5


In [120]:
# Class index back to the raw LITH_CODE value, derived from class2idx so the
# two mappings cannot drift apart. A hand-written copy of this dictionary used
# to precede this line, but it was overwritten immediately and never used.
idx2class = {v: k for k, v in class2idx.items()}

# Translate the predicted indices back to LITH_CODE values.
lith_code = lith_code.replace(idx2class)
lith_code


Unnamed: 0,0
0,400
1,500
2,100
3,600
4,500
...,...
15093,600
15094,100
15095,300
15096,600


In [137]:
# Dump the decoded predictions to foo.csv, one value per line, formatted
# without decimals.
a = lith_code.to_numpy()
np.savetxt("foo.csv", a, delimiter=",", fmt='%1.f', encoding='utf-8')

In [123]:
# NOTE(review): X_corr_train, y_corr_train and X_corr_test are not defined in
# any cell shown in this notebook, so this cell depends on leftover kernel
# state and will not run on a fresh kernel. The recorded output below (an
# AttributeError about `to_csv`) also does not match this code. Confirm where
# these inputs should come from, or remove the cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
# Random forest with default hyper-parameters.
rfc = RandomForestClassifier()
rfc.fit(X_corr_train, y_corr_train)
rfc.predict(X_corr_test)

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'