# Import outside libraries:

In [1]:
# data I/O
import pandas as pd

# neuron network
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn

# data standardization
from sklearn import preprocessing

# Construct dataset:

In [2]:
# 1. read origin data
def readData():
    dt = pd.read_csv('demofile.csv',encoding='gbk')
    # drop some useless column
    dt.drop(['xxx1','xxx2','xxx3'],axis=1,inplace=True)
    return dt
dt = readData()
# choose some useful column
dt = dt[['xxx4','xxx5']]

In [3]:
# 2. construct training set & validation set:
# construct labels
labels = np.array(dt['xxx'])
# construct features
features = np.array(dt.drop('xxx', axis=1))
# standardization
inputFeatures = preprocessing.StandardScaler().fit_transform(features)
# transform data type of labels and features from np to torch
x = torch.tensor(inputFeatures, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.int64)

dataScale = x.shape[0]   # size
valScale = int(0.2 * dataScale)   # 20% data for validation set
shuffledIndices = torch.randperm(dataScale)   # shuffle all of the data

trainIndices = shuffledIndices[:-valScale]   # indices of train data
validationIndices = shuffledIndices[-valScale:]   # indices of validation data

trainLabels = y[trainIndices]
trainFeatures = x[trainIndices]   # training set

validateLabels = y[validationIndices]
validateFeatures = x[validationIndices]   # validation set

ps: Because of the imbalance between different kinds of training data<br>
(the num of normal is 5171, warning is 314 (1/17 times of normal), landslide is 552 (1/10 times of normal))

In [4]:
# 3. balance different kinds of training data
warningIndices = []
landslideIndices  =[]

for i in range(len(trainLabels)):
    if trainLabels[i] == 1:
        warningIndices.append(shuffledIndices[i])
    elif trainLabels[i] == 2:
        landslideIndices.append(shuffledIndices[i])
    else:
        continue

wl = y[warningIndices]
wf = x[warningIndices]
ll = y[landslideIndices]
lf = x[landslideIndices]

twl = torch.cat((wl, wl, wl, wl, wl, wl, wl, wl, wl, wl, wl, wl, wl, wl, wl, wl), 0)
twf = torch.cat((wf, wf, wf, wf, wf, wf, wf, wf, wf, wf, wf, wf, wf, wf, wf, wf), 0)
tll = torch.cat((ll, ll, ll, ll, ll, ll, ll, ll, ll), 0)
tlf = torch.cat((lf, lf, lf, lf, lf, lf, lf, lf, lf), 0)

trainLabels = torch.cat((trainLabels, twl, tll), 0)
trainFeatures = torch.cat((trainFeatures, twf, tlf), 0)

si = torch.randperm(trainLabels.shape[0])
trainLabels = trainLabels[si]
trainFeatures = trainFeatures[si]

In [5]:
# 4. move to gpu
x = x.cuda()
y = y.cuda()
trainLabels = trainLabels.cuda()
trainFeatures = trainFeatures.cuda()
validateLabels = validateLabels.cuda()
validateFeatures = validateFeatures.cuda()

# Define neuron network and its params：

In [6]:
# 1. define network for classification
class ClassifyNetwork(nn.Module):
    def __init__(self, featureNum):
        super().__init__()
        
        self.num = featureNum
        
        self.net = nn.Sequential(
            nn.Linear(self.num, 50),
            nn.LeakyReLU(),
            nn.Linear(50, 60),
            nn.LeakyReLU(),
            nn.Linear(60, 50),
            nn.LeakyReLU(),
            nn.Linear(50, 3)
        )
        
    def forward(self, x):
        x = self.net(x)
        return x
    
classifier = ClassifyNetwork(x.shape[1]).to(device='cuda')
opt = optim.SGD(classifier.parameters(), lr=0.05)

In [7]:
# 2. loss function
def lossFunct(x, y):
    CELoss = torch.nn.CrossEntropyLoss()
    return CELoss(x, y)

In [8]:
# 3. network training
def training(epoches, optimizer, model, lossFunct, trainFeatures, trainLabels, validateFeatures, validateLabels):
    for epoch in range(1, epoches+1):
        trainPredictions = model(trainFeatures)
        trainLoss = lossFunct(trainPredictions, trainLabels)
        
        validatePredictions = model(validateFeatures)
        validateLoss = lossFunct(validatePredictions, validateLabels)
        
        optimizer.zero_grad()
        trainLoss.backward()
        optimizer.step()
        
        if epoch == 1 or epoch % 500 == 0 or epoch == epoches:
            print('Epoch {}, Training loss {}, Validation loss {}'.format(
                epoch, float(trainLoss), float(validateLoss)))

# Training:
(as tested before, epoches of 2500 is best)

In [9]:

training(
    epoches = 2500, 
    optimizer = opt,
    model = classifier,
    lossFunct = lossFunct,
    trainFeatures = trainFeatures,
    trainLabels = trainLabels,
    validateFeatures = validateFeatures,
    validateLabels = validateLabels
)

Epoch 1, Training loss 1.1028711795806885, Validation loss 1.1323316097259521
Epoch 500, Training loss 0.6354900598526001, Validation loss 0.6137858033180237
Epoch 1000, Training loss 0.45828181505203247, Validation loss 0.4824090898036957
Epoch 1500, Training loss 0.3360929787158966, Validation loss 0.35276153683662415
Epoch 2000, Training loss 0.25133752822875977, Validation loss 0.29698774218559265
Epoch 2500, Training loss 0.17777858674526215, Validation loss 0.22296260297298431


# Testing:

In [10]:
# 1. get predicted class:
def ans(x):
    tmp = torch.max(x, 0)[1]
    ans = 0
    if tmp == 1:
        ans = 1
    elif tmp == 2:
        ans = 2
    
    return ans

In [11]:
# 2. get total accuracy:
scale, correct = len(y), 0
for i in range(scale):
    if ans(classifier(x[i])) == y[i]:
        correct += 1
    
print(correct / scale)

0.8902729923138086


In [12]:
# 3. get warning accuracy:
scale, correct, size = 0, 0, len(y)
for i in range(size):
    if y[i] == 1:
        scale += 1
        if ans(classifier(x[i])) == 1:
            correct += 1
            
print(correct / scale)

0.9549071618037135


In [13]:
# 4. get landslide accuracy:
scale, correct, size = 0, 0, len(y)
for i in range(size):
    if y[i] == 2:
        scale += 1
        if ans(classifier(x[i])) == 2:
            correct += 1
            
print(correct / scale)

0.9653679653679653


In [19]:
# 5. get normal accuracy:
scale, correct, size = 0, 0, len(y)
for i in range(size):
    if y[i] == 0:
        scale += 1
        if ans(classifier(x[i])) == 0:
            correct += 1
            
print(correct / scale)

0.878474366893144


# Print num of different class:
(according to this answer, if we want to improve the accuracy of warning, we can increase the propertion of warning data)

In [18]:
scale = len(trainLabels)
normal,warning,landslide = 0, 0, 0
for i in range(scale):
    if trainLabels[i] == 0:
        normal += 1
    elif trainLabels[i] == 1:
        warning += 1
    else:
        landslide += 1
        
print(normal,warning,landslide)

5168 5219 5620
