<a href="https://colab.research.google.com/github/CHDS-2002/Data-Science-Technologies/blob/main/datascience.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Технологии Data Science**

## **Нейронная сеть**

Нейронная сеть - математическая модель, математическая функция, имитирующая процессы, протекающие в мозге любого живого организма.

Имеет огромное прикладное применение в науке, бизнесе, IT, медицине, экономике, различных промышленных отраслях.

Имеет достаточно примитивное строение: входной слой, скрытые слои, выходной слой.

Обучается по методу обучения с учителем: принимает данные, обрабатывает при помощи весов нейронов, выдаёт выходные данные, происходит суммирование квадратов ошибок - разностей между ожидаемым результатом и полученным; происходит корректировка весов нейронов методом обратного распространения ошибки.

Основные параметры нейронной сети - веса.
Веса каждого слоя нейронной сети представляются в виде матриц - математических объектов прямоугольной формы, состоящих из m строк и n столбцов. Могут представляться в виде тензоров.

# **Обучение нейронной сети**

Обучение нейронной сети можно произвести при помощи алгоритмов глубокого обучения: упругое распространение, обратное распространение, градиентный спуск, генетические алгоритмы.

## **Задачи классификации**

# **Решение задач классификации при помощи нейронной сети**

# **Задачи регрессии**

# **Решение задач регрессии при помощи нейронной сети**

## **Применение библиотек глубокого и машинного обучения в решении задач классификации и регрессии**

In [None]:
!pip install tensorflow

In [None]:
import numpy as np
import torchvision
import pandas as pd
import torch as tor
import sklearn as sk
from time import time
import torch.nn as nn
import tensorflow as tf
import torch.optim as optim
import torch.nn.functional as F
from torchvision.transforms import transforms
from tensorflow.keras.preprocessing import image

# **Решение задачи классификации при помощи PyTorch**

In [None]:
###################################
#                                 #
#             PyTorch             #
#                                 #
###################################

###################################
#         Classification          #
###################################

# Prepare the dataset for the classification
# Preprocessing: convert each image to a tensor, then normalize every RGB
# channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Training split (stored under ./data), served in shuffled mini-batches of 4.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = tor.utils.data.DataLoader(
    trainset, batch_size=4, shuffle=True, num_workers=2)

# Test split, served in order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = tor.utils.data.DataLoader(
    testset, batch_size=4, shuffle=False, num_workers=2)

# Human-readable class names, indexed by label id.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

#Define the CNN architecture
class Net(nn.Module):
  """Small CNN: two conv + max-pool stages followed by three linear layers."""

  def __init__(self):
    super().__init__()
    self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
    self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
    self.fc2 = nn.Linear(in_features=120, out_features=84)
    self.fc3 = nn.Linear(in_features=84, out_features=10)

  def forward(self, x):
    """Return the 10 raw (un-normalized) class scores for each input image."""
    # Convolutional stages: conv -> ReLU -> shared 2x2 max-pool.
    for conv in (self.conv1, self.conv2):
      x = self.pool(F.relu(conv(x)))
    # Flatten the 16 feature maps of 5x5 into one vector per sample.
    x = x.view(-1, 16 * 5 * 5)
    # Hidden fully connected layers with ReLU; the last layer stays linear.
    for hidden in (self.fc1, self.fc2):
      x = F.relu(hidden(x))
    return self.fc3(x)

net = Net()

# Cross-entropy on the raw class scores, optimized with SGD + momentum.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Training: two passes over the training loader.
for epoch in range(2):
  running_loss = 0.0

  for i, data in enumerate(trainloader):
    inputs, labels = data

    # Clear stale gradients, then forward pass, backward pass, update.
    optimizer.zero_grad()
    outputs = net(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

    # Report the mean loss after every 2000 mini-batches.
    if (i + 1) % 2000 == 0:
      print(f'[{epoch + 1:d}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
      running_loss = 0.0

print('Finished Training')

# Evaluation: count top-1 hits over the test loader without tracking gradients.
correct = total = 0

with tor.no_grad():
  for data in testloader:
    images, labels = data
    outputs = net(images)
    # The index of the largest score is the predicted class.
    _, predicted = tor.max(outputs.data, 1)
    total += labels.size(0)
    correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {int(100 * correct / total)} %')

###################################
#         Classification          #
###################################

###################################
#                                 #
#             PyTorch             #
#                                 #
###################################

In [None]:
!pip install bioinfokit

# **Решение задачи регрессии при помощи PyTorch**

In [None]:
###################################
#                                 #
#             PyTorch             #
#                                 #
###################################

###################################
#           Regression            #
###################################

from bioinfokit.visuz import stat
from sklearn.metrics import r2_score
from bioinfokit.analys import get_data

import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Times New Roman'

# Load the 'plant_richness_lr' example dataset through bioinfokit.
ds = get_data('plant_richness_lr').data
ds.head(10)

# Predictor (area) and response (ntv_rich) as float32 column tensors.
x = tor.tensor(ds[['area']].values, dtype=tor.float32)
y = tor.tensor(ds[['ntv_rich']].values, dtype=tor.float32)

# Simple linear regression: one independent variable, one predicted value.
in_features = 1
out_features = 1

# bias=True is the default; it is spelled out to make the intercept explicit.
regression = tor.nn.Linear(in_features, out_features, bias=True)

# Mean squared error, minimized with plain gradient descent.
mse_loss = tor.nn.MSELoss()
optimizer = tor.optim.SGD(regression.parameters(), lr=0.002)

# Full-batch training for 6000 epochs.
epochs = 6000
for i in range(epochs):
  # Forward pass with the current parameters, then the loss.
  pred_y = regression(x)
  step_loss = mse_loss(pred_y, y)

  # Zero the stored gradients, back-propagate, update the parameters.
  optimizer.zero_grad()
  step_loss.backward()
  optimizer.step()
  print(f'epoch [{i}], Loss: {step_loss.item():.2f}')

# Estimated parameters: bias b (y-intercept) and weight w (slope).
regression.bias.item()
regression.weight.item()

# detach() returns a tensor outside the autograd graph, so it can be turned
# into a NumPy array.
pred_y = regression(x).detach()
ds['yhat'] = pred_y.numpy()
stat.regplot(df=ds, x='area', y='ntv_rich', yhat='yhat')

# Model performance (coefficient of determination).
r2_score(y_true=y, y_pred=pred_y.numpy())

# Predict y (ntv_rich) for x (area) equal to 3.
area = 3
pred_y = regression(tor.tensor([[area]], dtype=tor.float32)).detach()
pred_y.item()

###################################
#           Regression            #
###################################

###################################
#                                 #
#             PyTorch             #
#                                 #
###################################

ModuleNotFoundError: No module named 'bioinfokit'

In [None]:
!wget --no-check-certificate \
    https://upload.wikimedia.org/wikipedia/commons/b/b5/Lion_d%27Afrique.jpg \
    -O /tmp/lion.jpg

# **Решение задачи классификации при помощи TensorFlow**

In [None]:
###################################
#                                 #
#           Tensorflow            #
#                                 #
###################################

###################################
#         Classification          #
###################################

# Pretrained Xception with ImageNet weights and its 1000-class softmax head.
classification = tf.keras.applications.Xception(
    include_top=True, weights='imagenet',
    input_tensor=None, input_shape=None, pooling=None,
    classes=1000, classifier_activation='softmax',
)

# Load the test picture at 299x299, add a batch axis and apply Xception's own
# input preprocessing.
img_path = '/tmp/lion.jpg'
img = tf.keras.preprocessing.image.load_img(img_path, target_size=(299, 299))
x = np.expand_dims(tf.keras.preprocessing.image.img_to_array(img), axis=0)
x = tf.keras.applications.xception.preprocess_input(x)

preds = classification.predict(x)
# decode_predictions yields, per sample, a list of
# (class, description, probability) tuples; show the top 3 for the one image.
top_3 = tf.keras.applications.xception.decode_predictions(preds, top=3)[0]
print('Predicted:', top_3)

# CIFAR-10 as arrays, already split into train and test parts.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.cifar10.load_data()

# Show one training image as a sanity check.
image = X_train[785]
plt.imshow(image)
plt.show()

# Data preprocessing: rescale pixel values by 1/255.
X_train, X_test = X_train / 255, X_test / 255

# Classification
classification = tf.keras.Sequential(
    [
        # CIFAR-10 images are 32x32 RGB, so the input has 3 channels; the
        # shape must match X_train or fit() rejects the data.
        tf.keras.layers.Conv2D(32, (3,3), padding='same', activation='relu', input_shape=(32, 32, 3)),
        tf.keras.layers.MaxPooling2D((2, 2), strides=2),

        tf.keras.layers.Conv2D(64, (3,3), padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D((2,2), strides=2),

        # Classifier head: flatten, one hidden layer with dropout, then a
        # 10-way softmax (one unit per CIFAR-10 class).
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(100, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax')
    ]
)

classification.summary()

# Render the layer graph (with tensor shapes and layer names) to a PNG file.
tf.keras.utils.plot_model(
    classification,
    to_file='classfication.png',
    show_shapes=True,
    show_layer_names=True,
    rankdir='TB',
    expand_nested=True,
    dpi=96,
)

# Compiling the model. The network's last layer already applies softmax, so
# the loss receives probabilities and must not treat them as logits
# (from_logits=True would apply softmax a second time).
classification.compile(optimizer='SGD',
                       loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                       metrics=['accuracy'])

# Checkpointing is currently disabled; the callback is kept for reference.
checkpoint_filepath = '/tmp/checkpoint'
# classification_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_filepath,
#     save_weights_only=False,
#     monitor='loss',
#     mode='min',
#     save_best_only=True
# )

# Early stopping with a patience of 2 epochs.
callbacks = [tf.keras.callbacks.EarlyStopping(patience=2)]

# saved_model = tf.keras.models.load_model(checkpoint_filepath)
history = classification.fit(
    X_train, y_train,
    epochs=600,
    validation_data=(X_test, y_test),
    callbacks=callbacks,
)

# Training history as a table; plot the loss curves and the accuracy curves.
metrics_df = pd.DataFrame(history.history)
metrics_df[['loss', 'val_loss']].plot()
metrics_df[['accuracy', 'val_accuracy']].plot()

# Save to HDF5 and load it back to confirm the file is readable.
classification.save('classification.h5')
load_saved_model = tf.keras.models.load_model('classification.h5')
load_saved_model.summary()

###################################
#         Classification          #
###################################

###################################
#                                 #
#           Tensorflow            #
#                                 #
###################################

In [None]:
!pip install -q seaborn

# **Решение задачи регрессии при помощи TensorFlow**

In [None]:
###################################
#                                 #
#           Tensorflow            #
#                                 #
###################################

###################################
#           Regression            #
###################################

import seaborn as sns

# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

# Prepare the dataset for the regression: the UCI Auto MPG data file.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
                'Acceleration', 'Model Year', 'Origin']

# The file is space separated, marks missing values with '?', and everything
# after a tab on a line is skipped as a comment.
raw_dataset = pd.read_csv(url, names=column_names,
                          na_values='?', comment='\t',
                          sep=' ', skipinitialspace=True)

dataset = raw_dataset.copy()
dataset.tail()

# Drop the rows that contain missing values.
dataset.isna().sum()
dataset = dataset.dropna()

# 'Origin' is categorical: name the codes, then one-hot encode them.
# dtype=float keeps the indicator columns numeric, so the feature table can be
# converted to a float array for the Normalization layer on any pandas version.
dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
dataset = pd.get_dummies(dataset, columns=['Origin'], prefix='', prefix_sep='',
                         dtype=float)
dataset.tail()

# Split the data: 80% for training, the remaining rows for testing.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

sns.pairplot(train_dataset[['MPG', 'Cylinders', 'Displacement', 'Weight']],
             diag_kind='kde')

train_dataset.describe().transpose()

# Separate the target ('MPG') from the features.
train_features = train_dataset.copy()
test_features = test_dataset.copy()

train_labels = train_features.pop('MPG')
test_labels = test_features.pop('MPG')

# Normalization: fit a Normalization layer on the training features.
train_dataset.describe().transpose()[['mean', 'std']]

normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(np.array(train_features))

print(normalizer.mean.numpy())

first = np.array(train_features[:1])

# np.printoptions is the context manager that applies print settings
# temporarily, only inside the `with` body.
with np.printoptions(precision=2, suppress=True):
  print('First example:', first, '\n')
  print('Normalized:', normalizer(first).numpy())

# Single-feature model: predict MPG from Horsepower only.
horsepower = np.array(train_features['Horsepower'])

# Normalization layer for one scalar input, fitted on the horsepower values.
horsepower_normalizer = layers.Normalization(input_shape=[1,], axis=None)
horsepower_normalizer.adapt(horsepower)

# Normalization followed by a single linear unit.
horsepower_model = tf.keras.Sequential()
horsepower_model.add(horsepower_normalizer)
horsepower_model.add(layers.Dense(units=1))

horsepower_model.summary()

# Run the untrained model on the first 10 values.
horsepower_model.predict(horsepower[:10])

horsepower_model.compile(
    loss='mean_absolute_error',
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
)

# `%%time` is a cell magic and is only valid on the first line of a cell, so
# the fit is timed explicitly with time() (imported in the notebook's import
# cell).
fit_started = time()
history = horsepower_model.fit(
    train_features['Horsepower'],
    train_labels,
    epochs=100,
    # Suppress logging.
    verbose=0,
    # Calculate validation results on 20% of the training data.
    validation_split=0.2
)
print(f'Training took {time() - fit_started:.2f} s')

# Per-epoch metrics as a table, with the epoch number as an extra column.
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
hist.tail()

def plot_loss(history):
  """Plot the training and validation loss curves of a Keras History.

  Draws with pyplot, y-axis limited to [0, 10].
  """
  for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve], label=curve)
  plt.ylim([0, 10])
  plt.xlabel('Epoch')
  plt.ylabel('Error [MPG]')
  plt.legend()
  plt.grid(True)

# Start a fresh figure so the loss curves are not drawn onto the axes left
# current by the earlier plotting in this cell.
plt.figure()
plot_loss(history)

# Test-set results of every model, keyed by model name.
test_results = {}

test_results['horsepower_model'] = horsepower_model.evaluate(
    test_features['Horsepower'],
    test_labels, verbose=0
)

# Predictions over an even grid of horsepower values from 0 to 250.
x = tf.linspace(0.0, 250, 251)
y = horsepower_model.predict(x)

def plot_horsepower(x, y):
  """Scatter the training Horsepower/MPG pairs and draw the line (x, y) over them.

  Reads the notebook-level `train_features` and `train_labels`.
  """
  observed_horsepower = train_features['Horsepower']
  plt.scatter(observed_horsepower, train_labels, label='Data')
  plt.plot(x, y, label='Predictions', color='k')
  plt.xlabel(xlabel='Horsepower')
  plt.ylabel(ylabel='MPG')
  plt.legend()

# New figure, so the prediction line is not drawn over the loss curves
# plotted earlier in this cell.
plt.figure()
plot_horsepower(x, y)

# Linear regression on all features: shared normalizer + one linear unit.
linear_model = tf.keras.Sequential([
    normalizer,
    layers.Dense(units=1)
])

# Calling the model once builds its weights, so the kernel can be inspected.
linear_model.predict(train_features[:10])
linear_model.layers[1].kernel

linear_model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
    loss='mean_absolute_error'
)

# `%%time` is a cell magic and is only valid on the first line of a cell, so
# the fit is timed explicitly with time() (imported in the notebook's import
# cell).
fit_started = time()
history = linear_model.fit(
    train_features,
    train_labels,
    epochs=100,
    # Suppress logging.
    verbose=0,
    # Calculate validation results on 20% of the training data.
    validation_split=0.2
)
print(f'Training took {time() - fit_started:.2f} s')

# Fresh figure so these curves do not overlay the previous plot of the cell.
plt.figure()
plot_loss(history)

test_results['linear_model'] = linear_model.evaluate(
    test_features, test_labels, verbose=0
)

def build_and_compile_model(norm):
  """Return a compiled regression MLP placed on top of the layer `norm`.

  The stack is: norm -> Dense(64, relu) -> Dense(64, relu) -> Dense(1),
  compiled with mean absolute error and Adam(0.001).
  """
  model = keras.Sequential()
  model.add(norm)
  for _ in range(2):
    model.add(layers.Dense(64, activation='relu'))
  model.add(layers.Dense(1))

  model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                loss='mean_absolute_error')
  return model

# Deep model on the single Horsepower feature.
dnn_horsepower_model = build_and_compile_model(horsepower_normalizer)
dnn_horsepower_model.summary()

# `%%time` is a cell magic and is only valid on the first line of a cell, so
# the fit is timed explicitly with time() (imported in the notebook's import
# cell).
fit_started = time()
history = dnn_horsepower_model.fit(
    train_features['Horsepower'],
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100
)
print(f'Training took {time() - fit_started:.2f} s')

# Each plot gets its own figure; otherwise pyplot keeps drawing on the axes
# used by the previous plot of the cell.
plt.figure()
plot_loss(history)

x = tf.linspace(0.0, 250, 251)
y = dnn_horsepower_model.predict(x)

plt.figure()
plot_horsepower(x, y)

test_results['dnn_horsepower_model'] = dnn_horsepower_model.evaluate(
    test_features['Horsepower'], test_labels,
    verbose=0
)

# Deep model on all features.
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

# `%%time` is a cell magic and is only valid on the first line of a cell, so
# the fit is timed explicitly with time() (imported in the notebook's import
# cell).
fit_started = time()
history = dnn_model.fit(
    train_features,
    train_labels,
    # Hold out 20% of the training data for validation; plot_loss needs the
    # resulting 'val_loss' history.
    validation_split=0.2,
    verbose=0, epochs=100
)
print(f'Training took {time() - fit_started:.2f} s')

# Fresh figure so these curves do not overlay the previous plot of the cell.
plt.figure()
plot_loss(history)

test_results['dnn_model'] = dnn_model.evaluate(test_features, test_labels, verbose=0)

pd.DataFrame(test_results, index=['Mean absolute error [MPG]']).T

test_predictions = dnn_model.predict(test_features).flatten()

# Predicted vs. true MPG on its own square figure, with the y = x reference
# line.
plt.figure()
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True Values [MPG]')
plt.ylabel('Predictions [MPG]')
lims = [0, 50]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

# Error distribution in a separate figure; without it the histogram would be
# drawn into the scatter plot's axes above.
plt.figure()
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel('Prediction Error [MPG]')
_ = plt.ylabel('Count')

# Save with an explicit `.keras` extension: Keras 3 rejects a save path
# without a recognized extension.
dnn_model.save('dnn_model.keras')

reloaded = tf.keras.models.load_model('dnn_model.keras')

# The reloaded model should reproduce the score of `dnn_model`.
test_results['reloaded'] = reloaded.evaluate(
    test_features, test_labels, verbose=0
)

pd.DataFrame(test_results, index=['Mean absolute error [MPG]']).T

###################################
#           Regression            #
###################################

###################################
#                                 #
#           Tensorflow            #
#                                 #
###################################

In [None]:
!pip install keras



# **Решение задачи классификации при помощи scikit-learn**

In [None]:
###################################
#                                 #
#          scikit-learn           #
#                                 #
###################################

###################################
#         Classification          #
###################################

import keras
import numpy as np
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import RegexpTokenizer
from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
import keras.layers as l
from keras import backend

if 'tensorflow' == backend.backend():
  import tensorflow as tf

  # Restrict TensorFlow to the first GPU and let it allocate GPU memory on
  # demand. This is the TensorFlow 2 form of the old session setup
  # (tf.ConfigProto / tf.Session / set_session), which is not part of the
  # TF2 API used by the rest of this notebook.
  gpus = tf.config.list_physical_devices('GPU')
  if gpus:
    try:
      tf.config.set_visible_devices(gpus[0], 'GPU')
      tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as err:
      # Device settings cannot be changed once the GPUs have been
      # initialized (e.g. an earlier cell already ran a TensorFlow model).
      print(err)

#Loading Data
def get_token():
  """Fetch the 20 Newsgroups training split and tokenize every document.

  Headers, footers and quotes are stripped from the posts. The number of
  documents is printed.

  Returns:
    A dict with 'data' (a list of token lists, one per document) and
    'label' (the dataset's target array).
  """
  categories = None
  remove = ('headers', 'footers', 'quotes')

  data_train = fetch_20newsgroups(subset='train', categories=categories,
                                  shuffle=True, random_state=42,
                                  remove=remove)
  label_train = data_train.target
  # Raw string: "\w" in a plain string literal is an invalid escape sequence.
  t = RegexpTokenizer(r"[\w]+")
  content = [t.tokenize(doc) for doc in data_train.data]
  print(len(content))
  return {'data': content, 'label': label_train}

def create_tokenizer(corpus):
  """Return a Keras Tokenizer fitted on `corpus`."""
  fitted = Tokenizer()
  fitted.fit_on_texts(corpus)
  return fitted

def encode_docs(tokenizer, max_length, docs):
  """Encode `docs` as integer sequences of length `max_length`, padded at the end."""
  return pad_sequences(tokenizer.texts_to_sequences(docs),
                       maxlen=max_length, padding='post')

def cnn_model(X_train, Y_train, tokenizer, max_length):
  """Build and compile the text classifier (Embedding -> GRU -> pool -> Dense).

  Args:
    X_train: encoded training documents (not read here; kept so existing
      calls keep working).
    Y_train: one-hot label matrix; its last dimension is the class count.
    tokenizer: fitted Keras Tokenizer; its vocabulary sizes the embedding.
    max_length: length of every padded input sequence.

  Returns:
    The compiled Sequential model (its summary is printed).
  """
  # Size the embedding from the tokenizer that was passed in rather than from
  # a notebook-level global; +1 leaves room for the padding index 0.
  vocab_size = len(tokenizer.word_index) + 1
  num_classes = Y_train.shape[-1]

  model = Sequential()
  model.add(l.InputLayer(input_shape=(max_length,), dtype='int32'))
  model.add(l.Embedding(vocab_size, 100, input_length=max_length))
  # `units` is the GRU width argument; `output_dim` is the old Keras 1 name.
  model.add(l.GRU(units=num_classes, return_sequences=True))
  model.add(l.MaxPooling1D(pool_size=2))
  model.add(l.Flatten())
  model.add(l.Dense(num_classes, activation='softmax'))
  # One-hot labels with a softmax output are a single-label multi-class
  # problem, so the matching loss is categorical (not binary) cross-entropy.
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  return model

def create_label(n):
  """Return a 20-element one-hot list with a 1 at index `n`.

  If `n` is not in 0..19 the list is all zeros.
  """
  return [int(i == n) for i in range(20)]

# Tokenizing data: fetch and tokenize the corpus once and reuse the result
# (calling get_token() separately for 'data' and 'label' repeats all the work).
tokens = get_token()
train_data = tokens['data']
label = tokens['label']
train_label = np.array([create_label(n) for n in label])

x_train, x_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.3)
tokenizer = create_tokenizer(train_data)
vocab_size = len(tokenizer.word_index) + 1
# Pad every document to the length of the longest one.
max_length = max(len(doc) for doc in train_data if doc is not None)

x_train = encode_docs(tokenizer, max_length, x_train)
x_test = encode_docs(tokenizer, max_length, x_test)
print(x_train.shape)
print(y_train.shape)
# Creating model
model = cnn_model(x_train, y_train, tokenizer, max_length)
model.fit(x_train, y_train, batch_size=100, epochs=50, verbose=1, callbacks=None,
          validation_data=(x_test, y_test))

# Tokenizing data: fetch and tokenize the corpus once and reuse the result
# (calling get_token() separately for 'data' and 'label' repeats all the work).
tokens = get_token()
train_data = tokens['data']
label = tokens['label']
train_label = np.array([create_label(n) for n in label])

x_train, x_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.3)
tokenizer = create_tokenizer(train_data)
vocab_size = len(tokenizer.word_index) + 1
# Pad every document to the length of the longest one.
max_length = max(len(doc) for doc in train_data if doc is not None)

x_train = encode_docs(tokenizer, max_length, x_train)
x_test = encode_docs(tokenizer, max_length, x_test)
print(x_train.shape)
print(y_train.shape)
# Creating model (the variable is `max_length`; `maxlength` is undefined)
model = cnn_model(x_train, y_train, tokenizer, max_length)
model.fit(x_train, y_train, batch_size=100, epochs=50, verbose=1, callbacks=None,
          validation_data=(x_test, y_test))

###################################
#         Classification          #
###################################

###################################
#                                 #
#          scikit-learn           #
#                                 #
###################################

# **Решение задачи регрессии при помощи scikit-learn**

In [None]:
###################################
#                                 #
#          scikit-learn           #
#                                 #
###################################

###################################
#           Regression            #
###################################

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# Synthetic data: 1000 samples with 10 uniform random features; the target is
# a random linear combination of the features plus uniform noise.
X = np.random.rand(1000, 10)
Y = np.matmul(X, np.random.rand(10, 1)) + np.random.rand(1000, 1)

# Hold out 20% of the samples for testing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

# Multi-layer perceptron with one hidden layer of 100 units.
mlp = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)

# Fit on the training part; the target column is flattened to 1-D first.
mlp.fit(X_train, Y_train.ravel())

# Score the held-out predictions with the mean squared error.
Y_pred = mlp.predict(X_test)
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)
print('Mean Squared Error: {}'.format(mse))
print(mlp)

# Prepare the dataset for the regression

###################################
#           Regression            #
###################################

###################################
#                                 #
#          scikit-learn           #
#                                 #
###################################

Mean Squared Error: 0.0905711752751663
MLPRegressor(max_iter=1000, random_state=42)


# **Заключение, выводы**

In [None]:
!pip install pipreqs

Collecting pipreqs
  Downloading pipreqs-0.5.0-py3-none-any.whl.metadata (7.9 kB)
Collecting docopt==0.6.2 (from pipreqs)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import torch as th
import torch.nn as nn

class Net(nn.Module):
  """Small CNN: two conv + max-pool stages followed by three linear layers.

  The fully connected input size (16 * 5 * 5) corresponds to 3-channel
  32x32 inputs.
  """

  def __init__(self):
    super(Net, self).__init__()
    self.conv1 = nn.Conv2d(3, 6, 5)
    self.pool = nn.MaxPool2d(2, 2)
    self.conv2 = nn.Conv2d(6, 16, 5)
    self.fc1 = nn.Linear(16 * 5 * 5, 120)
    self.fc2 = nn.Linear(120, 84)
    self.fc3 = nn.Linear(84, 10)

  def forward(self, x):
    """Return the 10 raw output scores for each input image."""
    # th.relu is used so the class depends only on names this cell imports
    # (`th`, `nn`); it computes the same function as functional relu.
    x = self.pool(th.relu(self.conv1(x)))
    x = self.pool(th.relu(self.conv2(x)))
    # Flatten the 16 feature maps of 5x5 into one vector per sample.
    x = x.view(-1, 16 * 5 * 5)
    x = th.relu(self.fc1(x))
    x = th.relu(self.fc2(x))
    x = self.fc3(x)
    return x

class RegressionModel(nn.Module):
  """One-hidden-layer regressor: Linear(input_size, 64) -> ReLU -> Linear(64, 1)."""

  def __init__(self, input_size):
    super().__init__()
    self.hidden = nn.Linear(in_features=input_size, out_features=64)
    self.output = nn.Linear(in_features=64, out_features=1)

  def forward(self, x):
    """Return one predicted value per input row."""
    return self.output(th.relu(self.hidden(x)))

import torch.optim as optim

net = Net()
optimizer = optim.SGD(net.parameters(), lr=1e-4, weight_decay=1e-2, momentum=0.9)

for step in range(10000): # Number of optimization steps
  # Net is a CNN for 3-channel 32x32 images, so feed it a random batch of
  # that shape; conv2d rejects a 1-element vector.
  batch = th.randn(1, 3, 32, 32)
  output = net(batch)
  # backward() needs a scalar, so reduce the absolute outputs to their mean.
  loss = th.abs(output).mean()
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  # Progress report every 1000 steps.
  if step % 1000 == 0:
    print(f'step {step}: loss {loss.item():.4f}')

net.eval()

tensor([-1.0101])


RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1]