# 4.4 如何快速得近似任何函数
This chapter uses the neuralpy module, whose last supported Python version is 2.7 according to its [PyPI page](https://pypi.org/project/neuralpy/).

I ran into too many errors that blocked me from using this module, even under Python 2.7, so I gave up on this section.

I mostly use Python 3.5 as the default version for coding these examples.

In [None]:
import numpy as np
import pandas as pd
import random

In [None]:
# Draw a reproducible sample of 50 distinct integers from [-10000, 10000)
# and scale it into [-1, 1); these are the inputs for approximating y = x^2.
random.seed(2016)
sample_size = 50
sample = pd.Series(random.sample(range(-10000, 10000), sample_size))
x = sample / 10000
y = x ** 2
print(x.describe())

# Confirm that `sample` is a pandas Series.
print(type(sample))

# Pair every input with its target as ([x_i], [y_i]).
# Series.ix has been removed from pandas, so the values are read from the
# underlying arrays instead. A single comprehension also avoids rebuilding
# the whole list by concatenation on every iteration.
dataset = [([x_i], [y_i]) for x_i, y_i in zip(x.values, y.values)]

print(dataset)

# 5.x 如何构建可定制的深度预测模型
This chapter uses the scikit-neuralnetwork module, which is not included in Anaconda by default, so you have to install it yourself using the command below:

```
pip install scikit-neuralnetwork
```

In [None]:
# Load the Boston housing data set through scikit-learn and keep its
# `data` and `target` attributes as the inputs (x) and the targets (y).
# NOTE(review): load_boston appears to be deprecated/removed in newer
# scikit-learn releases -- confirm against the installed version.
from sklearn import datasets

boston = datasets.load_boston()
x = boston.data
y = boston.target

# print(boston)

In [None]:
from sklearn import preprocessing

# One scaler per array, so each keeps its own fitted parameters.
x_MinMax = preprocessing.MinMaxScaler()
y_MinMax = preprocessing.MinMaxScaler()

# print(x)

# reshape() accepts -1 for at most one dimension; that dimension is then
# worked out from the array size. For example, to get 5 rows without knowing
# the number of columns, pass 5 as the first dimension and -1 as the second.

# Turn the targets into a single-column 2-D array. The two calls below are
# equivalent spellings of that reshape.
y = np.array(y).reshape(len(y), 1)
y1 = np.array(y).reshape((-1, 1))
# print(y1)

# Fit each scaler on its own array and keep the scaled values.
x = x_MinMax.fit_transform(x)
y = y_MinMax.fit_transform(y)

# Per-column mean of the scaled inputs (the result is not stored or printed).
x.mean(axis=0)
# print(x)

print(y_MinMax.scale_)

In [None]:
# scikit-learn dropped the cross_validation package; train_test_split now
# lives in model_selection.
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

# Seed numpy's global RNG before splitting (no random_state is passed to
# train_test_split), then hold out 20% of the rows as the test set.
np.random.seed(2016)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

# print(len(x_train))
print(len(y_test))

print(y_test)

In [None]:
# On newer Python / scikit-learn installations this import commonly fails with
#   ModuleNotFoundError: No module named 'sklearn.cross_validation'
# because sknn/mlp.py still imports the old module. Workaround: open
# ~/anaconda3/Lib/site-packages/sknn/mlp.py and change the import from
# sklearn.cross_validation to sklearn.model_selection.
from sknn.mlp import Layer, Regressor

# Two sigmoid hidden layers (6 and 14 units) followed by a linear output layer.
fit1 = Regressor(
    layers=[
        Layer("Sigmoid", units=6),   # 1st hidden layer
        Layer("Sigmoid", units=14),  # 2nd hidden layer
        Layer("Linear"),             # output layer
    ],
    n_iter=10,
    learning_rate=0.02,
    random_state=2016,
)

print("fitting model right now")
# Fitting may fail with
#   ImportError: cannot import name 'downsample'
# raised from '... from theano.tensor.signal import downsample'.
# theano renamed the 'downsample' module to 'pool', so patch
# ~/anaconda3/envs/practice_env_0/Lib/site-packages/lasagne/layers/pool.py:
# - from theano.tensor.signal import downsample
# + from theano.tensor.signal import pool
fit1.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# print(fit1.get_parameters())

# Predict on the training inputs and report the mean squared error against
# the training targets.
pred1_train = fit1.predict(x_train)
mse_1 = mean_squared_error(pred1_train, y_train)
print("Train ERROR = ", mse_1)

# 6.x 提高性能的一些技巧

In [None]:
# Same architecture as before, but with rectifier (ReLU) activations in the
# two hidden layers.
fit2 = Regressor(
    layers=[
        Layer("Rectifier", units=6),
        Layer("Rectifier", units=14),
        Layer("Linear"),
    ],
    n_iter=10,
    learning_rate=0.02,
    random_state=2016,
)

print("fitting model right now")
fit2.fit(x_train, y_train)

# Training-set mean squared error of the ReLU model.
pred2_train = fit2.predict(x_train)
mse_2 = mean_squared_error(pred2_train, y_train)
print("Train ERROR = ", mse_2)

In [None]:
# Same ReLU network, trained with n_iter raised from 10 to 100.
fit3 = Regressor(
    layers=[
        Layer("Rectifier", units=6),
        Layer("Rectifier", units=14),
        Layer("Linear"),
    ],
    n_iter=100,
    learning_rate=0.02,
    random_state=2016,
)

print("fitting model right now")
fit3.fit(x_train, y_train)

# Training-set mean squared error after the longer training run.
pred3_train = fit3.predict(x_train)
mse_3 = mean_squared_error(pred3_train, y_train)
print("Train ERROR = ", mse_3)

In [11]:
# ReLU network with L2 regularisation (weight_decay=0.001), n_iter=100.
fit4 = Regressor(
    layers=[
        Layer("Rectifier", units=6),
        Layer("Rectifier", units=14),
        Layer("Linear"),
    ],
    n_iter=100,
    learning_rate=0.02,
    random_state=2016,
    weight_decay=0.001,
    regularize="L2",
)

print("fitting model right now")
fit4.fit(x_train, y_train)

# Training-set mean squared error of the regularised model.
pred4_train = fit4.predict(x_train)
mse_4 = mean_squared_error(pred4_train, y_train)
print("Train ERROR = ", mse_4)

fitting model right now
Train ERROR =  0.00738282398554


In [None]:
# Import pearsonr from the public scipy.stats namespace; scipy.stats.stats
# is a deprecated private path.
from scipy.stats import pearsonr

# Mean squared error of the regularised model on the held-out test set.
# Arguments follow the (y_true, y_pred) convention; MSE is symmetric, so the
# value is the same either way.
pred4_test = fit4.predict(x_test)
mse4_test = mean_squared_error(y_test, pred4_test)
print(mse4_test)

# pearsonr works on 1-D samples, while the targets are stored as a single
# column. Flatten both arrays so the coefficient comes back as a plain scalar
# rather than a one-element array.
correl = pearsonr(np.ravel(pred4_test), np.ravel(y_test))

print("Test correlation is ", correl[0])
print("Test R^2 is ", correl[0] * correl[0])

In [None]:
# Regularised ReLU network whose two hidden layers are created with
# frozen=True (presumably excluding them from weight updates -- confirm
# against the sknn documentation). The model is only constructed here.
fitFr = Regressor(
    layers=[
        Layer("Rectifier", frozen=True, units=6),
        Layer("Rectifier", frozen=True, units=14),
        Layer("Linear"),
    ],
    n_iter=100,
    learning_rate=0.02,
    random_state=2016,
    weight_decay=0.001,
    regularize="L2",
)

In [14]:
import pickle

# Persist the regularised model. The context manager guarantees the file is
# flushed and closed before it is read back below.
with open('./model/Boston_fit4.pkl', 'wb') as model_file:
    pickle.dump(fit4, model_file)
# pickle.dump(fit1, open('Boston_fit1.pkl', 'wb'))
# pickle.dump(fit2, open('Boston_fit2.pkl', 'wb'))
# pickle.dump(fit3, open('Boston_fit3.pkl', 'wb'))

# Reload the model from disk.
# NOTE: unpickling executes arbitrary code, so only load pickle files that
# come from a trusted source (here: the file written just above).
with open('./model/Boston_fit4.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# 7.x 二元分类神经网络

In [None]:
# A bare "import urllib" does not load the urllib.request submodule, so it
# has to be imported explicitly (this still binds the name `urllib`).
import urllib.request

url = "http://goo.gl/j0Rvxq"
# If Google is not reachable, download the data set from datacastle instead:
# https://www.datacastle.cn/dataset_description.html?type=dataset&id=794

# Python 2 spelling: raw_data = urllib.urlopen(url)
# On Python 3 urlopen lives in urllib.request. The response is used as a
# context manager so the connection is closed once the data has been parsed.
with urllib.request.urlopen(url) as raw_data:
    dataset = np.loadtxt(raw_data, delimiter=",")

In [None]:
import csv
# how to read data from csv file
# def reader(master_table):
# 	file = 'data/' + master_table + '.csv'
# 	df = pd.read_csv(file,encoding='GB18030')
# 	labels = list(df.columns.values)
# 	return labels, df

# call reader function	
# cols, raw_data = reader('diabetes')
# print(cols)
# for ix, r in raw_data.iterrows():
# 	if(ix < 10):
# 	    print("row", ix, r['BMI'])

# np.loadtxt also works for CSV files; skiprows=1 skips the first line
# (presumably the header row).
dataset = np.loadtxt('data/diabetes.csv', delimiter=",", skiprows=1)

print(dataset.shape)

# Wrap the array in a DataFrame and treat every 0 as a missing value (NaN).
data = pd.DataFrame(dataset)
# print(data.head(5))
data = data.replace(0, np.nan)
# print(data.head(5))

# Columns 0 and 8 can legitimately hold zeros (a count and the binary
# variable), so undo the replacement there: column 0 goes back to 0 and
# column 8 is filled with -1.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection, because that chained in-place form is not guaranteed to
# update `data` on newer pandas versions.
data[0] = data[0].fillna(0)
# print(data.head(5))
data[8] = data[8].fillna(-1)
# print(data.head(5))

# Number of non-missing values per column, to see what is still missing.
print(data.count())