In [20]:
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from utils import DNN   # 自定义的神经网络模块

# Clear the default TensorFlow graph before building anything; presumably this
# is so re-running the notebook in the same kernel starts from an empty graph.
tf.reset_default_graph()

# Load the train/test splits (paths are relative to the working directory).
train_data = pd.read_csv('./dataset/train.csv')
test_data = pd.read_csv('./dataset/test.csv')

# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [21]:
# Row counts of both splits; the cell then displays the training frame's
# (rows, columns) shape.
num_train, num_test = len(train_data), len(test_data)
train_data.shape

(1314, 81)

In [22]:
# (rows, columns) of the test frame.
test_data.shape

(146, 81)

In [23]:
# Preprocessing step 1: keep only the feature columns, i.e. everything from
# 'MSSubClass' through 'SaleCondition' (label-based .loc slices include both
# endpoints).
feature_columns = slice('MSSubClass', 'SaleCondition')
all_train_features = train_data.loc[:, feature_columns]
all_test_features = test_data.loc[:, feature_columns]
# Stack the training rows on top of the test rows so that both splits go
# through the same preprocessing below.
all_features = pd.concat([all_train_features, all_test_features])
all_train_features.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [24]:
# (rows, columns) of the test-split feature frame.
all_test_features.shape

(146, 79)

In [25]:
# (rows, columns) of the concatenated train + test feature frame.
all_features.shape

(1460, 79)

In [26]:
# Handle the numeric features first: collect the names of all columns whose
# dtype is not "object".
# NOTE: the spelling `numeric_festures` is kept because a later cell reads
# this exact name.
column_dtypes = all_features.dtypes
is_not_object = column_dtypes != "object"
numeric_festures = column_dtypes[is_not_object].index
numeric_festures


Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [27]:
# Standardize every numeric column: subtract the column mean and divide by the
# column standard deviation. The statistics are computed over `all_features`,
# i.e. over the training and test rows together.
def _zscore(column):
    """Return `column` centered on its mean and scaled by its standard deviation."""
    return (column - column.mean()) / column.std()


all_features[numeric_festures] = all_features[numeric_festures].apply(_zscore)
all_features.shape

(1460, 79)

In [28]:
# Handle the non-numeric columns: one-hot encode them with pandas.
# dummy_na=True adds an extra indicator column for missing values.
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,0.07335,-0.207948,-0.207071,0.651256,-0.517023,1.050634,0.878367,0.50984,0.575228,-0.288554,...,0,1,0,0,0,0,0,1,0,0
1,-0.872264,0.409724,-0.091855,-0.071812,2.178881,0.15668,-0.42943,-0.572637,1.171591,-0.288554,...,0,1,0,0,0,0,0,1,0,0
2,0.07335,-0.084413,0.073455,0.651256,-0.517023,0.984415,0.82993,0.322063,0.092875,-0.288554,...,0,1,0,0,0,0,0,1,0,0
3,0.309753,-0.413838,-0.096864,0.651256,-0.517023,-1.862993,-0.720051,-0.572637,-0.499103,-0.288554,...,0,1,0,1,0,0,0,0,0,0
4,0.07335,0.574436,0.37502,1.374324,-0.517023,0.951306,0.733056,1.360357,0.46341,-0.288554,...,0,1,0,0,0,0,0,1,0,0


In [29]:
# Fill the remaining missing values (NA) with the mean of their own column.
column_means = all_features.mean()
all_features = all_features.fillna(column_means)
all_features.shape


(1460, 331)

In [30]:
# Assemble the NumPy arrays that are fed to the models.
num_train = train_data.shape[0]
num_test = test_data.shape[0]

# `all_features` holds the training rows first and the test rows after them,
# so a positional split at `num_train` recovers the two splits.
# `.values` is used instead of `DataFrame.as_matrix()`, which is deprecated
# and no longer available in current pandas releases.
train_features = all_features[:num_train].values.astype(np.float32)
test_features = all_features[num_train:].values.astype(np.float32)

# Test labels are kept in raw SalePrice units (not standardized).
test_labels = test_data['SalePrice'].values.astype(np.float32)

print(train_features.shape)

# Standardize the training labels with their own mean / standard deviation.
# The statistics are kept so that model outputs can be mapped back to prices
# via `prediction * label_std + label_mean`.
label_mean = train_data['SalePrice'].mean()
label_std = train_data['SalePrice'].std()
train_labels = ((train_data['SalePrice'] - label_mean) / label_std).values.astype(np.float32)

# Add a trailing axis so the labels form a (num_train, 1) column.
train_labels = train_labels[:, np.newaxis]
train_labels

(1314, 331)


array([[  3.36725563e-01],
       [ -6.58526988e-05],
       [  5.23831904e-01],
       ..., 
       [  2.68119901e-01],
       [  1.50302172e+00],
       [  1.89180374e+00]], dtype=float32)

In [31]:
# Number of input features per sample (columns of the training matrix).
train_features.shape[1]

331

In [32]:
# Create the graph tensors: the whole training set is embedded in the graph
# as float32 constants.
x = tf.constant(train_features, dtype=tf.float32, name='train_data')
y = tf.constant(train_labels, dtype=tf.float32, name='train_labels')
y

<tf.Tensor 'train_labels:0' shape=(1314, 1) dtype=float32>

In [33]:
# Single-layer baseline trained with full-batch gradient descent on the MSE.
w = tf.get_variable(
    initializer=tf.random_normal_initializer(),
    shape=(train_features.shape[1], 1),
    dtype=tf.float32,
    name='weights',
)
# The variable name 'bais' (sic) is left untouched so that graph and
# checkpoint variable names do not change.
b = tf.get_variable(
    initializer=tf.zeros_initializer(),
    shape=(1,),
    dtype=tf.float32,
    name='bais',
)


def logistic_model(x):
    """Return sigmoid(x @ w + b) for a batch of feature rows `x`.

    NOTE(review): the sigmoid bounds the output to (0, 1), while the targets
    in `y` are standardized prices that can be negative or larger than 1, so
    this model cannot fit them exactly. Consider a linear output (together
    with a re-tuned learning rate) for this regression task.
    """
    logit = tf.matmul(x, w) + b
    return tf.sigmoid(logit)


# Build the complete graph before opening the session.
y_ = logistic_model(x)
loss = tf.reduce_mean(tf.square(y_ - y))
lr = 1e-1
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
train_op = optimizer.minimize(loss)

# One session, initialized once: the predictions printed here come from the
# same initial weights that training starts from, and no extra session is
# left open.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
print(y_.eval(session=sess))

for e in range(1000):
    sess.run(train_op)
    # Report the training loss every 100 steps.
    if (e + 1) % 100 == 0:
        loss_numpy = loss.eval(session=sess)
        print('Epoch %d: Loss: %.12f' % (e + 1, loss_numpy))

[[  5.99354148e-01]
 [  1.39124768e-05]
 [  8.24963629e-01]
 ..., 
 [  5.71456134e-01]
 [  3.25574987e-02]
 [  8.18107277e-02]]
Epoch 100: Loss: 0.828283846378
Epoch 200: Loss: 0.752681553364
Epoch 300: Loss: 0.703959107399
Epoch 400: Loss: 0.672370791435
Epoch 500: Loss: 0.650522708893
Epoch 600: Loss: 0.636261761189
Epoch 700: Loss: 0.627539932728
Epoch 800: Loss: 0.622038722038
Epoch 900: Loss: 0.618303477764
Epoch 1000: Loss: 0.615407705307


In [34]:
# DNN is defined in utils.py.

dnn = DNN(x, [20, 20])   # four layers according to the original author's note — TODO confirm against utils.DNN

# Mean squared error between the network output and the standardized labels.
loss_dnn = tf.reduce_mean(tf.square(dnn - y))
print(loss_dnn)
# TODO: also report R^2 = 1 - sum((dnn - y)^2) / sum((y - mean(y))^2).
lr = 0.1
# Plain gradient descent; `train_op` is rebound here to the DNN loss.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
train_op = optimizer.minimize(loss_dnn)





Tensor("Mean_1:0", shape=(), dtype=float32)


In [35]:
# Train the DNN with full-batch gradient descent and checkpoint it regularly.
CHECKPOINT_DIR = 'First_Save'
# Make sure the checkpoint directory exists before the first save
# (MakeDirs is a no-op when the directory is already there).
tf.gfile.MakeDirs(CHECKPOINT_DIR)

saver = tf.train.Saver()

# Close the session left open by an earlier cell (if any) before opening a
# new one, so its resources are released instead of leaked.
previous_sess = globals().get('sess')
if previous_sess is not None:
    previous_sess.close()

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
for e in range(10000):
    sess.run(train_op)
    # Every 1000 steps: report the training loss and write a checkpoint whose
    # file name carries the step number.
    if (e + 1) % 1000 == 0:
        loss_numpy = loss_dnn.eval(session=sess)
        print('Epoch {}: Loss: {}'.format(e + 1, loss_numpy))
        saver.save(sess=sess, save_path=CHECKPOINT_DIR + '/model.ckpt', global_step=(e + 1))

Epoch 1000: Loss: 0.6062926054000854
Epoch 2000: Loss: 0.5971103310585022
Epoch 3000: Loss: 0.5902977585792542
Epoch 4000: Loss: 0.5846214294433594
Epoch 5000: Loss: 0.5814045071601868
Epoch 6000: Loss: 0.5796703100204468
Epoch 7000: Loss: 0.5786252021789551
Epoch 8000: Loss: 0.5779203772544861
Epoch 9000: Loss: 0.5772293210029602
Epoch 10000: Loss: 0.5764906406402588


In [36]:
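# TODO (follow-up): try other optimization algorithms — Adadelta, Adagrad, Adam, momentum, RMSprop, SGD
# TODO: add mini-batching (next_batch) and tune its parameters
# TODO: set up placeholders for the inputs

# NOTE: the network depth and the number of training iterations are not sufficient yet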

In [37]:
# TODO: visualize the data / training with tf.summary

In [38]:
# TODO: the features have not received any targeted processing yet — add feature selection