In [1]:
import pandas as pd 
import logging
import numpy as np
import sys
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split



In [2]:
#读入数据集
print ('loading the dataset...')
df = pd.read_csv('hw1-data.csv' , delimiter = ',')
X = df.values[:,:-1]
y = df.values[: ,-1]

loading the dataset...


In [3]:
#划分训练集和测试集
print ('Split into Train and Test...')
# train_test_split 参数介绍 
## train_test_split(train_data , test_data , test_size , random_state) 
## 其中test_size可选择正整数或者是[0,1]中的浮点数,表示测试集大小
## random_state为随机数种子编号，方便调试
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 100 , random_state = 10)

Split into Train and Test...


In [4]:
def feature_normalization(train , test) : 
    '''
    将每一维特征的数据利用线性变换[0,1]对训练数据和测试数据归一化,其中统计量来自于训练数据.
    Args:
        train - training set , 二维numpy数组(num_instances , num_features)
        test - test set , 二维numpy数组(num_instances , num_features)
    
    Returns:
        train_normalized - training set after normalization 
        test_normalized - test set after normalization
    '''
    mean = np.mean(train , axis = 0)
    std = np.std(train , axis = 0)
    try :
        Max = np.max(train , axis = 0)
        Min = np.min(train , axis = 0)
        #这个地方的减法运用了广播机制,考虑后缘维度(从末尾开始算起的维度)的轴长度相同,或者其中的一方长度为1.
        train_normalized = (train - Min) / (Max - Min)
        test_normalized = (test - Min) / (Max - Min)
    except ValueError :
        print ('There maybe some features are exactly same !')
    return train_normalized , test_normalized

In [5]:
#归一化
print ('Scaling all to [0,1]')
X_train , X_test = feature_normalization(X_train , X_test)
#添加偏置项 Add bias term
## np.hstack函数用来横向堆叠   np.hstack([arr1 , arr2])   要求arr1和arr2都是相同维数的矩阵,比如二维矩阵.然后其中行数要相同
## np.vstack函数用来纵向堆叠
X_train = np.hstack((X_train , np.ones([X_train.shape[0] , 1])))
X_test = np.hstack((X_test , np.ones([X_test.shape[0] , 1])))

Scaling all to [0,1]


In [6]:
def compute_square_loss(X , y , theta) :
    '''
    给定X,y,theta,计算用X*theta作为y预测的平方损失
    Args:
        X - 输入特征数据 , 二维numpy数组(num_instances , num_features)
        y - 标签label数据 , 一维numpy数组(num_instances)
        theta - 模型参数数据 , 一维numpy数组(num_features)
    Returns:
        loss - 平方损失 , 标量
    '''
    #这里用到了np.matmul的一些性质 y_ = X * theta.T
    #np.matmul(X,theta) = np.matmul(X,theta.T)
    y_ = np.matmul(X , theta)
    loss = np.mean((y_ - y) * (y_ - y))
    return loss

In [7]:
def compute_square_loss_gradient(X , y , theta) :
    ''' 
    计算平方损失函数关于参数theta的梯度
    Args:
        X - 输入特征数据 , 二维numpy数组(num_instances , num_features)
        y - 标签label数据 , 一维numpy数组(num_instances)
        theta - 模型参数数据 , 一维numpy数组(num_features)
    Returns:
        grad - 梯度向量 , 一维numpy数组(num_features)
    '''
    grad = np.matmul(X.T  , np.matmul(X , theta) - y)
    return grad

In [8]:
theta = np.ones(X_train.shape[1])
compute_square_loss(X_train , y_train , theta)
compute_square_loss_gradient(X_train , y_train , theta)

array([2607.01320543, 2558.32650202, 2505.05425816, 2474.08147112,
       2424.54686361, 2379.89122572, 2290.86331386, 2290.86331386,
       2190.02613459, 2005.69048982, 1836.4166322 , 1730.17451019,
       1450.02761001, 1245.80012271,  821.04028085,  734.55607357,
        524.34057467,   90.77052471, 2314.7420968 , 2314.7420968 ,
       2314.7420968 , 2148.83249081, 2148.83249081, 2148.83249081,
       1989.72414034, 1989.72414034, 1989.72414034, 1915.89546722,
       1915.89546722, 1915.89546722, 1874.89861947, 1874.89861947,
       1874.89861947, 1185.22961119, 1185.22961119, 1185.22961119,
       1439.88546708, 1439.88546708, 1439.88546708, 1583.93020895,
       1583.93020895, 1583.93020895, 1649.35361683, 1649.35361683,
       1649.35361683, 1685.25585253, 1685.25585253, 1685.25585253,
       2623.76814349])

In [9]:
X = np.array([[1,2,3],[4,5,6]])
w = np.array([1,2,3])
print (np.matmul(X , w))
print (np.matmul(X , w.T))

[14 32]
[14 32]
