# Week 1

In [None]:
# Source: https://en.wikipedia.org/wiki/Gradient_descent
# code source: https://en.wikipedia.org/w/index.php?title=Gradient_descent&oldid=966271567

# Starting point of the search (x = 6).
next_x = 6
# Step size multiplier (learning rate).
gamma = 0.01
# Early-stopping threshold: stop once x changes by no more than this in one step.
precision = 0.00001
# Upper bound on the number of iterations.
max_iters = 10000


def df(x):
    """Return 4*x**3 - 9*x**2, i.e. the derivative of x**4 - 3*x**3 (plus any constant)."""
    return 4 * x ** 3 - 9 * x ** 2


# Iterate gradient descent until the step is small enough or max_iters is reached.
for i in range(max_iters):
    current_x = next_x
    slope = df(current_x)

    # Move against the gradient, scaled by the learning rate.
    next_x = current_x - gamma * slope
    print(i, next_x, slope)

    # Early stop: how far x moved in this iteration.
    step = next_x - current_x
    if abs(step) <= precision:
        break

print("Minimum at ", next_x)

# The output for the above will be something like 
# "Minimum at 2.2499646074278457"

In [None]:
# Source: https://github.com/mattnedrich/GradientDescentExample
# y = mx + b
# m is slope, b is y-intercept
# 计算均方误差 MSE
def compute_error_for_line_given_points(b, m, points):
    """Return the mean squared error of the line y = m*x + b over `points`.

    `points` is indexed as points[i, 0] (x) and points[i, 1] (y), so it must
    support 2-D indexing (e.g. a NumPy array with one sample per row).
    """
    n_points = len(points)
    # Squared difference between the observed y and the line's prediction m*x + b.
    squared_residuals = (
        (points[row, 1] - (m * points[row, 0] + b)) ** 2
        for row in range(n_points)
    )
    return sum(squared_residuals) / float(n_points)

In [None]:
# Source: https://github.com/mattnedrich/GradientDescentExample

# 梯度下降用于线性回归
def step_gradient(b_current, m_current, points, learningRate):
    """Perform one gradient-descent step for the line y = m*x + b.

    `points` is indexed as points[i, 0] (x) and points[i, 1] (y).
    Returns the updated parameters as the list [new_b, new_m].
    """
    n_points = len(points)
    N = float(n_points)
    grad_b = 0
    grad_m = 0
    for row in range(n_points):
        x, y = points[row, 0], points[row, 1]
        residual = y - ((m_current * x) + b_current)

        # Gradient of the squared error w.r.t. b and m; the 1/N factor
        # averages the contribution over the N samples.
        grad_b += -(2/N) * residual
        grad_m += -(2/N) * x * residual
    return [
        b_current - (learningRate * grad_b),
        m_current - (learningRate * grad_m),
    ]

In [None]:
# 执行梯度下降的函数
def gradient_descent_runner(points, starting_b, starting_m, learning_rate, num_iterations):
    """Run `num_iterations` gradient-descent steps and return the fitted [b, m].

    points: sample table convertible to a 2-D array (x in column 0, y in column 1).
    starting_b, starting_m: initial intercept and slope.
    learning_rate: step size passed to step_gradient.
    """
    # Local import: the notebook never brings a bare `array` name into scope,
    # so the previous `array(points)` call raised NameError.
    import numpy as np

    # Convert once; the conversion does not depend on the loop state.
    points_arr = np.asarray(points)
    b = starting_b
    m = starting_m
    for _ in range(num_iterations):
        b, m = step_gradient(b, m, points_arr, learning_rate)
    return [b, m]

In [None]:
# 用上面的代码合到一起，跑起来
def run():
    """Fit a line to the points in data_linearreg.csv and print the error before and after.

    Reads a comma-delimited file, then runs 1000 gradient-descent iterations
    starting from b = 0, m = 0 with a learning rate of 0.0001.
    """
    # Local import: the notebook never brings a bare `genfromtxt` name into
    # scope, so the previous unqualified call raised NameError.
    import numpy as np

    points = np.genfromtxt("data_linearreg.csv", delimiter=",")
    learning_rate = 0.0001
    initial_b = 0  # initial y-intercept guess
    initial_m = 0  # initial slope guess
    num_iterations = 1000
    initial_error = compute_error_for_line_given_points(initial_b, initial_m, points)
    print("Starting gradient descent at b = {0}, m = {1}, error = {2}".format(initial_b, initial_m, initial_error))
    print("Running...")
    [b, m] = gradient_descent_runner(points, initial_b, initial_m, learning_rate, num_iterations)
    final_error = compute_error_for_line_given_points(b, m, points)
    print("After {0} iterations b = {1}, m = {2}, error = {3}".format(num_iterations, b, m, final_error))

In [None]:
#source: https://stackoverflow.com/questions/3949226/calculating-pearson-correlation-and-significance-in-python

import math
import numpy as np
from random import random

# 计算 R Score
def pcc(X, Y):
    """Compute the Pearson correlation coefficient of X and Y.

    X, Y: array-likes of equal shape. They are NOT modified: the computation
    works on float copies, so integer inputs are accepted and the caller's
    arrays keep their original values.
    Returns the mean of the element-wise product of the standardised inputs.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    # Centre X and Y (out of place, unlike the previous `X -= ...`).
    X_centered = X - X.mean(0)
    Y_centered = Y - Y.mean(0)
    # Scale each to unit standard deviation.
    X_standard = X_centered / X_centered.std(0)
    Y_standard = Y_centered / Y_centered.std(0)
    # Mean product of the standardised values.
    return np.mean(X_standard * Y_standard)
 
def average(x):
    """Return the arithmetic mean of the non-empty sequence `x` (asserts len(x) > 0)."""
    count = len(x)
    assert count > 0
    total = float(sum(x))
    return total / count

def pearson_def(x, y):
    """Compute the Pearson correlation coefficient directly from its definition.

    x, y: equal-length, non-empty sequences (both conditions are asserted).
    Returns sum(dx*dy) / sqrt(sum(dx^2) * sum(dy^2)), where dx and dy are the
    deviations of each element from the respective mean.
    """
    n = len(x)
    assert n == len(y)
    assert n > 0
    mean_x, mean_y = average(x), average(y)
    cross_sum = 0
    x_sq_sum = 0
    y_sq_sum = 0
    for x_val, y_val in zip(x, y):
        dx = x_val - mean_x
        dy = y_val - mean_y
        cross_sum += dx * dy
        x_sq_sum += dx * dx
        y_sq_sum += dy * dy

    # Covariance term divided by the product of the two spread terms.
    return cross_sum / math.sqrt(x_sq_sum * y_sq_sum)

#main

# Using it on a random example

# Two samples of 100 values each, drawn with random().
X = np.array([random() for _ in range(100)])
Y = np.array([random() for _ in range(100)])

# Two equivalent ways of computing the coefficient.
pcof = pcc(X, Y)
print(pcof, ' is pcof')

pcoftwo = pearson_def(X, Y)
print(pcoftwo, ' is pcof second version')

# Week 2

In [None]:
#Source: https://machinelearningmastery.com/implement-logistic-regression-stochastic-gradient-descent-scratch-python/

from math import exp

# Make a prediction with coefficients
# 预测输出的概率 
# 先计算 y_hat， 再使用 sigmoid函数转化为概率
def predict(row, coefficients):
    """Return the predicted probability for one sample.

    row: feature values followed by one trailing element (the label in this
        notebook's dataset); the last element is not used for the prediction.
    coefficients: coefficients[0] is the intercept, coefficients[i + 1]
        multiplies row[i].
    Returns sigmoid(yhat) as a float in [0, 1].
    """
    yhat = coefficients[0]
    for i in range(len(row) - 1):
        yhat += coefficients[i + 1] * row[i]
    # Numerically stable sigmoid: only ever exponentiate a non-positive value,
    # so a strongly negative yhat no longer raises OverflowError in exp().
    if yhat >= 0:
        return 1.0 / (1.0 + exp(-yhat))
    exp_yhat = exp(yhat)
    return exp_yhat / (1.0 + exp_yhat)

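# Estimate logistic regression coefficients using stochastic gradient descent.
# Gradient descent updates the parameters (weights w and bias b).
# Note: the coefficients are updated after every single sample, visiting the samples in
# their fixed dataset order (no shuffling), so this is per-sample (online) descent rather
# than full-batch GD. The loss is the squared error; sum_error is the per-epoch SSE.
# (A batch variant based on MSE would also average the gradient, i.e. multiply it by 1/N.)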
def coefficients_sgd(train, l_rate, n_epoch):
    """Fit logistic-regression coefficients with per-sample gradient updates.

    train: sequence of rows; each row holds the features followed by the label.
    l_rate: learning rate.
    n_epoch: number of passes over `train`.
    Prints the epoch index, learning rate and summed squared error after every
    epoch and returns the coefficient list (intercept first).
    """
    # One coefficient per row element: the intercept plus one weight per feature.
    coef = [0.0] * len(train[0])
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            # Predicted probability and its difference from the label.
            yhat = predict(row, coef)
            error = row[-1] - yhat
            sum_error += error**2
            # Learning rate * error * sigmoid derivative, shared by all updates
            # for this sample.
            scaled_step = l_rate * error * yhat * (1.0 - yhat)
            # Intercept update.
            coef[0] = coef[0] + scaled_step
            # Weight updates, each scaled by its own feature value.
            for j in range(len(row) - 1):
                coef[j + 1] = coef[j + 1] + scaled_step * row[j]
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
    return coef

# Calculate coefficients
# Ten samples: two feature columns followed by a 0/1 label column.
dataset = [
    [2.7810836, 2.550537003, 0],
    [1.465489372, 2.362125076, 0],
    [3.396561688, 4.400293529, 0],
    [1.38807019, 1.850220317, 0],
    [3.06407232, 3.005305973, 0],
    [7.627531214, 2.759262235, 1],
    [5.332441248, 2.088626775, 1],
    [6.922596716, 1.77106367, 1],
    [8.675418651, -0.242068655, 1],
    [7.673756466, 3.508563011, 1],
]
l_rate = 0.3
n_epoch = 10

# Train for n_epoch passes and show the fitted coefficients.
coef = coefficients_sgd(dataset, l_rate, n_epoch)
print(coef)

In [None]:
 
 # by R. Chandra
 #Source: https://github.com/rohitash-chandra/logistic_regression

from math import exp
import numpy as np
import random

# Activation-function selectors stored in `logistic_regression.use_activation`.
SIGMOID = 1
STEP = 2
LINEAR = 3


random.seed()


class logistic_regression:
    """Single-layer model z = x.w - b with a selectable activation, trained by SGD or GD.

    The training loss is the squared error between the activation output and
    the target columns of the data. Each data row holds `num_features` input
    columns followed by the output column(s).
    """

    def __init__(self, num_epocs, train_data, test_data, num_features, learn_rate):
        """Store the data and hyper-parameters and draw random initial parameters.

        num_epocs: number of training epochs.
        train_data: 2-D array of training rows (features, then outputs).
        test_data: 2-D array of test rows (stored; not evaluated by SGD/GD).
        num_features: number of leading feature columns in each row.
        learn_rate: learning rate.
        """
        self.train_data = train_data
        self.test_data = test_data
        self.num_features = num_features
        # Output dimension: total column count minus the feature columns.
        self.num_outputs = self.train_data.shape[1] - num_features
        # Number of training samples.
        self.num_train = self.train_data.shape[0]
        self.w = np.random.uniform(-0.5, 0.5, num_features)  # in case one output class
        self.b = np.random.uniform(-0.5, 0.5, self.num_outputs)
        self.learn_rate = learn_rate
        self.max_epoch = num_epocs
        self.use_activation = SIGMOID  # one of SIGMOID, STEP, LINEAR
        # Error signal used by update(); set by SGD()/GD() from gradient().
        self.out_delta = np.zeros(self.num_outputs)

        print(self.w, ' self.w init')
        print(self.b, ' self.b init')
        print(self.out_delta, ' outdel init')

    def activation_func(self, z_vec):
        """Map the pre-activation `z_vec` to the model output."""
        if self.use_activation == SIGMOID:
            # Logistic function 1 / (1 + e^(-z)); the previous code used e^(+z),
            # which mirrors the curve (large z gave outputs near 0).
            y = 1 / (1 + np.exp(-z_vec))
        elif self.use_activation == STEP:
            # 1 where z > 0, else 0.
            # https://stackoverflow.com/questions/32726701/convert-real-valued-numpy-array-to-binary-array-by-sign
            y = (z_vec > 0).astype(int)
        else:
            y = z_vec
        return y

    def predict(self, x_vec):
        """Return the activation output for one feature vector `x_vec`."""
        z_vec = x_vec.dot(self.w) - self.b
        return self.activation_func(z_vec)

    def gradient(self, x_vec, output, actual):
        """Return the error signal for one sample.

        For the sigmoid this is (output - actual) * output * (1 - output), i.e.
        the derivative of 0.5 * (output - actual)^2 with respect to z; for the
        linear and step activations it is simply (output - actual).
        `x_vec` is accepted for interface compatibility and is not used.
        """
        if self.use_activation == SIGMOID:
            out_delta = (output - actual) * (output * (1 - output))
        else:  # linear and step activations
            out_delta = (output - actual)
        return out_delta

    def update(self, x_vec, output, actual):
        """Apply one descent step using `self.out_delta` and the sample `x_vec`.

        Because z = x.w - b, dz/dw = x and dz/db = -1, so descending the loss
        means subtracting for w and adding for b. (The previous `w +=` moved
        the weights up the gradient.) `output` and `actual` are unused.
        """
        self.w -= self.learn_rate * (x_vec * self.out_delta)
        self.b += self.learn_rate * self.out_delta

    def squared_error(self, prediction, actual):
        """Return the sum of squared differences divided by the number of outputs."""
        return np.sum(np.square(prediction - actual)) / prediction.shape[0]  # to cater more in one output/class

    def test_model(self, data, tolerance):
        """Evaluate the model on `data` and return (rmse, percentage_correct).

        A prediction is mapped to class 1 when it exceeds (1 - tolerance), else
        to 0; a sample counts as correct when every output matches. Prints one
        line per sample and a summary line.
        """
        num_instances = data.shape[0]
        class_perf = 0
        sum_sqer = 0

        for s in range(num_instances):
            # Read from `data`; the previous code indexed self.train_data here,
            # so the argument was ignored.
            input_instance = data[s, 0:self.num_features]
            actual = data[s, self.num_features:]
            prediction = self.predict(input_instance)
            sum_sqer += self.squared_error(prediction, actual)

            # Threshold the output into a 0/1 class decision.
            pred_binary = np.where(prediction > (1 - tolerance), 1, 0)

            print(s, actual, prediction, pred_binary, sum_sqer, ' s, actual, prediction, sum_sqer')

            if (actual == pred_binary).all():
                class_perf = class_perf + 1

        rmse = np.sqrt(sum_sqer / num_instances)
        percentage_correct = float(class_perf / num_instances) * 100

        print(percentage_correct, rmse, ' class_perf, rmse')
        # note RMSE is not a good measure for multi-class probs

        return (rmse, percentage_correct)

    def _evaluate(self):
        """Evaluate on the training data and build the tuple returned by SGD()/GD()."""
        rmse_train, train_perc = self.test_model(self.train_data, 0.3)
        # Test-set evaluation is not performed; zeros are returned as placeholders.
        rmse_test = 0
        test_perc = 0
        return (train_perc, test_perc, rmse_train, rmse_test)

    def SGD(self, shuffle=True):
        """Train with per-sample updates and return (train_perc, test_perc, rmse_train, rmse_test).

        shuffle: when True (default) each step uses a sample index drawn with
            random.randint; when False the samples are visited in order
            (previously the index was undefined in that case).
        Prints the epoch index, summed squared error, w and b after each epoch.
        """
        for epoch in range(self.max_epoch):
            sum_sqer = 0
            for s in range(self.num_train):
                i = random.randint(0, self.num_train - 1) if shuffle else s

                input_instance = self.train_data[i, 0:self.num_features]
                actual = self.train_data[i, self.num_features:]
                prediction = self.predict(input_instance)
                sum_sqer += self.squared_error(prediction, actual)
                # Error signal of this sample only, applied immediately.
                self.out_delta = self.gradient(input_instance, prediction, actual)
                self.update(input_instance, prediction, actual)

            print(epoch, sum_sqer, self.w, self.b)

        return self._evaluate()

    def GD(self):
        """Train with full-batch updates and return (train_perc, test_perc, rmse_train, rmse_test).

        Each epoch averages the gradient over all training samples and applies
        a single update. The previous code kept adding to `self.out_delta`
        without ever resetting it and still updated after every sample, so the
        applied step was a running sum of error signals multiplied by whichever
        sample came last.
        Prints the epoch index, summed squared error, w and b after each epoch.
        """
        for epoch in range(self.max_epoch):
            sum_sqer = 0
            grad_w = np.zeros(self.w.shape)
            delta_sum = np.zeros(self.num_outputs)
            for s in range(self.num_train):
                input_instance = self.train_data[s, 0:self.num_features]
                actual = self.train_data[s, self.num_features:]
                prediction = self.predict(input_instance)
                sum_sqer += self.squared_error(prediction, actual)
                delta = self.gradient(input_instance, prediction, actual)
                grad_w += input_instance * delta
                delta_sum += delta

            # One update per epoch with the mean gradient; signs follow
            # z = x.w - b (see update()).
            self.out_delta = delta_sum / self.num_train
            self.w -= self.learn_rate * (grad_w / self.num_train)
            self.b += self.learn_rate * self.out_delta

            print(epoch, sum_sqer, self.w, self.b)

        return self._evaluate()
				
	
 

#------------------------------------------------------------------
#MAIN



def main():
    """Train the model on a small two-class dataset and then on the XOR table.

    For each dataset a fresh model is built (the training data doubles as test
    data) and trained with SGD followed by GD on the same instance.
    """

    def fit_sgd_then_gd(data, epochs, rate, n_features):
        # Same construction and call order for both demos: SGD first, then GD.
        model = logistic_regression(epochs, data, data, n_features, rate)
        model.SGD()
        model.GD()

    random.seed()

    # Ten samples: two feature columns followed by a 0/1 label column.
    train_data = np.asarray([
        [2.7810836, 2.550537003, 0],
        [1.465489372, 2.362125076, 0],
        [3.396561688, 4.400293529, 0],
        [1.38807019, 1.850220317, 0],
        [3.06407232, 3.005305973, 0],
        [7.627531214, 2.759262235, 1],
        [5.332441248, 2.088626775, 1],
        [6.922596716, 1.77106367, 1],
        [8.675418651, -0.242068655, 1],
        [7.673756466, 3.508563011, 1],
    ])
    print(train_data)
    fit_sgd_then_gd(train_data, 20, 0.3, 2)

    # XOR truth table: two inputs followed by the XOR output.
    xor_data = np.asarray([
        [0, 0, 0],
        [0, 1, 1],
        [1, 0, 1],
        [1, 1, 0],
    ])
    fit_sgd_then_gd(xor_data, 20, 0.9, 2)


if __name__ == "__main__":
    main()

In [7]:
import numpy as np
# 对数损失计算函数
def loss(h, y, eps=1e-15):
    """Return the mean log loss (binary cross-entropy) of predictions `h` against targets `y`.

    h: array-like of predicted values in [0, 1].
    y: array-like of targets with the same shape as `h`.
    eps: predictions are clipped to [eps, 1 - eps] before taking logarithms, so
        an exact 0 or 1 in `h` no longer yields nan/inf. Values strictly inside
        that interval are left unchanged.
    """
    h = np.clip(np.asarray(h, dtype=float), eps, 1 - eps)
    y = np.asarray(y, dtype=float)
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

# 下面是例子：用这个函数做了两个计算
# Example: evaluate the loss twice on the same five random predictions.
h = np.random.rand(5)
print(h, ' h')
y = np.random.rand(5)
print(y, ' y')

# Targets are random real values (regression / prediction case).
log_loss = loss(h, y)
print(log_loss)

# Targets are all ones (classification case, assuming every sample is class 1).
y_ = np.ones(5)
print(y_, ' y_')
log_loss = loss(h, y_)
print(log_loss)

[0.29929918 0.86508897 0.02141501 0.58179658 0.52617156]  h
[0.67575181 0.12409989 0.26588349 0.61875227 0.17804302]  y
1.0273292766693767
[1. 1. 1. 1. 1.]  y_
1.2757319854390659
