In [None]:
import numpy as np

In [None]:
# Base 1-D array holding the integers 1..5; the reshape examples below start from it.
vector = np.arange(1, 6)

In [None]:
# A row vector has shape (1, N): a single row holding all five values.
row_vector = vector.reshape((1, 5))
# Equivalent spelling without the tuple: vector.reshape(1, 5)
row_vector

In [None]:
# A column vector has shape (N, 1): five rows with one value each.
column_vector = vector.reshape((5, 1))
# Equivalent spelling without the tuple: vector.reshape(5, 1)
column_vector

In [None]:
# One feature observed five times, laid out as (observations, features) = (5, 1).
# This is the same layout the notebook later builds for X before fitting
# scikit-learn's LinearRegression.
single_feature_matrix = vector.reshape((5, 1))
# Equivalent spelling without the tuple: vector.reshape(5, 1)
single_feature_matrix

In [None]:
# 3 x 5 matrix holding the consecutive integers 1..15, filled row by row.
multiple_feature_matrix = np.arange(1, 16).reshape((3, 5))
multiple_feature_matrix

In [None]:
# Same 3 x 5 matrix as above, this time obtained by reshaping a flat vector of 1..15.
vector = np.arange(1, 16)
multiple_feature_matrix = vector.reshape((3, 5))
multiple_feature_matrix

In [None]:
# 5 x 3 matrix filled with 0.0
all_zeros = np.zeros(shape=(5, 3))
all_zeros

In [None]:
# 5 x 3 matrix filled with 1.0
all_ones = np.ones(shape=(5, 3))
all_ones

In [None]:
## Boston Housing Dataset:   https://codedragon.tistory.com/8402

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_boston
# Load the two housing datasets. Later cells read boston.data,
# boston.feature_names and boston.target.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this call (and its import above) presumably requires an older scikit-learn -- confirm.
boston = load_boston() 
# NOTE(review): `california` is not referenced by any other cell visible here.
california = fetch_california_housing()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy.stats as stats
import math

In [None]:
%matplotlib inline 

In [None]:
# Plot the normal density over [-4, 4] for several (mean, variance) pairs.
x = np.linspace(-4, 4, 100)
for mean, variance in [(0, 0.7), (0, 1), (1, 1.5), (-2, 0.5)]:
    # scipy's norm takes the standard deviation as `scale`, not the variance,
    # so convert before evaluating the density.
    sigma = math.sqrt(variance)
    plt.plot(x, stats.norm.pdf(x, loc=mean, scale=sigma))
plt.show()

In [None]:
# Working DataFrame: the Boston feature matrix with one named column per feature.
dataset = pd.DataFrame(data=boston.data, columns=boston.feature_names)
dataset.head(5)

In [None]:
# Append the response as a 'target' column (re-running the cell simply overwrites it).
dataset = dataset.assign(target=boston.target)
dataset.head(5)

In [None]:
# Average of the target column, used below as a constant baseline prediction.
mean_expected_value = dataset.loc[:, 'target'].mean()
mean_expected_value

In [None]:
# Squared error of predicting the mean for every observation, and their total.
Squared_errors = (mean_expected_value - dataset['target']).pow(2)
SSE = Squared_errors.sum()
print('Sum of Squared Errors (SSE): %01.f' % SSE)

In [None]:
# Histogram of the squared errors (the variable keeps its original name).
density_plot = Squared_errors.plot.hist()

In [None]:
def standard_deviation(variable, bias=0):
    """Return the standard deviation of `variable`.

    The squared deviations from the mean are summed and divided by
    ``len(variable) - min(bias, 1)`` before taking the square root, so the
    default ``bias=0`` divides by n and any ``bias >= 1`` divides by n - 1.
    """
    n_obs = float(len(variable))
    denominator = n_obs - min(bias, 1)
    squared_deviations = (variable - np.mean(variable)) ** 2
    return np.sqrt(np.sum(squared_deviations) / denominator)

# Sanity check: the hand-written estimator (default bias=0) next to np.std on the RM column.
print ('Our function\'s result: %0.5f against Numpy\'s: %0.5f' % (standard_deviation(dataset['RM']), np.std(dataset['RM'])))

In [None]:
## Pearson linear correlation coefficient: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html

In [None]:
def covariance(variable_1, variable_2, bias=0):
    """Return the covariance of two equally long variables.

    The products of the paired deviations from each mean are summed and
    divided by ``len(variable_1) - min(bias, 1)``: the default ``bias=0``
    divides by n, any ``bias >= 1`` divides by n - 1.
    """
    n_obs = float(len(variable_1))
    centered_1 = variable_1 - np.mean(variable_1)
    centered_2 = variable_2 - np.mean(variable_2)
    return np.sum(centered_1 * centered_2) / (n_obs - min(bias, 1))

def standardize(variable):
    """Return `variable` shifted by its mean and divided by ``np.std(variable)``."""
    center = np.mean(variable)
    spread = np.std(variable)
    return (variable - center) / spread

def correlation(var1,var2,bias=0):
    """Return the Pearson correlation coefficient of `var1` and `var2`.

    The covariance is divided by the product of the two standard deviations,
    with all three computed using the same divisor,
    ``len(var1) - min(bias, 1)``. The divisor therefore cancels and the
    result is a proper correlation for any `bias`. Standardizing with the
    divide-by-n deviation and then dividing the covariance by n - 1 would
    instead inflate the value by n / (n - 1) when ``bias=1``.
    """
    # Same degrees-of-freedom correction that covariance() applies internally.
    ddof = min(bias, 1)
    spread_product = np.std(var1, ddof=ddof) * np.std(var2, ddof=ddof)
    return covariance(var1, var2, bias) / spread_product

# pearsonr is part of the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module path.
from scipy.stats import pearsonr
# Compare the hand-written estimate with SciPy's (first element of pearsonr's result).
print ('Our correlation estimation: %0.5f' % (correlation(dataset['RM'], dataset['target'])))
print ('Correlation from Scipy pearsonr estimation: %0.5f' % pearsonr(dataset['RM'], dataset['target'])[0])

In [None]:
# Scatter of RM against the target, with dashed red lines marking each variable's mean.
x_range = [dataset['RM'].min(), dataset['RM'].max()]
y_range = [dataset['target'].min(), dataset['target'].max()]
scatter_plot = dataset.plot.scatter(x='RM', y='target', xlim=x_range, ylim=y_range)
# Horizontal line at the mean of the target, spanning the x range.
meanY = scatter_plot.plot(x_range, [dataset['target'].mean()] * 2, '--', color='red', linewidth=1)
# Vertical line at the mean of RM, spanning the y range.
meanX = scatter_plot.plot([dataset['RM'].mean()] * 2, y_range, '--', color='red', linewidth=1)

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Response and predictor for the statsmodels array interface.
y = dataset['target']
# add_constant supplies the constant term alongside RM so the fit has an intercept.
X = sm.add_constant(dataset['RM'])

In [None]:
# Peek at the first five rows of the design matrix.
X.head(5)

In [None]:
# Formula interface: regress target on RM, resolving the names against `dataset`.
# Both names are rebound by the array-interface fit in the next cell.
linear_regression = smf.ols('target ~ RM', data=dataset)
fitted_model = linear_regression.fit()

In [None]:
# Array interface: y is the response (endog), X the design matrix with its constant (exog).
linear_regression = sm.OLS(endog=y, exog=X)
fitted_model = linear_regression.fit()
fitted_model.summary()

In [None]:
# Show the estimated coefficients, keep a plain-array copy of them, and
# compute the in-sample predictions for the training design matrix.
print(fitted_model.params)
betas = np.array(fitted_model.params)
fitted_values = fitted_model.predict(exog=X)

In [None]:
RM = 5
# A single observation as an explicit 2-D row: [constant, RM].
Xp = np.array([[1, RM]])
# predict returns one value per row; take element 0 so that %f formats a scalar
# rather than relying on implicit conversion of a length-1 array.
print ("Our model predicts if RM = %01.f the answer value is %0.1f" % (RM, fitted_model.predict(Xp)[0]))

In [None]:
# Same scatter and mean lines as before, now with the fitted regression line on top.
x_range = [dataset['RM'].min(), dataset['RM'].max()]
y_range = [dataset['target'].min(), dataset['target'].max()]
scatter_plot = dataset.plot.scatter(x='RM', y='target', xlim=x_range, ylim=y_range)
# Horizontal line at the mean of the target, spanning the x range.
meanY = scatter_plot.plot(x_range, [dataset['target'].mean()] * 2, '--', color='red', linewidth=1)
# Vertical line at the mean of RM, spanning the y range.
meanX = scatter_plot.plot([dataset['RM'].mean()] * 2, y_range, '--', color='red', linewidth=1)
# Fitted values drawn against RM in orange.
regression_line = scatter_plot.plot(dataset['RM'], fitted_values, '-', color='orange', linewidth=1)

In [None]:
# Reproduce the model's predictions by hand: design matrix times the coefficient vector.
predictions_by_dot_product = np.dot(X,betas)
# Print the first ten values from each route so they can be compared by eye.
print ("Using the prediction method: %s" % fitted_values[:10])
print ("Using betas and a dot product: %s" % predictions_by_dot_product[:10])

In [None]:
# Residuals: observed target minus fitted value, then rescaled with standardize().
residuals = dataset['target'].sub(fitted_values)
normalized_residuals = standardize(residuals)

In [None]:
# Standardized residuals against RM, with a solid zero line and dashed guides at +/-3.
residual_scatter_plot = plt.plot(dataset['RM'], normalized_residuals, 'bp')
plt.xlabel('RM')
plt.ylabel('Normalized residuals')
# Horizontal extent shared by the three reference lines: x_range with its
# lower end truncated by int() and its upper end rounded.
x_span = [int(x_range[0]), round(x_range[1], 0)]
mean_residual = plt.plot(x_span, [0, 0], '-', color='red', linewidth=2)
upper_bound = plt.plot(x_span, [3, 3], '--', color='red', linewidth=1)
lower_bound = plt.plot(x_span, [-3, -3], '--', color='red', linewidth=1)
plt.grid()

In [None]:
from sklearn import linear_model
# Ordinary least squares with an intercept. The `normalize` keyword is omitted:
# it was deprecated in scikit-learn 1.0 and removed in 1.2, and leaving it out
# keeps the same un-normalized fit that normalize=False requested.
linear_regression = linear_model.LinearRegression(fit_intercept=True)

In [None]:
# Number of rows, then the inputs for scikit-learn as plain NumPy arrays.
observations = dataset.shape[0]
X = dataset['RM'].to_numpy().reshape((-1, 1))  # X should always be a matrix, never a vector
y = dataset['target'].to_numpy()  # y can be a vector

In [None]:
# Fit the scikit-learn model on the single-column X and the target vector.
linear_regression.fit(X=X, y=y)

In [None]:
# Slope(s) first, intercept second, one per line.
print(linear_regression.coef_, linear_regression.intercept_, sep='\n')

In [None]:
# First ten in-sample predictions from the scikit-learn model.
print (linear_regression.predict(X)[:10])

In [None]:
# Append a column of ones to X and the intercept to the coefficient list, so a
# single dot product reproduces coef * x + intercept.
Xp = np.hstack((X, np.ones((observations, 1))))
v_coef = [*linear_regression.coef_, linear_regression.intercept_]

In [None]:
# First ten predictions computed by hand from the augmented matrix and coefficient list.
Xp.dot(v_coef)[:10]

In [None]:
from sklearn.datasets import make_regression
# Large synthetic single-feature regression problem (ten million rows) used to
# time the two libraries against each other; the fixed random_state makes it repeatable.
HX, Hy = make_regression(
    n_samples=10_000_000,
    n_features=1,
    n_targets=1,
    random_state=101,
)

In [None]:
%%time
# Time the scikit-learn fit on the synthetic data. The `normalize` keyword is
# omitted: it was deprecated in scikit-learn 1.0 and removed in 1.2, and leaving
# it out keeps the same un-normalized fit that normalize=False requested.
sk_linear_regression = linear_model.LinearRegression(fit_intercept=True)
sk_linear_regression.fit(HX, Hy)

In [None]:
%%time
# Time the statsmodels fit on the same synthetic data (constant added for the intercept).
sm_linear_regression = sm.OLS(endog=Hy, exog=sm.add_constant(HX))
sm_linear_regression.fit()