In [16]:
import numpy as np
import pandas as pd
data_url = "http://lib.stat.cmu.edu/datasets/boston"
boston=raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
X, y = data,target

In [7]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(X,y)

In [8]:
print (regression.score(X,y))


0.7406426641094095


R2, also known as coefficient of determination, is a measure ranging from 0 to 1. It shows how using a regression model is better in predicting the response than using a simple mean.

In [14]:
mean_y = np.mean(y)
squared_errors_mean = np.sum((y-mean_y)**2)
squared_errors_model = np.sum((y-regression.predict(X))**2)
R2 = 1- (squared_errors_model / squared_errors_mean)
print (R2)

0.7406426641094095


Mixing Variables of Different Types

In [22]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
lbl = LabelEncoder()
enc = OneHotEncoder()
qualitative = ['red', 'red', 'green', 'blue',
 'red', 'blue', 'blue', 'green']
labels = lbl.fit_transform(qualitative).reshape(8,1)
print(enc.fit_transform(labels).toarray())

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]


In [29]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
pf = PolynomialFeatures(degree=2)
poly_X = pf.fit_transform(X)
X_train, X_test, y_train, y_test =train_test_split(poly_X,y, test_size=0.33, random_state=42)
reg_regression = Ridge(alpha=0.1)
reg_regression.fit(X_train,y_train)
print ('R2: %0.3f'% r2_score(y_test,reg_regression.predict(X_test)))

R2: 0.671


Switching to Probabilities


In [30]:
a = np.array([0, 0, 0, 0, 1, 1, 1, 1])
b = np.array([1, 2, 3, 4, 5, 6, 7, 8]).reshape(8,1)
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(b,a)
print (regression.predict(b)>0.5)

[False False False False  True  True  True  True]


In statistics, linear regression can’t solve classification problems because doing so  would create a series of violated statistical assumptions.So, for statistics, using  regression models for classification purposes is mainly a theoretical problem, not a practical one.

#Logistic regression
Logistic regression is the same as a linear regression except that the y data contains  integer numbers indicating the class relative to the observation

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
binary_y = np.array(y >= 40).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X,
 binary_y, test_size=0.33, random_state=5)
logistic = LogisticRegression()
logistic.fit(X_train,y_train)
from sklearn.metrics import accuracy_score
print('In-sample accuracy: %0.3f' %
 accuracy_score(y_train, logistic.predict(X_train)))
print('Out-of-sample accuracy: %0.3f' %
 accuracy_score(y_test, logistic.predict(X_test)))

In-sample accuracy: 0.982
Out-of-sample accuracy: 0.964


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
for var,coef in zip(boston.columns,logistic.coef_[0]):
 print ("%7s : %7.3f" %(var, coef))

      0 :   0.071
      1 :   0.005
      2 :   0.106
      3 :  -0.151
      4 :   0.012
      5 :   1.084
      6 :  -0.006
      7 :  -0.258
      8 :   0.541
      9 :  -0.013
     10 :  -0.935


In [35]:
print('\nclasses:',logistic.classes_)
print('\nProbs:\n',logistic.predict_proba(X_test)[:3,:])


classes: [0 1]

Probs:
 [[2.42452288e-01 7.57547712e-01]
 [9.88675290e-01 1.13247096e-02]
 [9.99866863e-01 1.33137077e-04]]


In [54]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)
check = [2**i for i in range(8)]
for i in range(2**7+1):
 X_train = np.column_stack((X_train,np.random.random(
 X_train.shape[0])))
 X_test = np.column_stack((X_test,np.random.random(
 X_test.shape[0])))
 regression.fit(X_train, y_train)
 if i in check:
  print ("Random features: %i -> R2: %0.3f" %(i, r2_score(y_train,regression.predict(X_train))))
regression.fit(X_train, y_train)
print ('R2 %0.3f'% r2_score(y_test,regression.predict(X_test)))

Random features: 1 -> R2: 0.740
Random features: 2 -> R2: 0.740
Random features: 4 -> R2: 0.740
Random features: 8 -> R2: 0.741
Random features: 16 -> R2: 0.749
Random features: 32 -> R2: 0.763
Random features: 64 -> R2: 0.777
Random features: 128 -> R2: 0.828
R2 0.574


solving overfitting by using selection(L1 regularization)


In [55]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
pf = PolynomialFeatures(degree=2)
poly_X = pf.fit_transform(X)
X_train, X_test, y_train, y_test =train_test_split(poly_X,y, test_size=0.33, random_state=42)
from sklearn.linear_model import Ridge
reg_regression = Ridge(alpha=0.1)
reg_regression.fit(X_train,y_train)
print ('R2: %0.3f'% r2_score(y_test,reg_regression.predict(X_test)))

R2: 0.671


Stochastic gradient descent (SGD)

In [61]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
X_train, X_test, y_train, y_test = train_test_split(X,
 y, test_size=0.33, random_state=42)
SGD = SGDRegressor(penalty=None,
 learning_rate='invscaling',
 eta0=0.01, power_t=0.25)
power = 17
check = [2**i for i in range(power+1)]
for i in range(400):
 for j in range(X_train.shape[0]):
  SGD.partial_fit(X_train[j,:].reshape(1,13),y_train[j].reshape(1,))
  count = (j+1) + X_train.shape[0] * i
  if count in check:
   R2 = r2_score(y_test,SGD.predict(X_test))
   print ('Example %6i R2 %0.3f coef: %s' % (count, R2, ' '.join(map(lambda x:'%0.3f' %x, SGD.coef_))))

Example      1 R2 -50921912.058 coef: 1.494 0.000 2.643 0.000 0.090 0.903 14.118 0.317 3.504 97.236 2.949 55.436 2.632
Example      2 R2 -183819380450548.344 coef: -293.139 0.000 -3566.759 0.000 -235.824 -2548.106 -39582.557 -2052.747 -1750.502 -134522.690 -9205.580 -165206.967 -6491.573
Example      4 R2 -14381138955663548838848757760.000 coef: -258278207.000 10594077.424 -64384527889.036 0.000 -1413191220.824 -13882749836.211 -193742122542.820 -4892542877.195 -9280723651.388 -1650138110663.840 -46644048455.010 -920963891609.035 -30972328802.530
Example      8 R2 -22702562760396036842276782080.000 coef: -13611279226.270 10594077.424 -32469090356.021 0.000 -853300033.288 -3556563627.400 343294318175.594 -19441333652.354 -119756309767.644 -2347290036966.946 -59290460861.661 -918467320846.001 -62119498261.053
Example     16 R2 -23380239158061074778628816896.000 coef: -15264680678.199 -136439563013.287 109194758948.419 0.000 6733396884.820 55004370384.780 1138897188313.265 -6154145265.743