In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib inline

In [7]:
# Load the California housing dataset. Later cells read its 'data', 'target'
# and 'feature_names' entries.
from sklearn.datasets import fetch_california_housing
data = fetch_california_housing()

In [24]:
# Split the dataset bundle into the feature matrix (X) and the regression target (y).
X, y = data['data'], data['target']

In [30]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares fitted on the full dataset.
# fit() returns the estimator itself, so construction and fitting are chained.
reg = LinearRegression().fit(X, y)

# Predictions are made on the same rows the model was fitted on (in-sample).
pred = reg.predict(X)

# Side-by-side table of the true targets and the fitted values.
pd.DataFrame({'Actual': y, 'pred': pred})

Unnamed: 0,Actual,pred
0,4.526,4.131650
1,3.585,3.976606
2,3.521,3.676571
3,3.413,3.241598
4,3.422,2.413587
...,...,...
20635,0.781,0.133656
20636,0.771,0.553528
20637,0.923,0.171251
20638,0.847,0.319105


In [31]:
# Fitted slope coefficients of the linear model, one per column of X.
reg.coef_

array([ 4.36693293e-01,  9.43577803e-03, -1.07322041e-01,  6.45065694e-01,
       -3.97638942e-06, -3.78654265e-03, -4.21314378e-01, -4.34513755e-01])

In [32]:
# Fitted intercept (bias term) of the linear model.
reg.intercept_

-36.941920207184516

In [33]:
# NOTE: scored on the same data the model was fitted on, so this is an in-sample figure.
reg.score(X,y) # score() returns R^2 for regressors; for classifiers it returns mean accuracy.

0.6062326851998051

In [35]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees as a non-linear comparison to the linear baseline.
# random_state is pinned so that Restart & Run All reproduces the same model:
# with the default (None) the trees are seeded from NumPy's global random state,
# so ties between equally good splits can resolve differently between runs.
obj = GradientBoostingRegressor(random_state=0)
obj.fit(X, y)

# In-sample predictions (same rows that were used for fitting).
y_pred = obj.predict(X)

# Side-by-side table of the true targets and the boosted-model predictions.
pd.DataFrame({'Actual': y, 'pred': y_pred})

Unnamed: 0,Actual,pred
0,4.526,4.264327
1,3.585,3.878645
2,3.521,3.920746
3,3.413,3.303439
4,3.422,2.612814
...,...,...
20635,0.781,0.636165
20636,0.771,0.908736
20637,0.923,0.636647
20638,0.847,0.747593


In [36]:
# R^2 of the gradient-boosting model, evaluated on the data it was fitted on.
obj.score(X,y)

0.8033237500356991

# Standard Scaler

In [51]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean/variance and standardise X in a single call.
scaler = StandardScaler()
X_t = scaler.fit_transform(X)

# One row per feature; columns hold the mean and variance before and after scaling.
stats = np.column_stack([
    X.mean(axis=0),
    X.var(axis=0),
    X_t.mean(axis=0),
    X_t.var(axis=0),
])
pd.DataFrame(
    stats,
    index=data['feature_names'],
    columns=['unscaled mean', 'unscaled variance', 'scaled mean', 'scaled variance'],
)

Unnamed: 0,unscaled mean,unscaled variance,scaled mean,scaled variance
MedInc,3.870671,3.609148,6.6097e-17,1.0
HouseAge,28.639486,158.3886,5.508083e-18,1.0
AveRooms,5.429,6.121236,6.6097e-17,1.0
AveBedrms,1.096675,0.2245806,-1.060306e-16,1.0
Population,1425.476744,1282408.0,-1.101617e-17,1.0
AveOccup,3.070655,107.8648,3.442552e-18,1.0
Latitude,35.631861,4.562072,-1.079584e-15,1.0
Longitude,-119.569704,4.013945,-8.526513e-15,1.0


# Pipeline

In [58]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Three chained stages: standardise the inputs, expand them into polynomial
# terms up to degree 3, then fit a linear regression on the expanded features.
pipe = Pipeline(steps=[
    ('Scaler', StandardScaler()),
    ('Poly', PolynomialFeatures(degree=3)),
    ('Linear', LinearRegression()),
])

In [59]:
# Show the pipeline stages keyed by the names given when the pipeline was built.
pipe.named_steps

{'Scaler': StandardScaler(),
 'Poly': PolynomialFeatures(degree=3),
 'Linear': LinearRegression()}

In [60]:
# Fitting learns every stage; predicting only re-applies the already-fitted
# transformers (transform, not fit_transform) before calling the final estimator.
pipe.fit(X,y) # 1. xt = scaler.fit_transform(X) 2. xt = poly.fit_transform(xt) 3. lin_reg.fit(xt,y)
y_pred = pipe.predict(X) # 1. xt = scaler.transform(X) 2. xt = poly.transform(xt) 3. lin_reg.predict(xt)
pd.DataFrame({'Actual':y,'pred':y_pred})

Unnamed: 0,Actual,pred
0,4.526,4.087953
1,3.585,4.235454
2,3.521,4.201542
3,3.413,3.396868
4,3.422,2.658212
...,...,...
20635,0.781,0.662607
20636,0.771,0.672557
20637,0.923,0.822924
20638,0.847,0.890361


In [61]:
# R^2 of the scaler -> polynomial -> linear pipeline, evaluated on the data it was fitted on.
pipe.score(X,y)

0.7385168108924949

# Feature-Union

In [63]:
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion

# Branch 1: standardise the inputs, then project them onto 4 principal components.
scaler = StandardScaler()
pca = PCA(n_components=4)
pca_pipe = Pipeline(steps=[('Scaler', scaler), ('PCA', pca)])

# Branch 2: keep the 2 input columns with the best f_regression score.
# This branch receives X directly, i.e. without the scaling step of branch 1.
selector = SelectKBest(score_func=f_regression, k=2)

# Concatenate the outputs of both branches and feed them to a linear model.
union = FeatureUnion(transformer_list=[('pca_pipe', pca_pipe), ('Selector', selector)])
pipe = Pipeline(steps=[('Union', union), ('lin_reg', LinearRegression())])
pipe.fit(X, y)

# In-sample predictions next to the true targets.
y_pred = pipe.predict(X)
pd.DataFrame({'Actual': y, 'Pred': y_pred})

Unnamed: 0,Actual,Pred
0,4.526,4.213849
1,3.585,4.002325
2,3.521,3.608752
3,3.413,3.229705
4,3.422,2.201045
...,...,...
20635,0.781,0.943583
20636,0.771,1.307205
20637,0.923,0.849467
20638,0.847,0.971044


In [64]:
# R^2 of the feature-union pipeline, evaluated on the data it was fitted on.
pipe.score(X,y)

0.528813008876781

In [69]:
# Column count before vs. after the feature union:
# 4 PCA components + 2 selected columns = 6 output features.
X.shape[-1], union.transform(X).shape[-1]

(8, 6)

In [71]:
# NOTE(review): leftover interactive docstring lookup (IPython `?` help). Nothing in the
# visible cells uses it; consider removing it before sharing the notebook.
np.percentile?