In [187]:
from sklearn import datasets, decomposition
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [188]:
data = datasets.california_housing.fetch_california_housing()
dframe = pd.DataFrame(data = data.data, columns=data.feature_names)

In [189]:
def aggregation(d):
    return d.aggregate({"var", "mean", "sum"})

print('1) Aggregation:' "\n")
print(aggregation(dframe))

1) Aggregation:

            MedInc       HouseAge       AveRooms     AveBedrms    Population  \
var       3.609323     158.396260       6.121533      0.224592  1.282470e+06   
sum   79890.649500  591119.000000  112054.554679  22635.375088  2.942184e+07   
mean      3.870671      28.639486       5.429000      1.096675  1.425477e+03   

          AveOccup       Latitude     Longitude  
var     107.870026       4.562293  4.014139e+00  
sum   63378.322491  735441.620000 -2.467919e+06  
mean      3.070655      35.631861 -1.195697e+02  


In [196]:
def sampling(d):
    return d.sample(n=10)

print(' 2) Sampling:' "\n")
print(sampling(dframe))

 2) Sampling:

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
2795   2.3250      30.0  6.908356   1.428571       861.0  2.320755     36.94   
13391  3.9755      11.0  4.802000   0.930000      1757.0  3.514000     34.09   
16638  3.2112      15.0  6.517308   1.353846      1309.0  2.517308     35.60   
16045  4.2562      40.0  5.747449   1.099490      1382.0  3.525510     37.71   
12341  5.7613      23.0  8.647399   1.624277       676.0  1.953757     33.84   
19721  2.1250      40.0  6.213235   1.139706       343.0  2.522059     39.14   
18017  1.8542      17.0  4.789116   1.193878       636.0  2.163265     37.26   
151    5.3509      47.0  5.650672   1.028791      1034.0  1.984645     37.81   
13164  4.1875       1.0  5.816358   1.032407      2061.0  3.180556     36.84   
10645  5.8371      30.0  5.482759   1.135279       724.0  1.920424     33.55   

       Longitude  Population/MedInc  
2795     -118.31         370.322581  
13391    -117.59         441

In [204]:
def dimensionalityReduction(d):
    pca = decomposition.PCA(n_components=2)
    pca.fit(d)
    columns = ['PCA-%i' % i for i in range(pca.n_components_)]
    return pd.DataFrame(pca.transform(dframe), columns=columns, index=dframe.index).head()


print('3) Dimensionality Reduction: ' "\n")
print(dimensionalityReduction(dframe))

3) Dimensionality Reduction: 

         PCA-0       PCA-1
0 -1175.392724  -69.659056
1   885.940554 -438.709523
2 -1000.364440  -92.125786
3  -932.141217  -81.057366
4  -911.393386  -37.186431


In [192]:
def featureSubsetSelection(d):
    return d[['HouseAge', 'Population', 'MedInc']][0:10]

print('4) Feature Subset Selection:' "\n")
print(featureSubsetSelection(dframe))

4) Feature Subset Selection:

   HouseAge  Population  MedInc
0      41.0       322.0  8.3252
1      21.0      2401.0  8.3014
2      52.0       496.0  7.2574
3      52.0       558.0  5.6431
4      52.0       565.0  3.8462
5      52.0       413.0  4.0368
6      52.0      1094.0  3.6591
7      52.0      1157.0  3.1200
8      42.0      1206.0  2.0804
9      52.0      1551.0  3.6912


In [193]:
def featureCreation(d):
    d['Population/MedInc'] = d['Population'] / d['MedInc']
    return d.head(10)

print('5) Feature Creation' "\n")
print(featureCreation(dframe))

5) Feature Creation

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
5  4.0368      52.0  4.761658   1.103627       413.0  2.139896     37.85   
6  3.6591      52.0  4.931907   0.951362      1094.0  2.128405     37.84   
7  3.1200      52.0  4.797527   1.061824      1157.0  1.788253     37.84   
8  2.0804      42.0  4.294118   1.117647      1206.0  2.026891     37.84   
9  3.6912      52.0  4.970588   0.990196      1551.0  2.172269     37.84   

   Longitude  Population/MedInc  
0    -122.23          38.677749  
1    -122.22         289.228323  
2    -122.24          68.344035  
3    -

In [194]:
def discretizationandBinarization(d):
    binary = d.HouseAge[0:20]
    return pd.get_dummies(binary)

print('6) Discretization and Binarization ' "\n")
print(discretizationandBinarization(dframe))

6) Discretization and Binarization 

    21.0  41.0  42.0  50.0  52.0
0      0     1     0     0     0
1      1     0     0     0     0
2      0     0     0     0     1
3      0     0     0     0     1
4      0     0     0     0     1
5      0     0     0     0     1
6      0     0     0     0     1
7      0     0     0     0     1
8      0     0     1     0     0
9      0     0     0     0     1
10     0     0     0     0     1
11     0     0     0     0     1
12     0     0     0     0     1
13     0     0     0     0     1
14     0     0     0     0     1
15     0     0     0     1     0
16     0     0     0     0     1
17     0     0     0     0     1
18     0     0     0     1     0
19     0     0     0     0     1


In [195]:
def attributeTransformation(d):
    d['HouseAge'] = d['HouseAge'].transform(lambda i: i - 5)
    return d.head()

print('7) Attribute Transformation ' "\n")
print(attributeTransformation(dframe))

7) Attribute Transformation 

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      36.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      16.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      47.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      47.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      47.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  Population/MedInc  
0    -122.23          38.677749  
1    -122.22         289.228323  
2    -122.24          68.344035  
3    -122.25          98.881820  
4    -122.25         146.898237  
