In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston

In [2]:
# Load the Boston house-prices dataset; the returned object is indexed like a
# dict below ('data', 'target', 'feature_names', 'DESCR').
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the pinned scikit-learn version before re-running this cell.
data_bunch = load_boston()

In [3]:
# Show the dataset's bundled description text (attribute list, instance count).
print(data_bunch.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
# List the entries available on the loaded dataset object; the cells below
# read 'data', 'feature_names', 'target' and 'DESCR' from it.
data_bunch.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [5]:
# Wrap the raw feature matrix in a DataFrame, labelling each column with the
# dataset's own feature names.
X = pd.DataFrame(
    data=data_bunch.data,
    columns=data_bunch.feature_names,
)
# Preview the first two rows.
X.head(n=2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14


In [6]:
# Cardinality check: print how many distinct values each column holds, to
# spot columns that behave like categoricals rather than continuous features.
for name, series in X.items():
    distinct = len(series.unique())
    print(f"Column - {name} # of unique items is : {distinct} \n")

Column - CRIM # of unique items is : 504 

Column - ZN # of unique items is : 26 

Column - INDUS # of unique items is : 76 

Column - CHAS # of unique items is : 2 

Column - NOX # of unique items is : 81 

Column - RM # of unique items is : 446 

Column - AGE # of unique items is : 356 

Column - DIS # of unique items is : 412 

Column - RAD # of unique items is : 9 

Column - TAX # of unique items is : 66 

Column - PTRATIO # of unique items is : 46 

Column - B # of unique items is : 357 

Column - LSTAT # of unique items is : 455 



In [7]:
# RAD and CHAS have only a handful of distinct values, so store them as
# categoricals. Cast to int first so the category labels are 0/1/2...
# rather than 0.0/1.0/2.0.
for name in ('RAD', 'CHAS'):
    X[name] = X[name].astype('int').astype('category')

In [8]:
# Append the dataset's target values as column 'y' so features and target
# travel together in one frame.
X['y'] = data_bunch.target
# Preview the first two rows, now including 'y'.
X.head(n=2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,y
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6


In [14]:
# Emit one "'column' : 'dtype'," line per column, in a form that can be
# pasted into a dtype mapping.
for name, dtype in X.dtypes.items():
    print(f" '{name}' : '{dtype}',  \n ")

 'CRIM' : 'float64',  
 
 'ZN' : 'float64',  
 
 'INDUS' : 'float64',  
 
 'CHAS' : 'category',  
 
 'NOX' : 'float64',  
 
 'RM' : 'float64',  
 
 'AGE' : 'float64',  
 
 'DIS' : 'float64',  
 
 'RAD' : 'category',  
 
 'TAX' : 'float64',  
 
 'PTRATIO' : 'float64',  
 
 'B' : 'float64',  
 
 'LSTAT' : 'float64',  
 
 'y' : 'float64',  
 


In [15]:
# Write the combined features + target frame to disk without the row index.
# NOTE(review): CSV carries no dtype metadata, so the 'category' dtype set on
# CHAS/RAD will presumably need re-applying when the file is read back -- confirm.
X.to_csv(path_or_buf='boston_house_prices.csv', index=False)