In [None]:
!python --version   # Python version

# About python:  https://www.python.org/
#                Python is powerful... and fast; plays well with others; runs everywhere; is friendly & easy to learn; 
#                is Open –> https://www.python.org/about/.
#     Python docs: https://docs.python.org/3/ (all documentation); 
#                  https://docs.python.org/3.7/ (Recommended version – 3.7). 
# The Python Tutorial (python3.7): https://docs.python.org/3.7/tutorial/index.html 

# Load Module ---
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns 
# NumPy : The fundamental package for scientific computing in Python. It is a Python library that provides a 
#         multidimensional array object, various derived objects (such as masked arrays and matrices), and an 
#         assortment of routines for fast operations on arrays, including mathematical, logical, shape 
#         manipulation, sorting, selecting, I/O, discrete Fourier transforms, basic linear algebra, basic 
#         statistical operations, random simulation and much more.
#     About: https://numpy.org/
#     Docs: https://numpy.org/doc/stable/
#     NumPy quickstart: https://numpy.org/doc/stable/user/quickstart.html

# Pandas: pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, 
#         built on top of the Python programming language.
#     About: https://pandas.pydata.org/ 
#     Docs: https://pandas.pydata.org/docs/ 
#     Getting started: https://pandas.pydata.org/docs/getting_started/index.html 
#     User Guide: https://pandas.pydata.org/docs/user_guide/index.html#user-guide 

# Seaborn: Seaborn is a Python data visualization library based on matplotlib. It provides a 
#          high-level interface for drawing attractive and informative statistical graphics.
#   About: https://seaborn.pydata.org/

# Report the version of each loaded library (and where pyplot was imported from),
# one "label value" line per entry, in the same order as the import statement.
for label,value in (('numpy version:',np.__version__),
                    ('pandas version: ',pd.__version__),
                    ('seaborn version:',sns.__version__),
                    ('pyplot: ',plt)):
  print(label,value)

Python 3.8.15
numpy version: 1.21.6
pandas version:  1.3.5
seaborn version: 0.11.2
pyplot:  <module 'matplotlib.pyplot' from '/usr/local/lib/python3.8/dist-packages/matplotlib/pyplot.py'>


# Toy dataset - [Boston House Price Dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html) - using sklearn.datasets.load_boston

**DEPRECATED**: load_boston is deprecated in 1.0 and will be removed in 1.2.

The Boston housing prices dataset has an ethical problem. You can refer to the documentation of this function for further details. The scikit-learn maintainers therefore strongly discourage the use of this dataset unless the purpose of the code is to study and educate about ethical issues in data science and machine learning.

Alternative - **[sklearn.datasets.fetch_california_housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html)**

Load the California housing dataset (regression).

### load dataset - California Housing Dataset

In [None]:
# import the California housing dataset loader from scikit-learn
from sklearn.datasets import fetch_california_housing

# load the dataset; the returned object is kept in `housing_dataset` and reused by the cells below
housing_dataset=fetch_california_housing()

# print the description text that ships with the dataset (its DESCR attribute)
print(housing_dataset.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [None]:
# see shape of the data (inputs / X / independent variables)
print('Shape of data:',housing_dataset.data.shape)
# rows / instances / samples - 20640, columns / features / attributes - 8
# names of the feature columns
print('Feature names:',housing_dataset.feature_names)
# see target shape (one value per sample)
print('Target shape:',housing_dataset.target.shape) 
# name of the target column (a single regression target, not class labels)
print('Label names:',housing_dataset.target_names) 

Shape of data: (20640, 8)
Feature names: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
Target shape: (20640,)
Label names: ['MedHouseVal']


### convert to pandas dataframe 

In [None]:
# build the feature table and append the regression target as a 'targets' column
# in a single expression, so the frame is never observed half-built
housing_df=(
    pd.DataFrame(housing_dataset.data,columns=housing_dataset.feature_names)
    .assign(targets=housing_dataset.target)
)
# preview the first rows
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,targets
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [None]:
# show the last rows of the frame (5 by default) to check the end of the table and the final index
housing_df.tail()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,targets
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,0.847
20639,2.3886,16.0,5.254717,1.162264,1387.0,2.616981,39.37,-121.24,0.894


In [None]:
# print the frame summary: column names, non-null counts, dtypes and memory usage
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   targets     20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [None]:
# summary statistics per column: count, mean, std, min, quartiles and max
housing_df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,targets
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


### split dataset using [sklearn.model_selection.train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

Split arrays or matrices into random train and test subsets.

Quick utility that wraps input validation and next(ShuffleSplit().split(X, y)) and application to input data into a single call for splitting (and optionally subsampling) data in a oneliner.

In [None]:
# split dataset loader 
from sklearn.model_selection import train_test_split 

# Hold out 33% of the rows for testing. The split shuffles the rows, so a fixed
# random_state is passed: without it every kernel restart produces a different
# split and every score computed from it changes from run to run.
Xtrain,Xtest,ytrain,ytest=train_test_split(housing_df.drop(columns='targets'),
                                           housing_df['targets'],
                                           test_size=0.33,
                                           random_state=42)

# see shape of outputs 
Xtrain.shape,ytrain.shape,Xtest.shape,ytest.shape

((13828, 8), (13828,), (6812, 8), (6812,))

### Regression with [Nearest Neighbors](https://scikit-learn.org/stable/modules/neighbors.html) (using [sklearn.neighbors.KNeighborsRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html))

In [None]:
# upper bound for the k search below: integer part of sqrt(number of training rows)
int(np.sqrt(Xtrain.shape[0]))

117

In [None]:
# load K Nearest Neighbors Regressor plus the tools used to standardise its inputs
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# largest k to try: integer part of sqrt(number of training rows); computed once, not per iteration
max_k=int(np.sqrt(Xtrain.shape[0]))

# test-set R^2 for every k tried; score[i] belongs to k = i + 1
score=list()

# make a loop to calculate / find best value of k 
for k in range(1,max_k+1):
  # KNN compares samples by distance, so features on large scales (e.g. Population, in the
  # thousands) would swamp small ones (e.g. AveBedrms, around 1). Standardise every feature
  # first; inside a pipeline the scaler is fitted on the training data only.
  kRegr=make_pipeline(StandardScaler(),KNeighborsRegressor(n_neighbors=k))
  # train model 
  kRegr.fit(Xtrain,ytrain)
  # calculate model score (R^2 on the held-out data)
  score.append(kRegr.score(Xtest,ytest))

# NOTE(review): k is selected on the test set, so the maximum below is an optimistic
# estimate; pick k with cross-validation on the training data for an unbiased figure.
# make print maximum score 
print('Maximum score:',max(score),'; at:',np.argmax(score)+1)
# make print minimum score 
print('Minimum score:',min(score),'; at:',np.argmin(score)+1)
# make print average score 
print('Average score:',np.mean(score))

Maximum score: 0.136178009017366 ; at: 8
Minimum score: -0.23287546610590493 ; at: 1
Average score: 0.061633801068948224


### Regression with [Support Vector Machines](https://scikit-learn.org/stable/modules/svm.html) (using [sklearn.svm.SVR](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html))

In [None]:
# load Grid Search Cross Validation
from sklearn.model_selection import GridSearchCV 
# load model Support Vector Regression 
from sklearn.svm import SVR 

# One sub-grid per kernel, listing only the hyperparameters that kernel actually uses:
#   linear  -> C
#   poly    -> degree, gamma, coef0, C
#   rbf     -> gamma, C
#   sigmoid -> gamma, coef0, C
# A single flat dictionary crosses every value with every kernel (216 candidates), although
# e.g. `degree` only affects 'poly', so most candidates are exact duplicates of each other.
# The per-kernel grids cover the same distinct models with 81 candidates.
# cv_results_ still has param_degree / param_gamma / param_coef0 columns; they are
# masked (missing) for the kernels that do not take them.
hyperparameters=[
    {'kernel':['linear'],'C':[0.1,0.5,1.0]},
    {'kernel':['poly'],'degree':[3,5,7],'gamma':['scale','auto'],
     'coef0':[0.0,0.5,1.0],'C':[0.1,0.5,1.0]},
    {'kernel':['rbf'],'gamma':['scale','auto'],'C':[0.1,0.5,1.0]},
    {'kernel':['sigmoid'],'gamma':['scale','auto'],
     'coef0':[0.0,0.5,1.0],'C':[0.1,0.5,1.0]},
]

# make grid search cv model (cv=None selects the default 5-fold split; candidates are ranked by R^2)
SVR_serach=GridSearchCV(SVR(),hyperparameters,cv=None,verbose=1,scoring='r2')
# fit every candidate on the training data
SVR_serach.fit(Xtrain,ytrain)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [None]:
# report the winning configuration found by the grid search:
# its mean cross-validated R2, the refitted estimator and the hyperparameters that produced it
print('Best R2 score:',SVR_serach.best_score_)
print('Best estimater:',SVR_serach.best_estimator_)
print('Best hyperparmeters:',SVR_serach.best_params_)

# complete cross-validation log as a table (cv_results_ is a dictionary of columns)
SVR_serach_results_df_full=pd.DataFrame(SVR_serach.cv_results_)
# keep the hyperparameter columns, the mean test score and the rank, ordered best first;
# sort_values returns a new frame, so the full table above is left untouched
SVR_serach_results_df=(
    SVR_serach_results_df_full
    .loc[:,['param_kernel','param_degree','param_gamma','param_coef0','param_C',
            'mean_test_score','rank_test_score']]
    .sort_values(by='rank_test_score')
)
# preview the top-ranked candidates
SVR_serach_results_df.head()