In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.request 
%matplotlib inline
from sklearn.impute import *
from matplotlib.pyplot import axis
from sklearn.impute import *
from sklearn.datasets import *
from sklearn.linear_model import *
from sklearn.model_selection import *

# Features and targets of scikit-learn's built-in California housing dataset.
# NOTE(review): X and y are not referenced again in the visible cells; the
# analysis below works from the CSV at `data_path` instead -- confirm whether
# this fetch is still needed.
X, y = fetch_california_housing(return_X_y=True)

# Location of the housing CSV that is passed to download_read_data() below.
data_path = 'https://raw.githubusercontent.com/nyandwi/public_datasets/master/housing.csv'

def download_read_data(path):
    """
    Load a CSV dataset into a pandas DataFrame.

    Parameters
    ----------
    path : str
        URL or local filesystem path of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    # pd.read_csv fetches URLs by itself. The earlier urlretrieve() call
    # downloaded the same file a second time into a temporary file whose path
    # was stored in an unused local, and it rejected plain local paths.
    return pd.read_csv(path)

# Load the housing CSV and hold out 10% of the rows as a test set.
# random_state is fixed so the split is reproducible across runs.
cal_data = download_read_data(data_path)
train_data, test_data = train_test_split(cal_data, test_size=0.1, random_state=20)
cal_train = train_data.copy()

# Correlate the numeric columns only: 'ocean_proximity' holds strings, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
correlation = train_data.select_dtypes(include='number').corr()
correlation['median_house_value']

# Separate the model inputs from the target column.
training_input_data = train_data.drop('median_house_value', axis=1)
training_labels = train_data['median_house_value']


In [18]:
# Categorical features are features whose values are categories. An example in our dataset is ocean_proximity, which has the following values:
training_input_data['ocean_proximity'].value_counts()

<1H OCEAN     8231
INLAND        5896
NEAR OCEAN    2384
NEAR BAY      2061
ISLAND           4
Name: ocean_proximity, dtype: int64

In [19]:
# We have 5 categories: <1H OCEAN, INLAND, NEAR OCEAN, NEAR BAY, ISLAND
# We will be looking at 3 techniques: simple Python mapping, ordinal encoding, and one hot encoding.
# Mapping
# Mapping is simple: we create a dictionary of categorical values and their corresponding numbers, and then map it onto the categorical feature.
# NOTE(review): this cell only selects the column and counts its values; no mapping dictionary is actually applied here.
cat_feats = training_input_data['ocean_proximity']
cat_feats.value_counts()

<1H OCEAN     8231
INLAND        5896
NEAR OCEAN    2384
NEAR BAY      2061
ISLAND           4
Name: ocean_proximity, dtype: int64

In [20]:
#Normalizing numerical features
from sklearn.preprocessing import *
# Numerical features are every input column except the categorical one.
num_feats = training_input_data.drop(columns='ocean_proximity')

# Fit a min-max scaler on those columns and rescale them in the same step.
scaler = MinMaxScaler()
num_scaled = scaler.fit_transform(num_feats)
num_scaled

array([[0.62271805, 0.13390011, 0.78431373, ..., 0.03676084, 0.04555172,
        0.19157667],
       [0.29513185, 0.4218916 , 0.58823529, ..., 0.05129013, 0.05558296,
        0.18772155],
       [0.18965517, 0.53666312, 0.54901961, ..., 0.09736372, 0.1516198 ,
        0.3378712 ],
       ...,
       [0.74340771, 0.02763018, 0.58823529, ..., 0.03942163, 0.07383654,
        0.27298934],
       [0.61663286, 0.16578108, 0.78431373, ..., 0.04764906, 0.11609933,
        0.36398808],
       [0.19269777, 0.55791711, 0.88235294, ..., 0.02653783, 0.07432988,
        0.22199004]])

In [21]:
#Creating a numerical features pipeline
from sklearn.pipeline import *
# A short pipeline is enough to take the numerical features, fill missing
# values with the column mean, and then standardize the result.
num_feats_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
num_feats_preprocessed = num_feats_pipe.fit_transform(num_feats)
num_feats_preprocessed

array([[ 0.67858615, -0.85796668,  0.97899282, ..., -0.33416821,
        -0.58313172, -0.31168387],
       [-0.93598814,  0.41242353,  0.18557502, ...,  0.04124236,
        -0.42237836, -0.34110223],
       [-1.45585107,  0.9187045 ,  0.02689146, ...,  1.23170093,
         1.11663747,  0.80468775],
       ...,
       [ 1.27342931, -1.32674535,  0.18557502, ..., -0.26541833,
        -0.12985994,  0.30957512],
       [ 0.64859406, -0.71733307,  0.97899282, ..., -0.05283644,
         0.54741244,  1.00398532],
       [-1.44085502,  1.01246024,  1.37570172, ..., -0.59831252,
        -0.12195403, -0.07959982]])

In [25]:
#One Hot Encoding
#One hot encoding is most preferred when the categories are not in any order and that is exactly how our categorical feature is
#This is what I mean by unordered categories: if you have 3 cities and encode them with the numbers (1, 2, 3) respectively, a machine learning model may learn that city 1 is closer to city 2 than to city 3, even though no such ordering exists.
#On the flip side, if you have the feature of ordered ranges like low, medium, and high, then numbers can be an effective way because you want to keep the sequence of these ranges.
#In our case, the ocean proximity feature is not in any order. By using one hot encoding, the categories will be converted into a binary representation (1s or 0s), and the original categorical feature will be split into several features, one per category.
from sklearn.preprocessing import OneHotEncoder

def one_hot(input_data):
  """Fit a fresh OneHotEncoder on `input_data` and return the encoding as a dense array."""
  encoded = OneHotEncoder().fit_transform(input_data)
  # The encoder hands back a sparse matrix; convert it to a regular array
  # before returning.
  return encoded.toarray()
# Double brackets keep the selection as a one-column DataFrame rather than a
# Series; this rebinds `cat_feats`, which an earlier cell assigned as a Series.
cat_feats = training_input_data[['ocean_proximity']]
cat_feats_hot = one_hot(cat_feats)
cat_feats_hot

array([[0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.]])

In [26]:
# Pipeline for transforming categorical features
# Encoding is the only categorical step for now, but wrapping it in a pipeline keeps it
# consistent with the numerical features, and any further preprocessing of the
# categorical features can simply be added as extra steps.
cat_feats_pipe = Pipeline(steps=[('encoder', OneHotEncoder())])
cat_feats_preprocessed = cat_feats_pipe.fit_transform(cat_feats)
# As we saw earlier, the output of one hot encoding is a sparse matrix.

In [27]:
from sklearn.compose import *
# The transformer needs the column names of each feature group as lists.
num_list = list(num_feats.columns)
cat_list = list(cat_feats.columns)

# Send the numerical columns through the numerical pipeline and the
# categorical column through the categorical pipeline, then fit and transform
# the training inputs in one call.
final_pip = ColumnTransformer(transformers=[
    ('num', num_feats_pipe, num_list),
    ('cat', cat_feats_pipe, cat_list),
])
training_data_preprocessed = final_pip.fit_transform(training_input_data)
training_data_preprocessed

array([[ 0.67858615, -0.85796668,  0.97899282, ...,  0.        ,
         0.        ,  1.        ],
       [-0.93598814,  0.41242353,  0.18557502, ...,  0.        ,
         0.        ,  0.        ],
       [-1.45585107,  0.9187045 ,  0.02689146, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.27342931, -1.32674535,  0.18557502, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.64859406, -0.71733307,  0.97899282, ...,  0.        ,
         0.        ,  0.        ],
       [-1.44085502,  1.01246024,  1.37570172, ...,  0.        ,
         1.        ,  0.        ]])

In [28]:
# We will use the Linear Regression model that is available in sklearn.
from sklearn.linear_model import LinearRegression

# Create the model with its default settings; it is fitted in the next cell.
reg_model = LinearRegression()

In [29]:
# After creating the model, we fit it on the preprocessed training inputs and their output labels.
reg_model.fit(training_data_preprocessed, training_labels)

In [30]:
# Model training is only 5% or so of a whole ML project, the rest being data processing.
# Great, that was fast! The model is now trained on the training set.
# Let's go a little deeper.
# Weights and bias? These are the two kinds of parameters of any typical ML model. It is possible to access the model parameters; here is how.
# Coef (coefficients) are referred to as weights.
# NOTE(review): in the recorded output the last five coefficients (the one hot
# columns) are all ~1.86e17 and the intercept is ~-1.86e17. This is consistent
# with the five one hot columns summing to 1 in every row, which makes them
# collinear with the intercept, so the individual values are not meaningful.
# Consider OneHotEncoder(drop='first') -- confirm first, since it changes the
# number of features.
reg_model.coef_

array([-5.57624687e+04, -5.70484452e+04,  1.31317374e+04, -7.80834727e+03,
        2.77116096e+04, -5.07070403e+04,  3.55449252e+04,  7.27323876e+04,
        1.85839106e+17,  1.85839106e+17,  1.85839106e+17,  1.85839106e+17,
        1.85839106e+17])

In [31]:
# The intercept is the model's counterpart of the bias term.
reg_model.intercept_

-1.8583910608472304e+17

In [None]:
#One thing to note is that each coefficient corresponds to one feature, and the bias term is added as a constant.
#These are what make up a model, and in our case it is a linear equation, hence a linear model:
#model = Coeff 0 * Feature 0 + Coeff 1 * Feature 1 + ... + Coeff 12 * Feature 12 + bias
#The simplest form of a linear equation is y = ax + b, where a stands for the coefficient/weight and b stands for the intercept/bias.
