In [13]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [14]:
car_sales_extended = pd.read_csv("car-sales-extended.csv", sep='\t',header = 0)

In [15]:
car_sales_extended.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [16]:
## Check the length 
len(car_sales_extended)

1000

In [17]:
## Checking the data types 
car_sales_extended.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [18]:
car_sales_extended.head(1)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323


In [19]:
#Splitting the data into X and y
X = car_sales_extended.drop(['Price'] , axis = 1)
y = car_sales_extended['Price']

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train,X_test,y_train , y_test = train_test_split(X , y , test_size=0.2)

In [22]:
# Choosing the type of machine learning algorithm to use 
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()

In [23]:
## Turning the categories or categorical numbers  into numbers to enhance better machine learning modelling 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make' , 'Colour' , 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot' , one_hot , categorical_features)],remainder='passthrough')

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [24]:
#Shwoing all the data is numerical 
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


## Use pandas get dummies later 

In [25]:
#Training the data using X_train , y_train 
# Retraining the data 
X_train , X_test , y_train , y_test = train_test_split(transformed_X , y , test_size= 0.2)

In [26]:
#Fitting the model 
clf.fit(X_train , y_train)

RandomForestRegressor()

In [27]:
clf.score(X_test , y_test)

0.2893672381211829

## What if the some important data was missing 


In [28]:
car_missing = pd.read_csv("car-sales-extended-missing-data.csv" , sep = '\t' , header = 0)

In [29]:
car_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [30]:
car_missing.tail(6)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
994,BMW,Blue,163322.0,3.0,31666.0
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0
999,Toyota,Blue,248360.0,4.0,12732.0


In [31]:
#finding how many Nan cells are present 
car_missing.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [32]:
car_missing.head(2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0


In [33]:
#Splitting the data into X and y 
X = car_missing.drop(['Price'] , axis = 1)
y = car_missing["Price"]

## Because our data contains strings we have to convert it to numbers using enconders  

## But before that lets Fill in the missing values in Make , Doors , Odometer , color 

In [34]:
car_missing.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [35]:
#fill the missing values using fillna()
#filling the Make column 
car_missing.Make.fillna("missing" , inplace=True)

#Filling the Colour colum 
car_missing.Colour.fillna("missing" , inplace=True)

#filling the Doors Column
car_missing.Doors.fillna(4, inplace=True)

#filling the Odometer column
car_missing['Odometer (KM)'].fillna(car_missing['Odometer (KM)'].mean(), inplace=True)





In [36]:
car_missing.head(3)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0


In [37]:
#Dropna is for dropping any column that has an empty spot or NAn
car_missing.dropna(inplace=True)

In [38]:
car_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

## Converting all our data to numerical values using encoders after filling the missing values 

In [39]:
car_missing.head(1)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0


In [40]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categories_features = ['Make' , 'Doors' , 'Colour']
One_hot = OneHotEncoder()
Transformer = ColumnTransformer([('One_hot' , One_hot , categories_features)] , remainder='passthrough')

Transformed_X = Transformer.fit_transform(car_missing)
Transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [41]:
pd.DataFrame(Transformed_X).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35431.0,15323.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,192714.0,19943.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,84714.0,28343.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,154365.0,13434.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,181577.0,14043.0


In [42]:
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# X_train , X_test , y_train , y_test = train_test_split(X ,y , test_size= 0.2)

In [43]:
car_missing = pd.read_csv("car-sales-extended-missing-data.csv" , sep ='\t' , header=0)

In [44]:
car_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [45]:
car_missing.tail()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0
999,Toyota,Blue,248360.0,4.0,12732.0


In [46]:
car_missing.columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [47]:
car_missing.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [48]:
car_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [49]:
## Removing the price column 
car_missing.dropna(subset = ['Price'] , inplace=True)

In [50]:
car_missing.isnull().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

## Will impute the missing values with Scikit Learn 


In [51]:
# Splitting the data into X and y
X = car_missing.drop(['Price'] , axis=1)
y = car_missing['Price']

In [52]:
car_missing.head(2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0


In [53]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

#Categories
categories_imputer = SimpleImputer(strategy='constant' , fill_value='missing')
#Doors mostly have 4 doors 
doors_imputer = SimpleImputer(strategy='constant' , fill_value=4)

#Numerical missing columns are mostly filled with mean 
num_imputer = SimpleImputer(strategy= 'mean')

#Defining the list 
categories = ['Make' , 'Colour']

doors = ['Doors']

num = ['Odometer (KM)']

## Create an imputer called Imputer 
Imputer  = ColumnTransformer([
    ('categories_imputer' , categories_imputer , categories) ,
                             ('doors_imputer'  , doors_imputer  , doors),
                             ('num_imputer' , num_imputer , num)
                            ])

filled_X = Imputer.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [54]:
car_filled_data = pd.DataFrame(filled_X , columns=['Make' , 'Doors' ,'Colour', 'Odometer (KM)'])
car_filled_data

Unnamed: 0,Make,Doors,Colour,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0
...,...,...,...,...
945,Toyota,Black,4.0,35820.0
946,missing,White,3.0,155144.0
947,Nissan,Blue,4.0,66604.0
948,Honda,White,4.0,215883.0


## SO now applying now into numbers 

In [55]:
car_filled_data.head(1)

Unnamed: 0,Make,Doors,Colour,Odometer (KM)
0,Honda,White,4.0,35431.0


In [56]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

One_hot = OneHotEncoder()
categories_encode = ['Make' , 'Doors' , 'Colour']
Transformer_encode = ColumnTransformer([
    ('One_hot' , One_hot , categories_encode)  
] , remainder='passthrough')

transformed_en = Transformer_encode.fit_transform(car_filled_data)
transformed_en
                                                  


<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

## Now weve got the data filled using sklearn imputer and converted into numbers using column transformer and OneHot encode from sklearn

In [57]:
transformed_en

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [58]:
pd.DataFrame(transformed_en).head()

Unnamed: 0,0
0,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
1,"(0, 0)\t1.0\n (0, 6)\t1.0\n (0, 13)\t1.0\n..."
2,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
3,"(0, 3)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
4,"(0, 2)\t1.0\n (0, 6)\t1.0\n (0, 11)\t1.0\n..."


In [59]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train , X_test , y_train,y_test = train_test_split(transformed_en ,y , test_size=0.2)

In [60]:
clf = RandomForestRegressor()
clf.fit(X_train , y_train)

RandomForestRegressor()

In [61]:
print("The Coefficient of Determination of the model is {}%".format(clf.score(X_test , y_test) * 100))

The Coefficient of Determination of the model is 18.443193871652653%


## To Determine which estimator(Algorithm) to use to predict 

* Always follow the map to determine the right estimator to use
* Classification  = Prediciting whether its yes or no 
* Regression = Predicting a  number 

## loading the load_boston dataset from sklearn

##    from sklearn.datasets import fetch_california_housing
##        housing = fetch_california_housing()

In [62]:
from sklearn.datasets import load_boston

In [87]:
boston = load_boston()
boston;


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [64]:
#When you import a data from sklearn make the name('data') into a dataframe and columns = feature_names
boston_df = pd.DataFrame(boston['data'] , columns=boston['feature_names'])
boston_df['target'] = pd.Series(boston['target'])

In [65]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [66]:
X = boston_df.drop('target' , axis=1)
y =  boston['target']

In [67]:
# from sklearn.linear_model import Ridge
# from sklearn.model_selection import train_test_split

# X_train , X_test , y_train , y_test = train_test_split(X , y)

# model = Ridge()

# model.fit(X_train , y_train)

# model.score(X_test , y_test)



In [69]:
np.random.seed(42)
## Using a different algorithm or estimator to work on the same data 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

rfm = RandomForestRegressor()

X_train , X_test , y_train , y_test = train_test_split(X ,y)

rfm.fit(X_train , y_train)
rfm.score(X_test , y_test)

0.8471696005277883

In [70]:
y_pred = rfm.predict(X_test)

In [71]:
y_pred[:10]

array([23.063, 30.617, 16.498, 23.671, 16.499, 21.311, 19.515, 15.874,
       21.201, 21.18 ])

In [72]:
y_test[:10]

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8])

In [73]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_pred , y_test)

2.123362204724411

## Predicting the `predict.proba` and `predict`

In [78]:
#Predicting using Score
rfm.score(X_test , y_test)

0.8471696005277883

In [82]:
pred  = rfm.predict(X_test[:10])

In [83]:
y_test[:10]

array([23.6, 32.4, 13.6, 22.8, 16.1, 20. , 17.8, 14. , 19.6, 16.8])

In [85]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test , y_pred)

2.123362204724411