In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Getting our data ready to be used with machine learning
    1. Split the data into features and labels (usually `X` & `y`)
    2. Filling (also called imputing) or disregarding missing values
    3. converting non numeric strings to numeric value (also called feature encoding)

In [2]:
cars = pd.read_csv("car-sales-extended.csv")

In [7]:
# Now we can extract our X & y
# for x remove feature and get labels
X = cars.drop("Price", axis=1)

# for x we need feature
y = cars["Price"]

In [9]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']

# instantioate one hot encoder
one_hot = OneHotEncoder()
transformer = ColumnTransformer([(
                                "one_hot",
                                 one_hot,
                                 categorical_features)],
                                 remainder="passthrough")

transformed_X = transformer.fit_transform(X)
pd.DataFrame(transformed_X)

ValueError: Input contains NaN

In [10]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

NameError: name 'transformed_X' is not defined

In [6]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.24732429525973876

# Handle missing data
1. fill them with some value (imputation)
2. remove the samples with missing data altogheter

In [11]:
# check with isna
missing_values = pd.read_csv("car-sales-extended.csv")
missing_values.isna().sum()

Make             0
Colour           3
Odometer (KM)    3
Doors            0
Price            0
dtype: int64

In [12]:
# try to fill the missing values

# fill colour
missing_values["Colour"].fillna("missing",inplace=True)
# fill Odometer
missing_values["Odometer (KM)"].fillna(missing_values["Odometer (KM)"].mean(),inplace=True)

In [17]:
# now data frame is clean
missing_values.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

# Choosing the right estimator/algorithm for our problem

scikit learn use estimator as another term for machine learning model or algorithm
1. Classification - predecting whetever a sample is one thing or another
2. Regression - predecting a number

<img src="https://scikit-learn.org/stable/_static/ml_map.png" />

### 2.1 Picking a machine learning model for regression problem

In [21]:
# loading boston house data set from sklearn
from sklearn.datasets import load_boston
boston = load_boston()
boston;

In [25]:
# make dataframe from dataset
boston_df = pd.DataFrame(boston['data'], columns=boston["feature_names"])
boston_df["target"] = pd.Series(boston["target"])

In [27]:
# lets try the ridge regression model
from sklearn.linear_model import Ridge

# setup random seed
np.random.seed(42)

# create data
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# instantiate model
model = Ridge()
model.fit(X_train, y_train)

# get score
model.score(X_test, y_test)

0.6662221670168518

##### how to get better score

In [40]:
# lets try random forest regressor
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)

# we have data trained already
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

# score
rf.score(X_test, y_test)

0.8922527442109116