In [50]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#Scikit-Learn imports
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer # To fill missing values
from sklearn.preprocessing import OneHotEncoder # To turn our categorical variables into numbers
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor # Our Model/Estimator
from sklearn.model_selection import train_test_split



# Load the laptop sales records into a DataFrame
csv_path = './data/laptop_sales.csv'
data = pd.read_csv(csv_path)

In [51]:
# Preview the first five rows of the loaded data
data.head(n=5)

Unnamed: 0,Make,Colour,Usage (Hours),USB Ports,Price
0,Dell,Black,500.0,3.0,1200.0
1,HP,Silver,1200.0,4.0,950.0
2,Lenovo,Blue,800.0,3.0,1100.0
3,Apple,White,1500.0,2.0,1400.0
4,Asus,Black,300.0,4.0,850.0


In [52]:
# Count the missing entries in each column
data.isnull().sum()

Make              3
Colour            3
Usage (Hours)    10
USB Ports         6
Price            28
dtype: int64

In [53]:
# Keep only the rows that have a label in the target column ("Price").
# The result is reassigned rather than mutating the frame with inplace=True,
# so the step reads as an explicit "new frame from old frame" transformation.
data = data.dropna(subset=["Price"])

# Re-check the per-column missing counts; "Price" should now be 0
data.isna().sum()

Make              3
Colour            3
Usage (Hours)    10
USB Ports         6
Price             0
dtype: int64

In [54]:
# Labels are the "Price" column; features are every other column
y = data["Price"]
X = data.drop(columns="Price")

We've dropped the rows with no labels and split our data into X and y. Now let's create a `Pipeline()` to fill the rest of the missing values, encode them if necessary (turn them into numbers) and fit a model to them.

Let's define the categorical, port and numeric features, then build a transformer for each.

We'll do the following with the Pipeline() class:

- Categorical transformer - fill the missing categorical values with the value 'missing' and then one-hot encode them.
- Ports transformer - fill the USB Ports column missing values with the value 3.
- A numeric transformer - fill the numeric column missing values with the mean of the rest of the column.
  

In [55]:
# Column groups, one per preprocessing strategy
categorical_features = ["Make", "Colour"]
port_feature = ["USB Ports"]
numeric_features = ["Usage (Hours)"]

# Categorical columns: replace gaps with the literal 'missing', then one-hot encode
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# USB Ports column: replace every missing value with the constant 3
port_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=3)),
])

# Numeric columns: replace missing values with the column mean
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

Let's combine Pipelines with `ColumnTransformer()`

In [56]:

# Route each group of columns through its matching transformer.
# Each entry is (name, transformer_to_use, columns the transformer is applied to).
column_steps = [
    ('categorical', categorical_transformer, categorical_features),
    ('port', port_transformer, port_feature),
    ('numerical', numeric_transformer, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

Let's create a Pipeline() to preprocess and model our data with the `ColumnTransformer()` and `RandomForestRegressor()`.

In [57]:
# Chain preprocessing and the estimator so a single fit() call runs both
model = Pipeline(
    steps=[
        # fills the missing data and makes sure every feature is numeric
        ('preprocessor', preprocessor),
        # the estimator that models the preprocessed data
        ('regressor', RandomForestRegressor()),
    ]
)

Let's make our training data and test data.
Then fit the preprocessing and modelling Pipeline() on the training data.

In [58]:
# Seed NumPy's global random state first; neither the split nor the forest
# below is given a random_state of its own
np.random.seed(42)

# Split data into train and test sets, holding out 20% of the rows for testing
split_parts = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Train the whole pipeline on the training rows only
# (note: when fit() is called with a Pipeline(), fit_transform() is used for transformers)
model.fit(X_train, y_train)

Let's evaluate the model on the test data.

In [59]:
# Score the fitted pipeline on the held-out test set
# (for a regressor, scikit-learn's score() reports the R^2 coefficient of determination)
model.score(X=X_test, y=y_test)

0.9482058333233976