In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from pandas.api.types import is_string_dtype, is_numeric_dtype
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import math

In [2]:
df_raw = pd.read_csv('../input/Automobile_data.csv', low_memory=False)

In [3]:
df_raw.shape

(205, 26)

In [4]:
df_raw[:2]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500


In [5]:
# df_raw.columns = ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors',\
#                  'body-style','drive-wheels','engine-location','wheel-base','length','width',\
#                  'height','curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system',\
#                  'bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg',\
#                  'price']

In [6]:
df_raw[:5]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [7]:
# convert '?' to None
df_raw = df_raw.replace('?', np.nan)

Take a look at data type of each columns

In [8]:
# Extract all string-type columns
cols_str = []
for col in df_raw:
    if is_string_dtype(df_raw[col]):
        cols_str.append(col)
print(cols_str)

['normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']


In [9]:
# convert following columns to continuous variables based on data description
# normalized-losses, bore, stroke, horsepower, peak-rpm, price
cols = ["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"]
for col in cols:
    df_raw[col] = pd.to_numeric(df_raw[col], errors='raise')

Now, let us make all string type variables to categorical variables.

In [10]:
for col in df_raw:
    if is_string_dtype(df_raw[col]):
        df_raw[col] = df_raw[col].astype('category').cat.as_ordered()

Handle missing values: Basically speaking, we don't need to do anything for categorical variables, because pandas automatically convert NA to -1 for categorical variables. For continuous variables, we need to replace NA with mean or median. And create a col_NA column to indicate which row has NAs.

In [11]:
for col in df_raw:
    if is_numeric_dtype(df_raw[col]):
        col_vals = df_raw[col]
        if sum(col_vals.isnull()) != 0:
            df_raw[col+'_na'] = col_vals.isnull()
            df_raw[col] = col_vals.fillna(col_vals.median())

Convert categorical variables to their numberic representations.

In [12]:
for col in df_raw:
    if str(df_raw[col].dtype) == "category":
        df_raw[col] = df_raw[col].cat.codes + 1

## Predict continuous price

In [13]:
df_raw.shape

(205, 32)

In [14]:
X = df_raw.drop('price', axis=1)
y = df_raw['price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 99)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((164, 31), (41, 31), (164,), (41,))

In [15]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [16]:
def rmse(preds, actuals):
    return math.sqrt(((preds-actuals)**2).mean())

In [17]:
[rmse(m.predict(X_train), y_train),rmse(m.predict(X_val), y_val),m.score(X_train, y_train), m.score(X_val, y_val)]

[1494.830748153449, 2906.50703831018, 0.96557741789529383, 0.81923739281971197]

## Predict categorical symboling

In [18]:
X = df_raw.drop("symboling", axis=1)
y = df_raw["symboling"].astype('category')
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 99)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((164, 31), (41, 31), (164,), (41,))

In [19]:
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
print(m.score(X_train, y_train))
print(m.score(X_val, y_val))

0.981707317073
0.756097560976


### Parameter Tuning

In [21]:
# Tune three parameters: n_estimators, min_samepls_leaf, and max_features
# It might take some 
numOfestimators = [1,5,10,15,20,25,30]
numOfleafs = [1, 3, 5, 10, 25]
numOffeatures = np.arange(0.1, 1.1, 0.1)
best_result = []
for numOfestimator in numOfestimators:
    for numOfleaf in numOfleafs:
        for numOffeature in numOffeatures:  
            result = [numOfestimator, numOfleaf, numOffeature]
            m = RandomForestClassifier(n_jobs=-1, n_estimators=numOfestimator,\
                                    min_samples_leaf=numOfleaf,\
                                    max_features=numOffeature)
            # print(result)
            m.fit(X_train, y_train)
            result.append(m.score(X_train, y_train))
            result.append(m.score(X_val, y_val))
            if len(best_result) == 0: best_result = result
            elif best_result[4] < result[4]: 
                print(result)
                best_result = result
print(best_result)

[1, 1, 0.20000000000000001, 0.8902439024390244, 0.68292682926829273]
[1, 1, 0.40000000000000002, 0.88414634146341464, 0.70731707317073167]
[1, 1, 0.5, 0.92682926829268297, 0.75609756097560976]
[5, 1, 0.5, 0.98780487804878048, 0.78048780487804881]
[10, 3, 0.5, 0.93292682926829273, 0.80487804878048785]
[10, 3, 0.70000000000000007, 0.93902439024390238, 0.82926829268292679]
[10, 3, 0.70000000000000007, 0.93902439024390238, 0.82926829268292679]
