In [19]:
# Load libraries
import numpy as np
import pandas as pd

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20. Fall back only for old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
# Column labels assigned to the CSV; the last one is used as the target/label
names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety','expected_output']

# pandas is imported under the alias `pd`; the bare name `pandas` is not
# defined in this cell and raised NameError on a fresh kernel.
# Passing names= without header= makes read_csv treat the first line as data.
dataset = pd.read_csv(url, names=names)

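# Unused alternative, kept for reference: split the raw array directly,
# without first separating the features from the label column.
#data = np.array(dataset)  # convert the DataFrame to a NumPy array
#x_train, x_test = train_test_split(data, test_size=0.20)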

# Dataset dimensions: every column except the last one counts as a feature
n_rows = len(dataset.index)
n_features = len(dataset.columns) - 1
print("Total number of rows: %d" % n_rows)
print("Number of features: %d" % n_features)

# Separate the column names into features (X) and the target (y)
feature_cols = dataset.columns[:-1].tolist()  # everything before the final column
target_col = dataset.columns[-1]  # the final column holds the target/label
print("Feature column(s):-\n%s" % (feature_cols,))
print("Target column: %s" % (target_col,))

# Slice the frame into the feature matrix and the label vector
X_all = dataset.loc[:, feature_cols]  # feature values for all cars
y_all = dataset.loc[:, target_col]  # corresponding targets/labels
print("\nFeature values:-")
print(X_all.head(5))  # preview the first 5 rows

# Preprocess feature columns
def preprocess_features(X):
    """Return a copy of X in which every text column is one-hot encoded.

    Columns whose dtype is ``object`` (or a dedicated string dtype) are
    replaced by dummy/indicator columns named ``<column>_<value>``; all other
    columns are carried over unchanged. Column order follows X, with each
    encoded column expanded in place.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table to encode.

    Returns
    -------
    pandas.DataFrame
        New frame sharing X's index. X itself is not modified.
    """
    pieces = []  # encoded/unchanged column(s), in original column order

    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # long-standing equivalent and yields (column name, Series) pairs.
    for col, col_data in X.items():
        # Text columns may be plain object dtype or a dedicated string dtype,
        # depending on the pandas version that loaded them.
        is_text = (col_data.dtype == object
                   or pd.api.types.is_string_dtype(col_data.dtype))
        if is_text:
            # e.g. 'safety' => 'safety_high', 'safety_low', 'safety_med'
            col_data = pd.get_dummies(col_data, prefix=col)
        pieces.append(col_data)

    if not pieces:
        # No columns at all: keep the index, as an empty frame
        return pd.DataFrame(index=X.index)

    # One concat instead of a join per column
    return pd.concat(pieces, axis=1)
# Prepare the dataset before modelling: non-numeric values need to be converted
X_all = preprocess_features(X_all)
print( "Processed feature columns ({}):-\n{}".format(len(X_all.columns), list(X_all.columns)))

# Select features (X) and corresponding labels (y) for the training and test sets.
# train_test_split (imported at the top of this cell) shuffles the rows, which
# avoids any bias due to ordering in the dataset; the fixed seed makes the
# partition identical on every Restart & Run All.
RANDOM_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, train_size=0.75, random_state=RANDOM_SEED)

print ("Training set: {} samples".format(X_train.shape[0]))
print ("Test set: {} samples".format(X_test.shape[0]))
# Note: Validation set can be extracted from training data


Total number of rows: 1728
Number of features: 6
Feature column(s):-
['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
Target column: expected_output

Feature values:-
  buying  maint doors persons lug_boot safety
0  vhigh  vhigh     2       2    small    low
1  vhigh  vhigh     2       2    small    med
2  vhigh  vhigh     2       2    small   high
3  vhigh  vhigh     2       2      med    low
4  vhigh  vhigh     2       2      med    med
Processed feature columns (21):-
['buying_high', 'buying_low', 'buying_med', 'buying_vhigh', 'maint_high', 'maint_low', 'maint_med', 'maint_vhigh', 'doors_2', 'doors_3', 'doors_4', 'doors_5more', 'persons_2', 'persons_4', 'persons_more', 'lug_boot_big', 'lug_boot_med', 'lug_boot_small', 'safety_high', 'safety_low', 'safety_med']
Training set: 1296 samples
Test set: 432 samples
