In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

In [2]:
df = pd.read_csv('car-data.csv', names=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'])
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
df = df.drop(columns='persons')

In [4]:
df.shape

(1728, 6)

In [5]:
df.describe()

Unnamed: 0,buying,maint,doors,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,4
top,vhigh,vhigh,2,small,low,unacc
freq,432,432,432,576,576,1210


In [6]:
# transform categorical to numeric
buy_dict = {'vhigh': 4,'high': 3,'med': 2,'low':1}
maint_dict = {'vhigh': 4,'high': 3,'med': 2,'low':1}
doors_dict = {'2': 2,'3': 3,'4': 4,'5more':5}
lug_boot_dict = {'small': 1,'med': 2,'big': 3}
safety_dict = {'low': 1,'med': 2,'high': 3}
class_dict = {'unacc': 1,'acc': 2,'good': 3,'vgood':4}
df['buying'] = df['buying'].replace(buy_dict)
df['maint'] = df['maint'].replace(maint_dict)
df['doors'] = df['doors'].replace(doors_dict)
df['lug_boot'] = df['lug_boot'].replace(lug_boot_dict)
df['safety'] = df['safety'].replace(safety_dict)
df['class'] = df['class'].replace(class_dict)

In [7]:
df.head()

Unnamed: 0,buying,maint,doors,lug_boot,safety,class
0,4,4,2,1,1,1
1,4,4,2,1,2,1
2,4,4,2,1,3,1
3,4,4,2,2,1,1
4,4,4,2,2,2,1


In [8]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='buying'), df['buying'], test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1382, 5), (346, 5), (1382,), (346,))

In [9]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
rf = RandomForestClassifier()
rf_cv = GridSearchCV(rf, param_grid=param_grid, cv=10, n_jobs=-1, scoring='accuracy')
rf_cv.fit(X_train,y_train)
print('Random Forest Accuracy:',rf_cv.best_score_)

Random Forest Accuracy: 0.3306902304243562


In [10]:
rf_cv.score(X_test, y_test)

0.36127167630057805

In [11]:
pred = rf_cv.predict([[maint_dict['high'], doors_dict['4'], lug_boot_dict['big'], safety_dict['high'], class_dict['good']]])[0]



In [12]:
buy_inv_dict = {v: k for k, v in buy_dict.items()}

In [13]:
f'Predicted buying price is {buy_inv_dict[pred]}'

'Predicted buying price is low'