In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, mean_absolute_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split as tts
from sklearn.ensemble import ExtraTreesRegressor as ETR


Load the data

In [None]:
# Read the comma-separated traffic log into a list of per-row field lists.
data = []
with open('data/traffic_data.txt', 'r') as file:
    for line in file:
        # Skip empty/whitespace-only lines: they would yield a one-field row
        # that does not match the five expected columns.
        if not line.strip():
            continue
        # Strip only the line terminator. Slicing with [:-1] would chop the last
        # digit of the vehicle count when the final line has no trailing newline.
        words = line.rstrip('\n').split(',')
        data.append(words)

df = pd.DataFrame(
    data,
    columns=['Day of the week', 'time of the day', 'Opponent team', 'IsDodgersGame', 'Number of vehicles'],
)
# Every field is read as text; the target column must be numeric for regression.
df = df.astype({'Number of vehicles': 'int32'})
df.head()

Unnamed: 0,Day of the week,time of the day,Opponent team,IsDodgersGame,Number of vehicles
0,Tuesday,00:00,San Francisco,no,3
1,Tuesday,00:05,San Francisco,no,8
2,Tuesday,00:10,San Francisco,no,10
3,Tuesday,00:15,San Francisco,no,6
4,Tuesday,00:20,San Francisco,no,1


Convert Categorical features into numerical

In [None]:
# Columns still holding text (object dtype) are the categorical features.
cat_columns = list(df.select_dtypes(include='object').columns)

# Fit one LabelEncoder per categorical column, kept in column order so the
# fitted encoders can be reused later to encode new data points.
encoders = []
for column_name in cat_columns:
    column_encoder = preprocessing.LabelEncoder()
    df[column_name] = column_encoder.fit_transform(df[column_name])
    encoders.append(column_encoder)

df.head()

Unnamed: 0,Day of the week,time of the day,Opponent team,IsDodgersGame,Number of vehicles
0,5,0,13,0,3
1,5,1,13,0,8
2,5,2,13,0,10
3,5,3,13,0,6
4,5,4,13,0,1


Separate Independent and Dependent variables

In [None]:
# Every column except the last is a feature; the last column is the target.
feature_frame = df.iloc[:, :-1]
target_column = df.iloc[:, -1]

X = feature_frame.values
Y = target_column.values

Split Data into training and testing

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = tts(X, Y, test_size=0.25, random_state=5)
x_train, x_test, y_train, y_test = split_parts

Build and train the model

In [None]:
# Hyper-parameters for the extra-trees regressor; random_state pins the tree randomness.
params = dict(n_estimators=100, max_depth=4, random_state=0)

# Construct the regressor and fit it on the training split.
model = ETR(**params)
model.fit(x_train, y_train)

Check model performance on Test data

In [None]:
# Predict vehicle counts for the held-out rows.
y_pred = model.predict(x_test)

# mean_absolute_error is an error expressed in the target's units (vehicles):
# lower is better, so it must not be reported as an "accuracy".
print('Mean absolute error: {:.2f}'.format(mean_absolute_error(y_test, y_pred)))

# For a regressor, score() is the coefficient of determination R^2.
print('R^2 score: {:.3f}'.format(model.score(x_test, y_test)))

Accuracy :7.42
Score:0.516


Using the model to compute predictions

In [None]:
# Raw feature values, listed in the same column order the encoders were fitted in.
test_datapoint = ['Saturday', '10:20', 'Atlanta', 'no']

# zip() stops at the shorter input, so check the lengths explicitly instead of
# silently dropping a feature.
assert len(test_datapoint) == len(encoders), 'expected one value per encoded feature'

# Map each raw value to the integer label learned by its column's encoder.
# transform() raises ValueError for a value the encoder never saw during fit.
test_datapoint_encoded = [
    encoder.transform([value])[0]
    for encoder, value in zip(encoders, test_datapoint)
]
# predict() expects a 2-D array; -1 infers the feature count instead of hard-coding it.
test_datapoint_encoded = np.array(test_datapoint_encoded).reshape(1, -1)

#predict number of cars
pred = model.predict(test_datapoint_encoded)[0]
pred

26.039223547000137