### Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Importing the dataset

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60
...,...,...,...,...,...,...
848830,848830,1991-09-30 11:40:00,2,3,NB,54
848831,848831,1991-09-30 11:40:00,2,3,NE,28
848832,848832,1991-09-30 11:40:00,2,3,SB,68
848833,848833,1991-09-30 11:40:00,2,3,SW,17


### Lets see if our dataset has any null values

In [4]:
df.isnull().sum()

row_id        0
time          0
x             0
y             0
direction     0
congestion    0
dtype: int64

- we can see our dataset has no null values so we can proceed forward 

### Taking a closer look at our data

In [5]:
# first 5 rows

df.head()

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60


In [6]:
# Columns, Number of Non-Null values per column and Data Type.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 848835 entries, 0 to 848834
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   row_id      848835 non-null  int64 
 1   time        848835 non-null  object
 2   x           848835 non-null  int64 
 3   y           848835 non-null  int64 
 4   direction   848835 non-null  object
 5   congestion  848835 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 38.9+ MB


In [7]:
# Number of non-empty values, Mean, Standard Deviation, min, max, 25/50/75 percentiles.

df.describe()

Unnamed: 0,row_id,x,y,congestion
count,848835.0,848835.0,848835.0,848835.0
mean,424417.0,1.138462,1.630769,47.815305
std,245037.70221,0.801478,1.089379,16.799392
min,0.0,0.0,0.0,0.0
25%,212208.5,0.0,1.0,35.0
50%,424417.0,1.0,2.0,47.0
75%,636625.5,2.0,3.0,60.0
max,848834.0,2.0,3.0,100.0


In [8]:
# Types of column we have for the data

df.columns

Index(['row_id', 'time', 'x', 'y', 'direction', 'congestion'], dtype='object')

### The different features of our Dataset are:
- Row id - index number
- Time - date and time stamp of the vehicle
- x & y - Lanes
- Congestion - Number of vehicles

### Feature Engineering

In [9]:
df_copy1 = df

In [10]:
df_copy1

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60
...,...,...,...,...,...,...
848830,848830,1991-09-30 11:40:00,2,3,NB,54
848831,848831,1991-09-30 11:40:00,2,3,NE,28
848832,848832,1991-09-30 11:40:00,2,3,SB,68
848833,848833,1991-09-30 11:40:00,2,3,SW,17


In [11]:
df_copy1["time"]= pd.to_datetime(df_copy1["time"])

In [12]:
df_copy1

Unnamed: 0,row_id,time,x,y,direction,congestion
0,0,1991-04-01 00:00:00,0,0,EB,70
1,1,1991-04-01 00:00:00,0,0,NB,49
2,2,1991-04-01 00:00:00,0,0,SB,24
3,3,1991-04-01 00:00:00,0,1,EB,18
4,4,1991-04-01 00:00:00,0,1,NB,60
...,...,...,...,...,...,...
848830,848830,1991-09-30 11:40:00,2,3,NB,54
848831,848831,1991-09-30 11:40:00,2,3,NE,28
848832,848832,1991-09-30 11:40:00,2,3,SB,68
848833,848833,1991-09-30 11:40:00,2,3,SW,17


In [13]:
# dropping row_id column

df_copy1 = df_copy1.drop(["row_id"], axis=1)

In [14]:
# Adding features using time column

df_copy1["Year"]= df_copy1['time'].dt.year
df_copy1["Month"]= df_copy1['time'].dt.month
df_copy1["Date_no"]= df_copy1['time'].dt.day
df_copy1["Hour"]= df_copy1['time'].dt.hour
df_copy1["Day"]= df_copy1.time.dt.strftime("%A")

In [15]:
df_copy1.head()

Unnamed: 0,time,x,y,direction,congestion,Year,Month,Date_no,Hour,Day
0,1991-04-01,0,0,EB,70,1991,4,1,0,Monday
1,1991-04-01,0,0,NB,49,1991,4,1,0,Monday
2,1991-04-01,0,0,SB,24,1991,4,1,0,Monday
3,1991-04-01,0,1,EB,18,1991,4,1,0,Monday
4,1991-04-01,0,1,NB,60,1991,4,1,0,Monday


### Encoding the dataset

In [16]:
x = df_copy1
x = x.drop(columns=['time', 'congestion'])

In [17]:
x

Unnamed: 0,x,y,direction,Year,Month,Date_no,Hour,Day
0,0,0,EB,1991,4,1,0,Monday
1,0,0,NB,1991,4,1,0,Monday
2,0,0,SB,1991,4,1,0,Monday
3,0,1,EB,1991,4,1,0,Monday
4,0,1,NB,1991,4,1,0,Monday
...,...,...,...,...,...,...,...,...
848830,2,3,NB,1991,9,30,11,Monday
848831,2,3,NE,1991,9,30,11,Monday
848832,2,3,SB,1991,9,30,11,Monday
848833,2,3,SW,1991,9,30,11,Monday


In [18]:
y = df_copy1.iloc[:,4]

In [19]:
y

0         70
1         49
2         24
3         18
4         60
          ..
848830    54
848831    28
848832    68
848833    17
848834    24
Name: congestion, Length: 848835, dtype: int64

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
x['direction'] = LabelEncoder().fit_transform(x['direction'])
x['Day'] = LabelEncoder().fit_transform(x['Day'])

In [22]:
x

Unnamed: 0,x,y,direction,Year,Month,Date_no,Hour,Day
0,0,0,0,1991,4,1,0,1
1,0,0,1,1991,4,1,0,1
2,0,0,4,1991,4,1,0,1
3,0,1,0,1991,4,1,0,1
4,0,1,1,1991,4,1,0,1
...,...,...,...,...,...,...,...,...
848830,2,3,1,1991,9,30,11,1
848831,2,3,2,1991,9,30,11,1
848832,2,3,4,1991,9,30,11,1
848833,2,3,6,1991,9,30,11,1


In [23]:
# Making copies of datasets for different algorithms

x_svm = x.values
y_svm = y.values
x_dt = x.values
y_dt = y.values
x_rf = x.values
y_rf = y.values

# Support Vector Machines

In [24]:
# splitting the dataset into training and test sets

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_svm, y_svm, test_size = 0.25, random_state = 0)

In [25]:
print(x_train)

[[ 1  0  7 ... 22  0  1]
 [ 1  3  4 ... 24 21  6]
 [ 0  2  1 ... 25 22  5]
 ...
 [ 2  0  7 ... 26  7  0]
 [ 0  1  1 ...  3  9  6]
 [ 0  3  7 ...  5 13  6]]


In [26]:
print(y_train)

[38 64 52 ... 65 70 37]


In [27]:
print(x_test)

[[ 2  2  4 ... 27  2  2]
 [ 2  2  5 ... 22 18  1]
 [ 1  2  2 ... 23 17  5]
 ...
 [ 1  0  7 ... 28  1  5]
 [ 1  0  7 ... 24  7  2]
 [ 0  3  6 ... 28  7  2]]


In [28]:
print(y_test)

[33 60 57 ... 45 39 28]


### Feature Scaling

In [29]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [30]:
print(x_train)

[[-0.17210263 -1.49843972  1.49192382 ...  0.71769877 -1.65712078
  -0.99587567]
 [-0.17210263  1.25582671  0.30990541 ...  0.94519522  1.37498494
   1.50282116]
 [-1.41976828  0.3377379  -0.87211299 ...  1.05894345  1.51937093
   1.00308179]
 ...
 [ 1.07556302 -1.49843972  1.49192382 ...  1.17269168 -0.64641887
  -1.49561503]
 [-1.41976828 -0.58035091 -0.87211299 ... -1.44351754 -0.3576469
   1.50282116]
 [-1.41976828  1.25582671  1.49192382 ... -1.21602109  0.21989705
   1.50282116]]


In [31]:
print(x_test)

[[ 1.07556302  0.3377379   0.30990541 ...  1.28643991 -1.36834881
  -0.4961363 ]
 [ 1.07556302  0.3377379   0.70391155 ...  0.71769877  0.94182698
  -0.99587567]
 [-0.17210263  0.3377379  -0.47810686 ...  0.831447    0.79744099
   1.00308179]
 ...
 [-0.17210263 -1.49843972  1.49192382 ...  1.40018813 -1.5127348
   1.00308179]
 [-0.17210263 -1.49843972  1.49192382 ...  0.94519522 -0.64641887
  -0.4961363 ]
 [-1.41976828  1.25582671  1.09791768 ...  1.40018813 -0.64641887
  -0.4961363 ]]


### Training SVM model on training set

In [82]:
# from sklearn.svm import SVR
# classifier = SVC(kernel='linear',random_state=0)
# classifier.fit(x_train, y_train)

from sklearn.svm import SVR
regressor = SVR(kernel='rbf',max_iter = 500)
regressor.fit(x_train, y_train)



SVR(max_iter=500)

### Checking Mean Squared Error and Accuracy score

In [83]:
from sklearn.metrics import mean_squared_error
y_pred = regressor.predict(x_test)
mean_squared_error(y_test,y_pred)

283.69648761127536

In [84]:
regressor.score(x_test,y_test)

-0.0015565538699444659

# Random Forest Model

In [52]:
# Splitting the dataset into the Training set and Test set

x_train, x_test, y_train, y_test = train_test_split(x_rf, y_rf, test_size = 0.25, random_state = 0)

In [53]:
print(x_train)

[[ 1  0  7 ... 22  0  1]
 [ 1  3  4 ... 24 21  6]
 [ 0  2  1 ... 25 22  5]
 ...
 [ 2  0  7 ... 26  7  0]
 [ 0  1  1 ...  3  9  6]
 [ 0  3  7 ...  5 13  6]]


In [54]:
print(y_train)

[38 64 52 ... 65 70 37]


In [55]:
print(x_test)

[[ 2  2  4 ... 27  2  2]
 [ 2  2  5 ... 22 18  1]
 [ 1  2  2 ... 23 17  5]
 ...
 [ 1  0  7 ... 28  1  5]
 [ 1  0  7 ... 24  7  2]
 [ 0  3  6 ... 28  7  2]]


In [56]:
print(y_test)

[33 60 57 ... 45 39 28]


### Feature Scaling

In [57]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [58]:
print(x_train)

[[-0.17210263 -1.49843972  1.49192382 ...  0.71769877 -1.65712078
  -0.99587567]
 [-0.17210263  1.25582671  0.30990541 ...  0.94519522  1.37498494
   1.50282116]
 [-1.41976828  0.3377379  -0.87211299 ...  1.05894345  1.51937093
   1.00308179]
 ...
 [ 1.07556302 -1.49843972  1.49192382 ...  1.17269168 -0.64641887
  -1.49561503]
 [-1.41976828 -0.58035091 -0.87211299 ... -1.44351754 -0.3576469
   1.50282116]
 [-1.41976828  1.25582671  1.49192382 ... -1.21602109  0.21989705
   1.50282116]]


In [59]:
print(x_test)

[[ 1.07556302  0.3377379   0.30990541 ...  1.28643991 -1.36834881
  -0.4961363 ]
 [ 1.07556302  0.3377379   0.70391155 ...  0.71769877  0.94182698
  -0.99587567]
 [-0.17210263  0.3377379  -0.47810686 ...  0.831447    0.79744099
   1.00308179]
 ...
 [-0.17210263 -1.49843972  1.49192382 ...  1.40018813 -1.5127348
   1.00308179]
 [-0.17210263 -1.49843972  1.49192382 ...  0.94519522 -0.64641887
  -0.4961363 ]
 [-1.41976828  1.25582671  1.09791768 ...  1.40018813 -0.64641887
  -0.4961363 ]]


### Training the Random Forest Regressor model on the Training set

In [86]:
# from sklearn.ensemble import RandomForestClassifier
# classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
# classifier.fit(x_train, y_train)

from sklearn.ensemble import RandomForestRegressor
regressor2 = RandomForestRegressor(n_estimators = 20, random_state = 0)
regressor2.fit(x_train, y_train)

RandomForestRegressor(n_estimators=20, random_state=0)

### Checking Mean Squared Error and Accuracy score

In [87]:
# from sklearn.metrics import confusion_matrix, accuracy_score
# y_pred = classifier.predict(x_test)
# cm = confusion_matrix(y_test, y_pred)
# print(cm)
# accuracy_score(y_test, y_pred)

from sklearn.metrics import mean_squared_error
y_pred = regressor2.predict(x_test)
mean_squared_error(y_test,y_pred)

85.61575633551502

In [88]:
regressor2.score(x_test,y_test)

0.6977438015028874

# Decision Tree Classification

In [69]:
# Splitting the dataset into the Training set and Test set
x_train, x_test, y_train, y_test = train_test_split(x_dt, y_dt, test_size = 0.25, random_state = 0)

In [70]:
print(x_train)

[[ 1  0  7 ... 22  0  1]
 [ 1  3  4 ... 24 21  6]
 [ 0  2  1 ... 25 22  5]
 ...
 [ 2  0  7 ... 26  7  0]
 [ 0  1  1 ...  3  9  6]
 [ 0  3  7 ...  5 13  6]]


In [71]:
print(y_train)

[38 64 52 ... 65 70 37]


In [72]:
print(x_test)

[[ 2  2  4 ... 27  2  2]
 [ 2  2  5 ... 22 18  1]
 [ 1  2  2 ... 23 17  5]
 ...
 [ 1  0  7 ... 28  1  5]
 [ 1  0  7 ... 24  7  2]
 [ 0  3  6 ... 28  7  2]]


In [73]:
print(y_test)

[33 60 57 ... 45 39 28]


### Feature Scaling

In [75]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [76]:
print(x_train)

[[-0.17210263 -1.49843972  1.49192382 ...  0.71769877 -1.65712078
  -0.99587567]
 [-0.17210263  1.25582671  0.30990541 ...  0.94519522  1.37498494
   1.50282116]
 [-1.41976828  0.3377379  -0.87211299 ...  1.05894345  1.51937093
   1.00308179]
 ...
 [ 1.07556302 -1.49843972  1.49192382 ...  1.17269168 -0.64641887
  -1.49561503]
 [-1.41976828 -0.58035091 -0.87211299 ... -1.44351754 -0.3576469
   1.50282116]
 [-1.41976828  1.25582671  1.49192382 ... -1.21602109  0.21989705
   1.50282116]]


In [77]:
print(x_test)

[[ 1.07556302  0.3377379   0.30990541 ...  1.28643991 -1.36834881
  -0.4961363 ]
 [ 1.07556302  0.3377379   0.70391155 ...  0.71769877  0.94182698
  -0.99587567]
 [-0.17210263  0.3377379  -0.47810686 ...  0.831447    0.79744099
   1.00308179]
 ...
 [-0.17210263 -1.49843972  1.49192382 ...  1.40018813 -1.5127348
   1.00308179]
 [-0.17210263 -1.49843972  1.49192382 ...  0.94519522 -0.64641887
  -0.4961363 ]
 [-1.41976828  1.25582671  1.09791768 ...  1.40018813 -0.64641887
  -0.4961363 ]]


### Training the Decision Tree Regressor model on the Training set

In [78]:
from sklearn.tree import DecisionTreeRegressor
regressor3 = DecisionTreeRegressor(random_state = 0)
regressor3.fit(x_train, y_train)

DecisionTreeRegressor(random_state=0)

### Checking Mean Squared Error and Accuracy score

In [79]:
y_pred = regressor2.predict(x_test)
mean_squared_error(y_test,y_pred)

87.53835170995197

In [81]:
regressor3.score(x_test,y_test)

0.6356395764127143

### Comparison of Accuracy score of all 3 models on test set

- Support Vector Machine: -0.155%
- Random Forest Model: 69.774%
- Decision Tree Model: 63.563%