In [2]:
pip install seaborn



In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [4]:
pip install kagglehub



In [5]:
import kagglehub
import os

dataset_dir = kagglehub.dataset_download("bobaaayoung/trafficvolumedatacsv")

# Find the CSV file within the directory
for filename in os.listdir(dataset_dir):
    if filename.endswith(".csv"):
        csv_file_path = os.path.join(dataset_dir, filename)
        break  # Stop after finding the first CSV file

print("Path to CSV file:", csv_file_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/bobaaayoung/trafficvolumedatacsv?dataset_version_number=1...


100%|██████████| 532k/532k [00:00<00:00, 44.8MB/s]

Extracting files...
Path to CSV file: /root/.cache/kagglehub/datasets/bobaaayoung/trafficvolumedatacsv/versions/1/TrafficVolumeData.csv





In [6]:
df = pd.read_csv(csv_file_path)
df.head()

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume
0,2012-10-02 09:00:00,,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545
1,2012-10-02 10:00:00,,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516
2,2012-10-02 11:00:00,,113,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,overcast clouds,4767
3,2012-10-02 12:00:00,,20,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,overcast clouds,5026
4,2012-10-02 13:00:00,,281,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,broken clouds,4918


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33750 entries, 0 to 33749
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date_time            33750 non-null  object 
 1   is_holiday           43 non-null     object 
 2   air_pollution_index  33750 non-null  int64  
 3   humidity             33750 non-null  int64  
 4   wind_speed           33750 non-null  int64  
 5   wind_direction       33750 non-null  int64  
 6   visibility_in_miles  33750 non-null  int64  
 7   dew_point            33750 non-null  int64  
 8   temperature          33750 non-null  float64
 9   rain_p_h             33750 non-null  float64
 10  snow_p_h             33750 non-null  float64
 11  clouds_all           33750 non-null  int64  
 12  weather_type         33750 non-null  object 
 13  weather_description  33750 non-null  object 
 14  traffic_volume       33750 non-null  int64  
dtypes: float64(3), int64(8), object(4)
m

In [8]:
# Null values
null_values = df.isnull().sum()
print("Null values:\n", null_values)


Null values:
 date_time                  0
is_holiday             33707
air_pollution_index        0
humidity                   0
wind_speed                 0
wind_direction             0
visibility_in_miles        0
dew_point                  0
temperature                0
rain_p_h                   0
snow_p_h                   0
clouds_all                 0
weather_type               0
weather_description        0
traffic_volume             0
dtype: int64


In [9]:
# List out all the unique values in the is_holiday column
unique_values = df['is_holiday'].unique()
print("Unique values in is_holiday column:", unique_values)

Unique values in is_holiday column: [nan 'Columbus Day' 'Veterans Day' 'Thanksgiving Day' 'Christmas Day'
 'New Years Day' 'Washingtons Birthday' 'Memorial Day' 'Independence Day'
 'State Fair' 'Labor Day' 'Martin Luther King Jr Day']


In [10]:
# Replace the NaN in is_holiday to "No"
df['is_holiday'].fillna('No', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['is_holiday'].fillna('No', inplace=True)


In [11]:
# Null values
null_values = df.isnull().sum()
print("Null values:\n", null_values)

# Unique values
unique_values = df.nunique()
print("\nUnique values:\n", unique_values)

# Numerical columns
numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
print("Numerical columns:", numerical_cols)

# Categorical columns
categorical_cols = df.select_dtypes(include=['category', 'object']).columns.tolist()
print("Categorical columns:", categorical_cols)

Null values:
 date_time              0
is_holiday             0
air_pollution_index    0
humidity               0
wind_speed             0
wind_direction         0
visibility_in_miles    0
dew_point              0
temperature            0
rain_p_h               0
snow_p_h               0
clouds_all             0
weather_type           0
weather_description    0
traffic_volume         0
dtype: int64

Unique values:
 date_time              28589
is_holiday                12
air_pollution_index      290
humidity                  88
wind_speed                17
wind_direction           361
visibility_in_miles        9
dew_point                  9
temperature             5611
rain_p_h                 353
snow_p_h                  12
clouds_all                60
weather_type              11
weather_description       38
traffic_volume          6462
dtype: int64
Numerical columns: ['air_pollution_index', 'humidity', 'wind_speed', 'wind_direction', 'visibility_in_miles', 'dew_point', 'temperatu

In [12]:
# Convert the date_time column to datetime object
df_update_1 = df.copy()
df_update_1['date_time'] = pd.to_datetime(df['date_time'])

# Split date_time into two columns, date and time
df_update_1['date'] = df_update_1['date_time'].dt.date
df_update_1['time'] = df_update_1['date_time'].dt.time

# Bring the date and time columns to the front of the dataframe
df_update_1 = df_update_1[['date_time','date', 'time'] + [col for col in df_update_1.columns if col not in ['date_time', 'date', 'time']]]


In [13]:
df_update_1.head()

Unnamed: 0,date_time,date,time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume
0,2012-10-02 09:00:00,2012-10-02,09:00:00,No,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545
1,2012-10-02 10:00:00,2012-10-02,10:00:00,No,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516
2,2012-10-02 11:00:00,2012-10-02,11:00:00,No,113,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,overcast clouds,4767
3,2012-10-02 12:00:00,2012-10-02,12:00:00,No,20,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,overcast clouds,5026
4,2012-10-02 13:00:00,2012-10-02,13:00:00,No,281,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,broken clouds,4918


In [14]:
# Create a function to categorize time into different periods
def categorize_time(time):
    if pd.isnull(time):
        return np.nan  # Handle missing values
    hour = time.hour
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

# Apply the function to the 'time' column
df_update_2 = df_update_1.copy()
df_update_2['time_of_day'] = df_update_2['time'].apply(categorize_time)

df_update_2 = df_update_2[['date_time','date', 'time','time_of_day'] + [col for col in df_update_1.columns if col not in ['date_time', 'date', 'time','time_of_day']]]


In [15]:
df_update_2.head()

Unnamed: 0,date_time,date,time,time_of_day,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume
0,2012-10-02 09:00:00,2012-10-02,09:00:00,Morning,No,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545
1,2012-10-02 10:00:00,2012-10-02,10:00:00,Morning,No,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516
2,2012-10-02 11:00:00,2012-10-02,11:00:00,Morning,No,113,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,overcast clouds,4767
3,2012-10-02 12:00:00,2012-10-02,12:00:00,Afternoon,No,20,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,overcast clouds,5026
4,2012-10-02 13:00:00,2012-10-02,13:00:00,Afternoon,No,281,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,broken clouds,4918


In [16]:
df_update_3 = df_update_2.copy()
# Convert 'date' column to datetime objects before using .dt accessor
df_update_3['date'] = pd.to_datetime(df_update_3['date'])
df_update_3['day_of_week'] = df_update_3['date'].dt.day_name()

# Bring day_of_week next to date
cols = list(df_update_3.columns)
cols.remove('day_of_week')
cols.insert(cols.index('date') + 1, 'day_of_week')
df_update_3 = df_update_3[cols]


In [17]:
df_update_3.head()

Unnamed: 0,date_time,date,day_of_week,time,time_of_day,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume
0,2012-10-02 09:00:00,2012-10-02,Tuesday,09:00:00,Morning,No,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545
1,2012-10-02 10:00:00,2012-10-02,Tuesday,10:00:00,Morning,No,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516
2,2012-10-02 11:00:00,2012-10-02,Tuesday,11:00:00,Morning,No,113,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,overcast clouds,4767
3,2012-10-02 12:00:00,2012-10-02,Tuesday,12:00:00,Afternoon,No,20,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,overcast clouds,5026
4,2012-10-02 13:00:00,2012-10-02,Tuesday,13:00:00,Afternoon,No,281,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,broken clouds,4918


In [18]:
pip install pandas



In [20]:
import pandas as pd
df=pd.read_csv("traffic_data_preprocessed.csv")

In [21]:
!pip install scikit-learn




In [22]:
# pip install scikit-learn
df=pd.read_csv("traffic_data_preprocessed.csv")

In [23]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Split features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize MLPRegressor with some hyperparameters
model = MLPRegressor(
    hidden_layer_sizes=(100, 50),  # two hidden layers with 100 and 50 neurons respectively
    activation='relu',             # activation function
    solver='adam',                 # optimization algorithm
    max_iter=500,                  # maximum number of iterations
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate the model
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))


R2 Score: 0.49305453732882565
MSE: 1.2426832873377829


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming df_scaled is defined and includes your features and a categorical target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)

# Train the model
model.fit(X_train, y_train)

# Predict the target for test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.3977777777777778
Confusion Matrix:
 [[233  65   0  63 124   0]
 [ 91 597 149 134 609 165]
 [  0 285 211  33 339 128]
 [ 93  28   0 452 273   0]
 [151 291  67 292 784 286]
 [  0 203  90   1 105 408]]
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.48      0.44       485
           1       0.41      0.34      0.37      1745
           2       0.41      0.21      0.28       996
           3       0.46      0.53      0.50       846
           4       0.35      0.42      0.38      1871
           5       0.41      0.51      0.45       807

    accuracy                           0.40      6750
   macro avg       0.41      0.42      0.40      6750
weighted avg       0.40      0.40      0.39      6750



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming df_scaled is defined and includes your features and a categorical target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Decision Tree Classifier with some hyperparameters
model = DecisionTreeClassifier(
    max_depth=10,       # limits the depth of the tree to prevent overfitting
    min_samples_split=5,  # minimum samples required to split an internal node
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Predict the target for test data
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6365925925925926
Confusion Matrix:
 [[ 350    4    0   47   84    0]
 [  56 1072  164  129  247   77]
 [   0   93  627   77  129   70]
 [  95   23    4  618  106    0]
 [  89  120  120  253 1147  142]
 [   0   70  113    0  141  483]]
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.72      0.65       485
           1       0.78      0.61      0.69      1745
           2       0.61      0.63      0.62       996
           3       0.55      0.73      0.63       846
           4       0.62      0.61      0.62      1871
           5       0.63      0.60      0.61       807

    accuracy                           0.64      6750
   macro avg       0.63      0.65      0.64      6750
weighted avg       0.65      0.64      0.64      6750



In [26]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error

X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a deeper MLPRegressor
model = MLPRegressor(
    hidden_layer_sizes=(512, 256, 128),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    early_stopping=False,
    max_iter=1000,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate the model
print("Progressed MLP Regressor")
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

Progressed MLP Regressor
R2 Score: 0.7203662751268893
MSE: 0.6854704935020329


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Reuse the same df
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Logistic Regression with higher C
model = LogisticRegression(
    C=10,
    solver='saga',
    max_iter=1000,
    random_state=42
)

# Train the model
model.fit(X_train_scaled, y_train)

# Predict
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Progressed Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Progressed Logistic Regression
Accuracy: 0.3983703703703704
Confusion Matrix:
 [[231  65   0  63 126   0]
 [ 93 597 148 132 609 166]
 [  0 290 208  33 336 129]
 [ 92  28   0 457 269   0]
 [149 292  68 290 787 285]
 [  0 204  88   1 105 409]]
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.48      0.44       485
           1       0.40      0.34      0.37      1745
           2       0.41      0.21      0.28       996
           3       0.47      0.54      0.50       846
           4       0.35      0.42      0.38      1871
           5       0.41      0.51      0.46       807

    accuracy                           0.40      6750
   macro avg       0.41      0.42      0.40      6750
weighted avg       0.40      0.40      0.39      6750



In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Reuse the same df
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize fully grown Decision Tree
model = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate the model
print("Progressed Decision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Progressed Decision Tree Classifier
Accuracy: 0.814962962962963
Confusion Matrix:
 [[ 414   18    0   22   31    0]
 [  23 1404   87   51  126   54]
 [   0   66  804   15   68   43]
 [  31   44   16  691   64    0]
 [  30  123   71   71 1523   53]
 [   0   50   37    0   55  665]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.85      0.84       485
           1       0.82      0.80      0.81      1745
           2       0.79      0.81      0.80       996
           3       0.81      0.82      0.81       846
           4       0.82      0.81      0.81      1871
           5       0.82      0.82      0.82       807

    accuracy                           0.81      6750
   macro avg       0.82      0.82      0.82      6750
weighted avg       0.82      0.81      0.81      6750

