In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.utils import resample

In [2]:
data = pd.read_csv('../00_Data/Integrated_data/Fully_integrated_data.csv')

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0.1,Unnamed: 0,Date,Total Traffic Volume,Base Interest Rate,Exchange Rate,Gross Domestic Product (GDP),Economic Growth Rate (Real GDP Growth Rate),Export price,Export growth rate,Import price,Import growth rate,Nationwide Consumer Price Index,Regular Unleaded Gasoline,Diesel Fuel for Automobiles,number of car ownership
0,0,2011-01-01,5466446.0,2.75,1121.5,328096.0,5.4,44465370.0,44.7,41981320.0,13.6,90.448,1814.57,1611.14,18409608.0
1,1,2011-01-02,5059821.0,2.75,1121.732258,328096.0,5.4,44269520.0,43.775943,41799130.0,13.521635,90.397806,1815.24,1611.79,18398782.0
2,2,2011-01-03,5631394.0,2.75,1121.964516,328096.0,5.4,44073670.0,42.851885,41616940.0,13.443269,90.347613,1816.15,1612.85,18387956.0
3,3,2011-01-04,5584995.0,2.75,1122.196774,328096.0,5.4,43877830.0,41.927828,41434750.0,13.364904,90.297419,1817.48,1614.02,18377130.0
4,4,2011-01-05,5742955.0,2.75,1122.429032,328096.0,5.4,43681980.0,41.003771,41252560.0,13.286539,90.247226,1818.53,1615.0,18366304.0


In [4]:
# Selecting features and target variable
X = data.drop(columns=['Date', 'Total Traffic Volume'])
y = data['Total Traffic Volume']

In [6]:
# Adding a small constant to avoid log(0)
X = X + 1e-6

# Applying log scaling to the features
X = X.apply(lambda x: np.log1p(x) if np.issubdtype(x.dtype, np.number) else x)

# Binarizing the target variable for classification
y = (y > y.median()).astype(int)  # Assuming we want to classify traffic rate into high (1) and low (0)

# Imputing missing values
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Generating bootstrap samples
def generate_bootstrap_samples(X, y, n_samples=10):
    bootstrap_samples = []
    for _ in range(n_samples):
        X_resampled, y_resampled = resample(X, y, replace=True, random_state=42)
        bootstrap_samples.append((X_resampled, y_resampled))
    return bootstrap_samples

bootstrap_samples = generate_bootstrap_samples(X_imputed, y, n_samples=10)

# Displaying a few rows from each bootstrap sample
for i, (X_resampled, y_resampled) in enumerate(bootstrap_samples):
    print(f"Bootstrap Sample {i+1} - Features:")
    print(pd.DataFrame(X_resampled, columns=data.columns.drop(['Date', 'Total Traffic Volume'])).head())
    print(f"Bootstrap Sample {i+1} - Target:")
    print(pd.Series(y_resampled).head())
    print("\n")

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=42)

# Implementing the Bagging Classifier with default parameters
bagging_clf = BaggingClassifier(n_estimators=10, random_state=42)
bagging_clf.fit(X_train, y_train)

# Making predictions
y_pred = bagging_clf.predict(X_test)

# Evaluating the model using a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Printing results
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)

# Printing bootstrap sample summaries
bootstrap_samples_summary = []

for i, (X_resampled, y_resampled) in enumerate(bootstrap_samples):
    sample_summary = {
        "Sample Number": i + 1,
        "Features Shape": X_resampled.shape,
        "Target Distribution": pd.Series(y_resampled).value_counts().to_dict()
    }
    bootstrap_samples_summary.append(sample_summary)

# Displaying the summary
bootstrap_samples_summary_df = pd.DataFrame(bootstrap_samples_summary)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Bootstrap Sample 1 - Features:
   Unnamed: 0  Base Interest Rate  Exchange Rate  \
0    2.048737            0.812158       2.083651   
1    2.223068            0.340369       2.081174   
2    2.201316            0.698932       2.087894   
3    1.966599            0.894830       2.085111   
4    2.240229            0.917921       2.101862   

   Gross Domestic Product (GDP)  Economic Growth Rate (Real GDP Growth Rate)  \
0                      2.626811                                     0.859076   
1                      2.650132                                     1.099106   
2                      2.644988                                     0.785692   
3                      2.623876                                     0.799207   
4                      2.651049                                     0.495827   

   Export price  Export growth rate  Import price  Import growth rate  \
0      2.928009            0.844423      2.928009            1.321483   
1      2.931202            1.