In [14]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.utils import resample

In [15]:
data = pd.read_csv('../data/complete.csv')

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,date,GDP,Economic growth rate,Regular Gasoline,Diesel for automobiles,population,Car registration,Min wage,Min wage increase rate,City bus trunk line,...,Texi,Model texi,Consumer price,Interest rate,Number of employed,Export amount,Export growth rate,Import amount,Import growth rate,Traffic rate
0,2015-01-01,394273.0,2.4,1568.65,1383.99,10045027,3056588.0,5580,7.1,1200,...,3000,5000,94.643,0.02,516.65,45105389,-1.0,39283025,-12.2,5400081.5
1,2015-01-02,394273.0,2.4,1568.65,1383.99,10045027,3056588.0,5580,7.1,1200,...,3000,5000,94.643,0.02,516.65,45105389,-1.0,39283025,-12.2,5220255.0
2,2015-01-03,394273.0,2.4,1568.65,1383.99,10045027,3056588.0,5580,7.1,1200,...,3000,5000,94.643,0.02,516.65,45105389,-1.0,39283025,-12.2,4480084.5
3,2015-01-04,394273.0,2.4,1568.65,1383.99,10045027,3056588.0,5580,7.1,1200,...,3000,5000,94.643,0.02,516.65,45105389,-1.0,39283025,-12.2,5486275.0
4,2015-01-05,394273.0,2.4,1531.05,1351.25,10045027,3056588.0,5580,7.1,1200,...,3000,5000,94.643,0.02,516.65,45105389,-1.0,39283025,-12.2,5831397.0


In [16]:
# Selecting features and target variable
X = data.drop(columns=['date', 'Traffic rate'])
y = data['Traffic rate']

In [20]:
# Adding a small constant to avoid log(0)
X = X + 1e-6

# Applying log scaling to the features
X = X.apply(lambda x: np.log1p(x) if np.issubdtype(x.dtype, np.number) else x)

# Binarizing the target variable for classification
y = (y > y.median()).astype(int)  # Assuming we want to classify traffic rate into high (1) and low (0)

# Imputing missing values
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Generating bootstrap samples
def generate_bootstrap_samples(X, y, n_samples=10):
    bootstrap_samples = []
    for _ in range(n_samples):
        X_resampled, y_resampled = resample(X, y, replace=True, random_state=42)
        bootstrap_samples.append((X_resampled, y_resampled))
    return bootstrap_samples

bootstrap_samples = generate_bootstrap_samples(X_imputed, y, n_samples=10)

# Displaying a few rows from each bootstrap sample
for i, (X_resampled, y_resampled) in enumerate(bootstrap_samples):
    print(f"Bootstrap Sample {i+1} - Features:")
    print(pd.DataFrame(X_resampled, columns=data.columns.drop(['date', 'Traffic rate'])).head())
    print(f"Bootstrap Sample {i+1} - Target:")
    print(pd.Series(y_resampled).head())
    print("\n")

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.3, random_state=42)

# Implementing the Bagging Classifier with default parameters
bagging_clf = BaggingClassifier(n_estimators=10, random_state=42)
bagging_clf.fit(X_train, y_train)

# Making predictions
y_pred = bagging_clf.predict(X_test)

# Evaluating the model using a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Printing results
print("Confusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)

# Printing bootstrap sample summaries
bootstrap_samples_summary = []

for i, (X_resampled, y_resampled) in enumerate(bootstrap_samples):
    sample_summary = {
        "Sample Number": i + 1,
        "Features Shape": X_resampled.shape,
        "Target Distribution": pd.Series(y_resampled).value_counts().to_dict()
    }
    bootstrap_samples_summary.append(sample_summary)

# Displaying the summary
bootstrap_samples_summary_df = pd.DataFrame(bootstrap_samples_summary)

Bootstrap Sample 1 - Features:
        GDP  Economic growth rate  Regular Gasoline  Diesel for automobiles  \
0  0.831482              0.397353          0.762322                0.761244   
1  0.829531              0.478785          0.759250                0.756449   
2  0.830101              0.462064          0.760776                0.758402   
3  0.829439              0.482476          0.760231                0.757692   
4  0.829911              0.482476          0.760013                0.757385   

   population  Car registration  Min wage  Min wage increase rate  \
0    0.852105          0.844736  0.788389                0.534384   
1    0.852450          0.844566  0.782894                0.565288   
2    0.852376          0.844586  0.785033                0.617513   
3    0.852401          0.844586  0.785033                0.617513   
4    0.852409          0.844586  0.782894                0.565998   

   City bus trunk line  City bus wide area  ...   Subway      Texi  \
0        

  result = getattr(ufunc, method)(*inputs, **kwargs)
