In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Load and preprocess the dataset
data = pd.read_csv('data/Train.csv')
data = data.sort_values(by=['date_time'], ascending=True).reset_index(drop=True)

# Feature Engineering: Adding traffic volume data for previous hours
last_n_hours = [1, 2, 3, 4, 5, 6]
for n in last_n_hours:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)

# Convert holiday to binary (fix the issue here)
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)

# Extract date-time features
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['month_day'] = data['date_time'].dt.day
data['weekday'] = data['date_time'].dt.weekday
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year

# Encode weather conditions using one-hot encoding for safer feature handling
n1features = ['Rain', 'Clouds', 'Clear', 'Snow', 'Mist', 'Drizzle', 'Haze', 'Thunderstorm', 'Fog', 'Smoke', 'Squall']
n2features = ['light rain', 'few clouds', 'Sky is Clear', 'light snow', 'sky is clear', 'mist', 'broken clouds', 'moderate rain']

data['weather_type'] = data['weather_type'].apply(lambda wt: n1features.index(wt) + 1 if wt in n1features else 0)
data['weather_description'] = data['weather_description'].apply(lambda wd: n2features.index(wd) + 1 if wd in n2features else 0)

# Define traffic categories based on quantiles
low_threshold = data['traffic_volume'].quantile(0.25)
high_threshold = data['traffic_volume'].quantile(0.75)

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_threshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Prepare Features and Target
features = ['is_holiday', 'temperature', 'weekday', 'hour', 'month_day', 'year', 'month', 'weather_type', 'weather_description']
X = data[features]
y = data['traffic_category']

# Scaling Features
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features)

# Encoding Target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)


In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Initialize the models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Hyperparameter Tuning Function
def tune_hyperparameters(model, params, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Tune Random Forest
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10]}
best_rf = tune_hyperparameters(models['Random Forest'], rf_params, X_train, y_train)

# Tune Gradient Boosting
gb_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}
best_gb = tune_hyperparameters(models['Gradient Boosting'], gb_params, X_train, y_train)

# Tune XGBoost
xgb_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}
best_xgb = tune_hyperparameters(models['XGBoost'], xgb_params, X_train, y_train)


In [3]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score

# Stacking Model
stacking_model = StackingClassifier(
    estimators=[('rf', best_rf), ('gb', best_gb), ('xgb', best_xgb)],
    final_estimator=LogisticRegression()
)

# Evaluation Function
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

    # Cross-validation score
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy').mean()

    return accuracy, conf_matrix, class_report, cv_score

# Evaluate Models
models_to_evaluate = {
    'Best Random Forest': best_rf,
    'Best Gradient Boosting': best_gb,
    'Best XGBoost': best_xgb,
    'Stacked Model': stacking_model
}

for model_name, model in models_to_evaluate.items():
    accuracy, conf_matrix, class_report, cv_score = evaluate_model(model, X_train, y_train, X_test, y_test)
    print(f"\n{model_name}:")
    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Classification Report:\n{class_report}")
    print(f"Cross-Validation Score: {cv_score}")



Best Random Forest:
Accuracy: 0.8888888888888888
Confusion Matrix:
[[1 0 0]
 [0 1 0]
 [1 0 6]]
Classification Report:
                precision    recall  f1-score   support

  High Traffic       0.50      1.00      0.67         1
   Low Traffic       1.00      1.00      1.00         1
Medium Traffic       1.00      0.86      0.92         7

      accuracy                           0.89         9
     macro avg       0.83      0.95      0.86         9
  weighted avg       0.94      0.89      0.90         9

Cross-Validation Score: 0.6095238095238096

Best Gradient Boosting:
Accuracy: 0.8888888888888888
Confusion Matrix:
[[1 0 0]
 [0 1 0]
 [1 0 6]]
Classification Report:
                precision    recall  f1-score   support

  High Traffic       0.50      1.00      0.67         1
   Low Traffic       1.00      1.00      1.00         1
Medium Traffic       1.00      0.86      0.92         7

      accuracy                           0.89         9
     macro avg       0.83      0.95   

In [4]:
import joblib

# Save the Stacked Model
joblib.dump(stacking_model, 'best_traffic_model.pkl')
print("Best model saved as 'best_traffic_model.pkl'")


Best model saved as 'best_traffic_model.pkl'


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Load and preprocess the dataset
data = pd.read_csv('data/Train.csv')
data = data.sort_values(by=['date_time'], ascending=True).reset_index(drop=True)

# Feature Engineering: Adding traffic volume data for previous hours (if needed)
# This can be omitted as we're focusing on your selected nine features
last_n_hours = [1, 2, 3, 4, 5, 6]
for n in last_n_hours:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)

# Convert holiday to binary
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)

# Extract date-time features
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['month_day'] = data['date_time'].dt.day
data['weekday'] = data['date_time'].dt.weekday
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year

# Encode weather conditions (ordinal or one-hot encoding if necessary)
n1features = ['Rain', 'Clouds', 'Clear', 'Snow', 'Mist', 'Drizzle', 'Haze', 'Thunderstorm', 'Fog', 'Smoke', 'Squall']
n2features = ['light rain', 'few clouds', 'Sky is Clear', 'light snow', 'sky is clear', 'mist', 'broken clouds', 'moderate rain']

data['weather_type'] = data['weather_type'].apply(lambda wt: n1features.index(wt) + 1 if wt in n1features else 0)
data['weather_description'] = data['weather_description'].apply(lambda wd: n2features.index(wd) + 1 if wd in n2features else 0)

# Define traffic categories based on quantiles
low_threshold = data['traffic_volume'].quantile(0.25)
high_threshold = data['traffic_volume'].quantile(0.75)

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_threshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Select only the nine features for training
features = ['weather_type', 'weather_description', 'hour', 'month_day', 'weekday', 'month', 'year', 'is_holiday', 'temperature']
X = data[features]
y = data['traffic_category']

# Scaling the Features
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features)

# Encode the Target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Initialize the models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Hyperparameter Tuning Function
def tune_hyperparameters(model, params, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Tune Random Forest
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10]}
best_rf = tune_hyperparameters(models['Random Forest'], rf_params, X_train, y_train)

# Tune Gradient Boosting
gb_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}
best_gb = tune_hyperparameters(models['Gradient Boosting'], gb_params, X_train, y_train)

# Tune XGBoost
xgb_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}
best_xgb = tune_hyperparameters(models['XGBoost'], xgb_params, X_train, y_train)


In [7]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score

# Stacking Model
stacking_model = StackingClassifier(
    estimators=[('rf', best_rf), ('gb', best_gb), ('xgb', best_xgb)],
    final_estimator=LogisticRegression()
)

# Evaluation Function
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

    # Cross-validation score
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy').mean()

    return accuracy, conf_matrix, class_report, cv_score

# Evaluate Models
models_to_evaluate = {
    'Best Random Forest': best_rf,
    'Best Gradient Boosting': best_gb,
    'Best XGBoost': best_xgb,
    'Stacked Model': stacking_model
}

for model_name, model in models_to_evaluate.items():
    accuracy, conf_matrix, class_report, cv_score = evaluate_model(model, X_train, y_train, X_test, y_test)
    print(f"\n{model_name}:")
    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Classification Report:\n{class_report}")
    print(f"Cross-Validation Score: {cv_score}")



Best Random Forest:
Accuracy: 0.7777777777777778
Confusion Matrix:
[[1 0 0]
 [0 1 0]
 [1 1 5]]
Classification Report:
                precision    recall  f1-score   support

  High Traffic       0.50      1.00      0.67         1
   Low Traffic       0.50      1.00      0.67         1
Medium Traffic       1.00      0.71      0.83         7

      accuracy                           0.78         9
     macro avg       0.67      0.90      0.72         9
  weighted avg       0.89      0.78      0.80         9

Cross-Validation Score: 0.6952380952380952

Best Gradient Boosting:
Accuracy: 0.8888888888888888
Confusion Matrix:
[[1 0 0]
 [0 1 0]
 [1 0 6]]
Classification Report:
                precision    recall  f1-score   support

  High Traffic       0.50      1.00      0.67         1
   Low Traffic       1.00      1.00      1.00         1
Medium Traffic       1.00      0.86      0.92         7

      accuracy                           0.89         9
     macro avg       0.83      0.95   

In [None]:
# Save the Stacked Model
joblib.dump(stacking_model, 'traffic_model.pkl')
print("Best model saved as 'best_traffic_model.pkl'")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV

# Load and preprocess the dataset
data = pd.read_csv('data/Train.csv')

# Drop columns that are not required
data = data[['date_time', 'is_holiday', 'temperature', 'weekday', 'hour', 'month_day', 'year', 'month', 'traffic_volume']]

# Convert holiday to binary
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)

# Extract date-time features
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['month_day'] = data['date_time'].dt.day
data['weekday'] = data['date_time'].dt.weekday
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year

# Prepare Features and Target
features = ['is_holiday', 'temperature', 'weekday', 'hour', 'month_day', 'year', 'month']
X = data[features]
y = data['traffic_volume']  # Assuming traffic volume is your target

# Scaling Features
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features)

# Encoding Target
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

# Initialize the models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Hyperparameter Tuning Function
def tune_hyperparameters(model, params, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Tune Random Forest
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10]}
best_rf = tune_hyperparameters(models['Random Forest'], rf_params, X_train, y_train)

# Tune Gradient Boosting
gb_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}
best_gb = tune_hyperparameters(models['Gradient Boosting'], gb_params, X_train, y_train)

# Tune XGBoost
xgb_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}
best_xgb = tune_hyperparameters(models['XGBoost'], xgb_params, X_train, y_train)

# Stacking Model
stacking_model = StackingClassifier(
    estimators=[('rf', best_rf), ('gb', best_gb), ('xgb', best_xgb)],
    final_estimator=LogisticRegression()
)

# Evaluation Function
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

    # Cross-validation score
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy').mean()

    return accuracy, conf_matrix, class_report, cv_score

# Evaluate Models
models_to_evaluate = {
    'Best Random Forest': best_rf,
    'Best Gradient Boosting': best_gb,
    'Best XGBoost': best_xgb,
    'Stacked Model': stacking_model
}

for model_name, model in models_to_evaluate.items():
    accuracy, conf_matrix, class_report, cv_score = evaluate_model(model, X_train, y_train, X_test, y_test)
    print(f"\n{model_name}:")
    print(f"Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Classification Report:\n{class_report}")
    print(f"Cross-Validation Score: {cv_score}")


In [None]:
# reviste
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import joblib
import warnings

warnings.filterwarnings('ignore')

# Step 1: Load the Dataset
data = pd.read_csv('data/Train.csv')
data = data.sort_values(by=['date_time'], ascending=True).reset_index(drop=True)

# Step 2: Data Preprocessing & Feature Engineering
last_n_hours = [1, 2, 3, 4, 5, 6]
for n in last_n_hours:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)

# Convert holiday to binary
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)

# Extract date-time features
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['month_day'] = data['date_time'].dt.day
data['weekday'] = data['date_time'].dt.day + 1
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year
data.to_csv("traffic_volume_data.csv", index=None)
data = pd.read_csv("traffic_volume_data.csv")

label_columns = ['weather_type', 'weather_description']
numeric_columns = ['is_holiday', 'temperature',
                       'weekday', 'hour', 'month_day', 'year', 'month']


features = numeric_columns+label_columns

# Encode weather conditions
n1features = ['Rain', 'Clouds', 'Clear', 'Snow', 'Mist', 'Drizzle', 'Haze', 'Thunderstorm', 'Fog', 'Smoke', 'Squall']
n2features = ['light rain', 'few clouds', 'Sky is Clear', 'light snow', 'sky is clear', 'mist', 'broken clouds',
              'moderate rain', 'drizzle', 'overcast clouds', 'scattered clouds', 'haze', 'proximity thunderstorm',
              'light intensity drizzle', 'heavy snow', 'heavy intensity rain', 'fog', 'heavy intensity drizzle',
              'shower snow', 'snow', 'thunderstorm with rain', 'thunderstorm with heavy rain',
              'thunderstorm with light rain', 'proximity thunderstorm with rain', 'thunderstorm with drizzle',
              'smoke', 'thunderstorm', 'proximity shower rain', 'very heavy rain', 'proximity thunderstorm with drizzle',
              'light rain and snow', 'light intensity shower rain', 'SQUALLS', 'shower drizzle', 'thunderstorm with light drizzle']

n11 = [n1features.index(wt) + 1 if wt in n1features else 0 for wt in data['weather_type']]
n22 = [n2features.index(wd) + 1 if wd in n2features else 0 for wd in data['weather_description']]

data['weather_type'] = n11
data['weather_description'] = n22


# Define traffic categories
target = data['traffic_volume'].quantile([0.25, 0.75]).to_dict()
low_threshold = target[0.25]   # Addtional information why not 0.50 and you only consider the 0.25 and 0.75 pls comment and review the code
high_threshold = target[0.75]

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_threshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Step 3: Prepare Features and Target
features = ['is_holiday', 'temperature', 'weekday', 'hour', 'month_day', 'year', 'month', 'weather_type', 'weather_description']

X = data[features]
y = data['traffic_category']



# Scaling Features
x_scaler = MinMaxScaler()
X_scaled = pd.DataFrame(x_scaler.fit_transform(X), columns=features)

y_scaler = MinMaxScaler()
y = y_scaler.fit_transform(y).flatten()
warnings.filterwarnings('ignore')

label_encoder = LabelEncoder() # Need review between the minmax scaler and label encoder which option much better
y = label_encoder.fit_transform(y)

# Step 4: Split the data into Train and Test Sets   //Further research these area because why x scaled to train slpit en left out y value for the scale one also .. pls i need more insight on these
# or should we use the X,Y OR USE USE ALREADY FIT TRNASFORM TO TRAIN TEST SPLIT OUR MODELS
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(features)

# Step 5: Initialize Models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'XGBoost Regressor': XGBRegressor(random_state=42)
}

# Step 6: Hyperparameter Tuning with Grid Search
def tune_hyperparameters(model, params, X_train, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Hyperparameter tuning for RandomForest
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}
best_rf = tune_hyperparameters(RandomForestRegressor(random_state=42), rf_params, X_train, y_train)

# Hyperparameter tuning for Gradient Boosting
gb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
best_gb = tune_hyperparameters(GradientBoostingRegressor(random_state=42), gb_params, X_train, y_train)

# Hyperparameter tuning for XGBoost
xgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}
best_xgb = tune_hyperparameters(XGBRegressor(random_state=42), xgb_params, X_train, y_train)

# Step 7: Model Stacking
stacking_model = StackingRegressor(
    estimators=[('rf', best_rf), ('gb', best_gb), ('xgb', best_xgb)],
    final_estimator=LinearRegression()
)

# Step 8: Model Evaluation
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Cross-validation score
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_score = cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2').mean()

    return mae, mse, r2, cv_score

# Evaluate all models
model_results = {}
for model_name, model in {'Best Random Forest': best_rf, 'Best Gradient Boosting': best_gb, 'Best XGBoost': best_xgb, 'Stacked Model': stacking_model}.items():
    mae, mse, r2, cv_score = evaluate_model(model, X_train, y_train, X_test, y_test)
    model_results[model_name] = {'MAE': mae, 'MSE': mse, 'R²': r2, 'CV Score': cv_score}
    print(f'\n{model_name}:')
    print(f'MAE: {mae}')
    print(f'MSE: {mse}')
    print(f'R²: {r2}')
    print(f'Cross-Validation Score: {cv_score}')

# Step 9: Select the Best Model
best_model_name = max(model_results, key=lambda x: model_results[x]['R²'])
best_model = {'Best Random Forest': best_rf, 'Best Gradient Boosting': best_gb, 'Best XGBoost': best_xgb, 'Stacked Model': stacking_model}[best_model_name]

print(f"\nBest model is: {best_model_name} with R²={model_results[best_model_name]['R²']}")

# Step 10: Save the Best Model
joblib.dump(best_model, f'best_traffic_model_{best_model_name}.pkl')
print(f"\nBest model {best_model_name} saved as 'best_traffic_model_{best_model_name}.pkl'")

# Step 11: Production-Ready Pipeline Summary
print("\n--- Final Model Summary ---")
for model_name, metrics in model_results.items():
    print(f"\nModel: {model_name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")






import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, auc
from imblearn.over_sampling import SMOTE
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import label_binarize

warnings.filterwarnings('ignore')

# Load and preprocess data
data = pd.read_csv('data/Train.csv')
data = data.sort_values(by=['date_time'], ascending=True).reset_index(drop=True)

# Feature Engineering
last_n_hours = [1, 2, 3, 4, 5, 6]
for n in last_n_hours:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['month_day'] = data['date_time'].dt.day
data['day'] = data['date_time'].dt.day + 1
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year


# Define traffic categories
percentiles = data['traffic_volume'].quantile([0.25, 0.75]).to_dict()
low_threshold = percentiles[0.25]
high_threshold = percentiles[0.75]

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_threshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Prepare dataset
features = ['is_holiday', 'temperature', 'day', 'hour', 'month_day', 'year', 'month']
X = data[features]
y = data['traffic_category']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Scale features
x_scaler = MinMaxScaler()
X_scaled = pd.DataFrame(x_scaler.fit_transform(X), columns=features)  # Ensure feature names are retained

# Handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Define model
model = RandomForestClassifier(random_state=1, class_weight='balanced')

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Perform cross-validation
cv_scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='accuracy')
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Average Cross-Validation Accuracy: {cv_scores.mean()}')

# Define and train the model with GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_resampled, y_resampled)

best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_resampled)
print(classification_report(y_resampled, y_pred, target_names=label_encoder.classes_))

# Add this to inspect prediction probabilities
prediction_probs = best_model.predict_proba(X_resampled)
print(f'Prediction Probabilities: {prediction_probs[:5]}')  # Displaying first 5 for brevity

# Binarize the output labels for multiclass ROC
y_bin = label_binarize(y_resampled, classes=label_encoder.classes_)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(y_bin.shape[1]):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], prediction_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute macro-average ROC curve and ROC area
y_bin_ravel = y_bin.ravel()
prediction_probs_ravel = prediction_probs.ravel()
if np.sum(y_bin_ravel) > 0:  # Avoid division by zero or empty array issues
    fpr["macro"], tpr["macro"], _ = roc_curve(y_bin_ravel, prediction_probs_ravel)
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
else:
    fpr["macro"], tpr["macro"], roc_auc["macro"] = [], [], np.nan

# Print ROC Curve Values
print('ROC Curve Values:')
for i in range(y_bin.shape[1]):
    if np.any(~np.isnan(fpr[i])) and np.any(~np.isnan(tpr[i])):
        print(f'Class {label_encoder.classes_[i]} - FPR: {fpr[i]}')
        print(f'Class {label_encoder.classes_[i]} - TPR: {tpr[i]}')
        print(f'Class {label_encoder.classes_[i]} - AUC: {roc_auc[i]}')
    else:
        print(f'Class {label_encoder.classes_[i]} - No valid ROC data')

print(f'Macro-average ROC AUC: {roc_auc["macro"]}')

# Plot ROC Curves
plt.figure(figsize=(10, 8))
colors = ['blue', 'green', 'red']
for i, color in zip(range(y_bin.shape[1]), colors):
    if np.any(~np.isnan(fpr[i])) and np.any(~np.isnan(tpr[i])):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'ROC curve of class {label_encoder.classes_[i]} (area = {roc_auc[i]:.2f})')
plt.plot(fpr["macro"], tpr["macro"], color='gray', linestyle='--',
         label=f'Macro-average ROC curve (area = {roc_auc["macro"]:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Compute and display confusion matrix
conf_matrix = confusion_matrix(y_resampled, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Compute and display precision-recall curves
plt.figure(figsize=(10, 8))
for i in range(y_bin.shape[1]):
    precision, recall, _ = precision_recall_curve(y_bin[:, i], prediction_probs[:, i])
    plt.plot(recall, precision, label=f'Precision-Recall curve of class {label_encoder.classes_[i]}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='best')
plt.show()

# Save the model and scalers
joblib.dump(best_model, 'classifier.pkl')
joblib.dump(x_scaler, 'scale.pkl')
joblib.dump(label_encoder, 'coders.pkl')
print("Model and scalers saved.")



import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, roc_curve, auc
from imblearn.over_sampling import SMOTE
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import label_binarize

warnings.filterwarnings('ignore')

# Load and preprocess data
data = pd.read_csv('data/Train.csv')
data = data.sort_values(by=['date_time'], ascending=True).reset_index(drop=True)

# Feature Engineering
last_n_hours = [1, 2, 3, 4, 5, 6]
for n in last_n_hours:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['month_day'] = data['date_time'].dt.day
data['day'] = data['date_time'].dt.day + 1  # why adding plus one value
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year

# Define traffic categories
percentiles = data['traffic_volume'].quantile([0.25, 0.75]).to_dict()
low_threshold = percentiles[0.25]
high_threshold = percentiles[0.75]

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_threshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Prepare dataset
features = ['is_holiday', 'temperature', 'day', 'hour', 'month_day', 'year', 'month']
X = data[features]
y = data['traffic_category']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Scale features
x_scaler = MinMaxScaler()
X_scaled = x_scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features)  # Ensure feature names are consistent

# Handle class imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Define model
model = RandomForestClassifier(random_state=1, class_weight='balanced')

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Perform cross-validation
cv_scores = cross_val_score(model, X_resampled, y_resampled, cv=cv, scoring='accuracy')
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Average Cross-Validation Accuracy: {cv_scores.mean()}')

# Define and train the model with GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X_resampled, y_resampled)

best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_resampled)
print(classification_report(y_resampled, y_pred, target_names=label_encoder.classes_))

# Add this to inspect prediction probabilities
prediction_probs = best_model.predict_proba(X_resampled)
print(f'Prediction Probabilities: {prediction_probs}')

# Binarize the output labels for multiclass ROC
y_bin = label_binarize(y_resampled, classes=label_encoder.classes_)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(y_bin.shape[1]):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], prediction_probs[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute macro-average ROC curve and ROC area
y_bin_ravel = y_bin.ravel()
prediction_probs_ravel = prediction_probs.ravel()
if np.sum(y_bin_ravel) > 0:  # Avoid division by zero or empty array issues
    fpr["macro"], tpr["macro"], _ = roc_curve(y_bin_ravel, prediction_probs_ravel)
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
else:
    fpr["macro"], tpr["macro"], roc_auc["macro"] = [], [], np.nan

# Print ROC Curve Values
print('ROC Curve Values:')
for i in range(y_bin.shape[1]):
    if np.any(~np.isnan(fpr[i])) and np.any(~np.isnan(tpr[i])):
        print(f'Class {label_encoder.classes_[i]} - FPR: {fpr[i]}')
        print(f'Class {label_encoder.classes_[i]} - TPR: {tpr[i]}')
        print(f'Class {label_encoder.classes_[i]} - AUC: {roc_auc[i]}')
    else:
        print(f'Class {label_encoder.classes_[i]} - No valid ROC data')

print(f'Macro-average ROC AUC: {roc_auc["macro"]}')

# Plot ROC Curves
plt.figure(figsize=(10, 8))
colors = ['blue', 'green', 'red']
for i, color in zip(range(y_bin.shape[1]), colors):
    if np.any(~np.isnan(fpr[i])) and np.any(~np.isnan(tpr[i])):
        plt.plot(fpr[i], tpr[i], color=color, lw=2,
                 label=f'ROC curve of class {label_encoder.classes_[i]} (area = {roc_auc[i]:.2f})')
plt.plot(fpr["macro"], tpr["macro"], color='gray', linestyle='--',
         label=f'Macro-average ROC curve (area = {roc_auc["macro"]:.2f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# Save the model and scalers
joblib.dump(best_model, 'classifier.pkl')
joblib.dump(x_scaler, 'scale.pkl')
joblib.dump(label_encoder, 'coders.pkl')
print("Model and scalers saved.")



import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# Load and preprocess data
data = pd.read_csv('data/Train.csv')

# EDA
print(data.head())
print(data.describe())
print(data.info())
print(data['traffic_volume'].value_counts())

# Feature Engineering
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['day'] = data['date_time'].dt.day
data['month_day'] = data['date_time'].dt.day
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)

# Adding lag features for traffic volume
for n in [1, 2, 3, 4, 5, 6]:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)

# Define traffic categories
percentiles = data['traffic_volume'].quantile([0.25, 0.75]).to_dict()
low_threshold = percentiles[0.25]
high_threshold = percentiles[0.75]

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_treshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Prepare dataset
features = ['is_holiday', 'temperature', 'day', 'hour', 'month_day', 'year', 'month']
X = data[features]
y = data['traffic_category']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Scale features
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the model with GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
model = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=label_encoder.transform(['No Traffic', 'Low Traffic', 'Medium Traffic', 'High Traffic']))
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Save the model and scalers
#joblib.dump(best_model, 'traffics_classifier_model.pkl')
#joblib.dump(x_scaler, 'x_scalers.pkl')
#joblib.dump(label_encoder, 'label_encoders.pkl')
#print("Model and scalers saved.")


import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# Feature Engineering
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['day'] = data['date_time'].dt.day
data['month_day'] = data['date_time'].dt.day
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)

# Adding lag features for traffic volume
for n in [1, 2, 3, 4, 5, 6]:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)

#Define traffic categories
percentiles = data['traffic_volume'].quantile([0.25, 0.75]).to_dict()
low_threshold = percentiles[0.25]
high_threshold = percentiles[0.75]

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_threshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Prepare dataset
features = ['is_holiday', 'temperature', 'day', 'hour', 'month_day', 'year', 'month']
X = data[features]
y = data['traffic_category']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Scale features
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the model with GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

model = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

warnings.filterwarnings('ignore')

# Load and preprocess data
data = pd.read_csv('data/Train.csv')
data = data.sort_values(by=['date_time'], ascending=True).reset_index(drop=True)

# Feature Engineering
last_n_hours = [1, 2, 3, 4, 5, 6]
for n in last_n_hours:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['month_day'] = data['date_time'].dt.day
data['day'] = data['date_time'].dt.day + 1
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year

# Define traffic categories
percentiles = data['traffic_volume'].quantile([0.25, 0.75]).to_dict()
low_threshold = percentiles[0.25]
high_threshold = percentiles[0.75]

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_threshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Prepare dataset
features = ['is_holiday', 'temperature', 'day', 'hour', 'month_day', 'year', 'month']
X = data[features]
y = data['traffic_category']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Scale features
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X)

# Handle class imbalance
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Define model
model = RandomForestClassifier(random_state=1, class_weight='balanced')

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Average Cross-Validation Accuracy: {cv_scores.mean()}')

# Define and train the model with GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X, y)

best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X)
print(classification_report(y, y_pred, target_names=label_encoder.classes_))

# Add this to inspect prediction probabilities
prediction_probs = best_model.predict_proba(X)
print(f'Prediction Probabilities: {prediction_probs}')

# Check the distribution after applying SMOTE
unique, counts = np.unique(y, return_counts=True)
print(f'Class distribution after SMOTE: {dict(zip(unique, counts))}')

# Confusion Matrix
cm = confusion_matrix(y, y_pred)
print(f'Confusion Matrix:\n{cm}')

# Print confusion matrix as a DataFrame for better readability
cm_df = pd.DataFrame(cm, index=label_encoder.classes_, columns=label_encoder.classes_)
print(f'Confusion Matrix DataFrame:\n{cm_df}')

# Plotting the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Save the model and scalers
joblib.dump(best_model, 'classifier_model.pkl')
joblib.dump(x_scaler, 'scalers.pkl')
joblib.dump(label_encoder, 'encoders.pkl')
print("Model and scalers saved.")

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# Load and preprocess data
data = pd.read_csv('data/Train.csv')

# EDA
print(data.head())
print(data.describe())
print(data.info())
print(data['traffic_volume'].value_counts())

# Feature Engineering
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['day'] = data['date_time'].dt.day
data['month_day'] = data['date_time'].dt.day
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)

# Adding lag features for traffic volume
for n in [1, 2, 3, 4, 5, 6]:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)

# Define traffic categories
percentiles = data['traffic_volume'].quantile([0.25, 0.75]).to_dict()
low_threshold = percentiles[0.25]
high_threshold = percentiles[0.75]

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_threshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Prepare dataset
features = ['is_holiday', 'temperature', 'day', 'hour', 'month_day', 'year', 'month']
X = data[features]
y = data['traffic_category']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Scale features
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train the model with GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

model = RandomForestClassifier(random_state=1)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_test)

# Print classification report with correct target names
target_names = label_encoder.classes_
print(classification_report(y_test, y_pred, target_names=target_names))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=label_encoder.transform(target_names))
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Save the model and scalers
#joblib.dump(best_model, 'traffics_classifier_model.pkl')
#joblib.dump(x_scaler, 'x_scalers.pkl')
#joblib.dump(label_encoder, 'label_encoders.pkl')
#rint("Model and scalers saved.")

import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, make_scorer, confusion_matrix
from imblearn.over_sampling import SMOTE
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

# Load and preprocess data
data = pd.read_csv('data/Train.csv')
data = data.sort_values(by=['date_time'], ascending=True).reset_index(drop=True)

# Feature Engineering
last_n_hours = [1, 2, 3, 4, 5, 6]
for n in last_n_hours:
    data[f'last_{n}_hour_traffic'] = data['traffic_volume'].shift(n)
data = data.dropna().reset_index(drop=True)
data['is_holiday'] = data['is_holiday'].apply(lambda x: 1 if x != 'None' else 0)
data['date_time'] = pd.to_datetime(data['date_time'])
data['hour'] = data['date_time'].dt.hour
data['month_day'] = data['date_time'].dt.day
data['day'] = data['date_time'].dt.day + 1
data['month'] = data['date_time'].dt.month
data['year'] = data['date_time'].dt.year

# Define traffic categories
percentiles = data['traffic_volume'].quantile([0.25, 0.75]).to_dict()
low_threshold = percentiles[0.25]
high_threshold = percentiles[0.75]

def categorize_traffic(volume):
    if volume == 0:
        return 'No Traffic'
    elif volume <= low_threshold:
        return 'Low Traffic'
    elif volume <= high_threshold:
        return 'Medium Traffic'
    else:
        return 'High Traffic'

data['traffic_category'] = data['traffic_volume'].apply(categorize_traffic)

# Prepare dataset
features = ['is_holiday', 'temperature', 'day', 'hour', 'month_day', 'year', 'month']
X = data[features]
y = data['traffic_category']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Scale features
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X)

# Handle class imbalance
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

# Define model
model = RandomForestClassifier(random_state=1, class_weight='balanced')

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Average Cross-Validation Accuracy: {cv_scores.mean()}')

# Define and train the model with GridSearchCV for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy')
grid_search.fit(X, y)

best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X)
print(classification_report(y, y_pred, target_names=label_encoder.classes_))

# Confusion Matrix
cm = confusion_matrix(y, y_pred)
sns.heatmap(cm, annot=True, fmt='d', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
