# 📊 EDA לתחזית נציגים

## טעינת ספריות בסיס

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten , Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from itertools import product
from IPython.display import clear_output
import joblib
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
from sklearn.model_selection import TimeSeriesSplit

## קריאת קבצי CSV

In [2]:
cc_df = pd.read_csv("CC_2020-2025_New.csv")
holidays_df = pd.read_csv("Holidays_New.csv")

##  סקירה ראשונית

In [3]:
print("--- CC Data Info ---")
cc_df.info()
print("\n--- Holidays Data Info ---")
holidays_df.info()

--- CC Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34088 entries, 0 to 34087
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   QueueStartDate        34088 non-null  object
 1   QueueStartDateNumber  34088 non-null  int64 
 2   QueueStartDateName    34088 non-null  object
 3   HourInterval          34088 non-null  object
 4   HalfHourInterval      34088 non-null  object
 5   TotalCallsOffered     34088 non-null  int64 
 6   TotalCallsAnswered    34088 non-null  int64 
 7   TotalCallsAbandoned   34088 non-null  int64 
 8   TotalCB               34088 non-null  int64 
 9   TotalTransfered       34088 non-null  int64 
 10  TotalWaitDuration     34088 non-null  int64 
 11  TotalAgents           34088 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 3.1+ MB

--- Holidays Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 

## שינוי שמות עמודות (לשמות באנגלית אחידים)


In [4]:
cc_df.columns = [col.strip().replace(" ", "_").replace("-", "_") for col in cc_df.columns]
holidays_df.columns = [col.strip().replace(" ", "_").replace("-", "_") for col in holidays_df.columns]

## המרת עמודת תאריך לפורמט datetime

In [5]:
cc_df['QueueStartDate'] = pd.to_datetime(cc_df['QueueStartDate'], dayfirst=True, errors='coerce')
holidays_df['Date'] = pd.to_datetime(holidays_df['Date'], dayfirst=True, errors='coerce')

## יצירת עמודות חדשות

In [6]:
cc_df['Weekday'] = cc_df['QueueStartDate'].dt.day_name()
cc_df['IsWeekend'] = cc_df['Weekday'].isin(['Friday', 'Saturday'])
cc_df['AnsweredRatio'] = (cc_df['TotalCallsAnswered'] / cc_df['TotalAgents'])
cc_df['AnsweredRatio'] = cc_df['AnsweredRatio'].replace([np.inf, -np.inf], np.nan).fillna(0).round().astype(int)

## מיזוג מול טבלת חגים

In [7]:
cc_df = cc_df.merge(holidays_df[['Date', 'HolidayName', 'IsHoliday','IsHolidayEve']],
                    left_on='QueueStartDate', right_on='Date', how='left')

# ✅ יצירת פיצ'רים בינאריים
cc_df['IsHoliday'] = cc_df['IsHoliday'].fillna(False)
cc_df['IsHolidayEve'] = cc_df['IsHolidayEve'].fillna(False)

# ✅ הסרת עמודות מיותרות
cc_df.drop(columns=['Date', 'HolidayName'], inplace=True)

In [8]:
cc_df

Unnamed: 0,QueueStartDate,QueueStartDateNumber,QueueStartDateName,HourInterval,HalfHourInterval,TotalCallsOffered,TotalCallsAnswered,TotalCallsAbandoned,TotalCB,TotalTransfered,TotalWaitDuration,TotalAgents,Weekday,IsWeekend,AnsweredRatio,IsHoliday,IsHolidayEve
0,2020-01-01,4,Wednesday,07:00 - 08:00,07:30 - 08:00,27,25,1,0,0,755,8,Wednesday,False,3,False,False
1,2020-01-01,4,Wednesday,08:00 - 09:00,08:00 - 08:30,351,234,96,21,0,124562,35,Wednesday,False,7,False,False
2,2020-01-01,4,Wednesday,08:00 - 09:00,08:30 - 09:00,388,241,86,61,0,130078,42,Wednesday,False,6,False,False
3,2020-01-01,4,Wednesday,09:00 - 10:00,09:00 - 09:30,446,245,118,82,0,179730,48,Wednesday,False,5,False,False
4,2020-01-01,4,Wednesday,09:00 - 10:00,09:30 - 10:00,448,259,97,90,0,188894,50,Wednesday,False,5,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34083,2025-06-30,2,Monday,16:00 - 17:00,16:30 - 17:00,211,149,62,0,0,29279,22,Monday,False,7,False,False
34084,2025-06-30,2,Monday,17:00 - 18:00,17:00 - 17:30,26,23,3,0,0,5970,6,Monday,False,4,False,False
34085,2025-06-30,2,Monday,17:00 - 18:00,17:30 - 18:00,26,22,4,0,0,4299,5,Monday,False,4,False,False
34086,2025-06-30,2,Monday,18:00 - 19:00,18:00 - 18:30,8,7,1,0,0,2071,3,Monday,False,2,False,False


##  טיפול בערכים חסרים


In [9]:
print("\nMissing values per column:")
print(cc_df.isna().sum())



Missing values per column:
QueueStartDate          0
QueueStartDateNumber    0
QueueStartDateName      0
HourInterval            0
HalfHourInterval        0
TotalCallsOffered       0
TotalCallsAnswered      0
TotalCallsAbandoned     0
TotalCB                 0
TotalTransfered         0
TotalWaitDuration       0
TotalAgents             0
Weekday                 0
IsWeekend               0
AnsweredRatio           0
IsHoliday               0
IsHolidayEve            0
dtype: int64


## הסרת כפילויות


In [10]:
original_len = len(cc_df)
cc_df.drop_duplicates(subset=['QueueStartDate', 'HalfHourInterval'], inplace=True)
print(f"🧹 נמחקו {original_len - len(cc_df)} כפילויות לפי QueueStartDate ו-HalfHourInterval")

🧹 נמחקו 0 כפילויות לפי QueueStartDate ו-HalfHourInterval


## תיאור סטטיסטי בסיסי

In [11]:
print("\nDescriptive stats:")
print(cc_df.describe())


Descriptive stats:
                      QueueStartDate  QueueStartDateNumber  TotalCallsOffered  \
count                          34088          34088.000000       34088.000000   
mean   2022-10-09 00:48:50.016428032              3.258067         426.620952   
min              2020-01-01 00:00:00              1.000000           1.000000   
25%              2021-05-30 00:00:00              2.000000         170.000000   
50%              2022-10-13 00:00:00              3.000000         443.000000   
75%              2024-02-23 00:00:00              5.000000         633.000000   
max              2025-06-30 00:00:00              7.000000        1684.000000   
std                              NaN              1.598184         299.337418   

       TotalCallsAnswered  TotalCallsAbandoned       TotalCB  TotalTransfered  \
count        34088.000000         34088.000000  34088.000000     34088.000000   
mean           266.687163            99.659763     58.562632         1.557381   
min    

## גרפים בסיסיים

In [12]:
fig1 = px.histogram(cc_df, x='Weekday', category_orders={'Weekday': ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']}, title='<b>מספר רשומות לפי יום בשבוע</b>')
fig1.update_layout(title={'x':0.5})
fig1.show()

fig2 = px.box(cc_df, x='HourInterval', y='TotalAgents', title='<b>התפלגות מספר נציגים לפי אינטרוול שעתי</b>')
fig2.update_layout(title={'x':0.5})
fig2.show()


## המרת אינטרוול לפורמט מספרי: '10:00 - 11:00' -> 11


In [13]:
# def parse_interval(interval_str):
#     start_time = interval_str.split(" - ")[0]
#     hour, minute = map(int, start_time.split(":"))
#     return hour + (0.5 if minute == 30 else 0.0)
def convert_hourinterval_to_int(hour_interval: str):
    """
    ממיר אינטרוול טקסטואלי בסגנון '07:00 - 08:00' למספר שלם:
    - לוקח את השעה הסופית (08:00 → 8)
    - במעבר חצות (00:00) מחזיר 24
    """
    try:
        end_time = hour_interval.split('-')[1].strip()
        hour = int(end_time.split(':')[0])
        return 24 if hour == 0 else hour
    except:
        return np.nan

In [14]:
cc_df['Interval'] = cc_df['HourInterval'].apply(convert_hourinterval_to_int)
cc_df['Interval'] = cc_df['Interval'].astype('Int64')

In [15]:
cc_df['HourInterval'].unique()

array(['07:00 - 08:00', '08:00 - 09:00', '09:00 - 10:00', '10:00 - 11:00',
       '11:00 - 12:00', '12:00 - 13:00', '13:00 - 14:00', '14:00 - 15:00',
       '15:00 - 16:00', '16:00 - 17:00', '17:00 - 18:00', '18:00 - 19:00',
       '19:00 - 20:00', '06:00 - 07:00', '22:00 - 23:00', '23:00 - 00:00',
       '20:00 - 21:00', '21:00 - 22:00', '00:00 - 01:00', '01:00 - 02:00',
       '04:00 - 05:00', '05:00 - 06:00'], dtype=object)

In [16]:
cc_df['Interval'].unique()

<IntegerArray>
[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 7, 23, 24, 21, 22, 1, 2, 5,
 6]
Length: 22, dtype: Int64

In [17]:
answer_ratio_lookup = (
    cc_df
    .groupby(['Weekday', 'Interval', 'IsHoliday', 'IsHolidayEve'])['AnsweredRatio']
    .mean()
    .round()                                # ✅ עיגול למספר השלם הקרוב
    .astype(int)                            # ✅ המרה למספר שלם
    .reset_index()
)
answer_ratio_lookup.to_csv("/content/answer_ratio_lookup.csv", index=False)
answer_ratio_lookup[
        (answer_ratio_lookup['Weekday'] == 'Monday') &
        (answer_ratio_lookup['Interval'] == 10) &
        (answer_ratio_lookup['IsHoliday'] == False) &
        (answer_ratio_lookup['IsHolidayEve'] == False)
    ]

Unnamed: 0,Weekday,Interval,IsHoliday,IsHolidayEve,AnsweredRatio
31,Monday,10,False,False,6


In [18]:
df = cc_df[['QueueStartDate','Weekday', 'Interval', 'IsHoliday', 'IsHolidayEve', 'AnsweredRatio', 'TotalAgents']]


In [19]:
df.head()

Unnamed: 0,QueueStartDate,Weekday,Interval,IsHoliday,IsHolidayEve,AnsweredRatio,TotalAgents
0,2020-01-01,Wednesday,8,False,False,3,8
1,2020-01-01,Wednesday,9,False,False,7,35
2,2020-01-01,Wednesday,9,False,False,6,42
3,2020-01-01,Wednesday,10,False,False,5,48
4,2020-01-01,Wednesday,10,False,False,5,50


In [20]:
df['TotalAgents'] = df['TotalAgents'].round().astype(int)

# 🔹 כמות נציגים לפי אינטרוול (Box Plot)
fig1 = px.box(
    df,
    x='Interval',
    y='TotalAgents',
    points='all',
    title="<b>כמות נציגים לפי אינטרוול</b>"
)
fig1.show()

# 🔹 כמות נציגים לפי יום בשבוע (Bar Plot)
order_days = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']

mean_agents_by_day = (
    df.groupby('Weekday', as_index=False)['TotalAgents']
      .mean()
      .round()                 # ✅ עיגול למספר שלם
      .astype({'TotalAgents': int})
)

fig2 = px.bar(
    mean_agents_by_day.sort_values(by='Weekday'),
    x='Weekday',
    y='TotalAgents',
    title="<b>ממוצע כמות נציגים לפי יום בשבוע</b>",
    text_auto=True,
    category_orders={'Weekday': order_days}
)
fig2.update_traces(texttemplate='%{y}', textposition='outside')  # ✅ הצגת מספר שלם
fig2.show()

# 🔹 טרנד יומי – כמות נציגים לאורך זמן (Line Plot)
daily_agents = (
    df.groupby('QueueStartDate', as_index=False)['TotalAgents']
      .mean()
      .round()
      .astype({'TotalAgents': int})
)

fig3 = px.line(
    daily_agents,
    x='QueueStartDate',
    y='TotalAgents',
    title="<b>מגמת כמות נציגים לאורך זמן</b>"
)
fig3.show()


In [21]:
df.to_csv("final_dataset.csv", index=False)
print("\n✅ קובץ final_dataset.csv נשמר בהצלחה!")
print(df.head())


✅ קובץ final_dataset.csv נשמר בהצלחה!
  QueueStartDate    Weekday  Interval IsHoliday IsHolidayEve  AnsweredRatio  \
0     2020-01-01  Wednesday         8     False        False              3   
1     2020-01-01  Wednesday         9     False        False              7   
2     2020-01-01  Wednesday         9     False        False              6   
3     2020-01-01  Wednesday        10     False        False              5   
4     2020-01-01  Wednesday        10     False        False              5   

   TotalAgents  
0            8  
1           35  
2           42  
3           48  
4           50  


# בחירת מודל אופטימלי לחיזוי

## בחירת משתנים לחיזוי


In [22]:
X = df[['QueueStartDate','Weekday', 'Interval', 'IsHoliday', 'IsHolidayEve']]
y = df['TotalAgents']

## קידוד וטרנספורמציה

In [23]:
# הגדרת cross-validation עם סדר כרונולוגי
tscv = TimeSeriesSplit(n_splits=5)

In [24]:
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore'), ['Weekday']),
    ("num", "passthrough", ['Interval', 'IsHoliday', 'IsHolidayEve'])
])

##  חלוקה ל-Train/Validation/Test

In [25]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2


In [26]:
def evaluate_model(name, y_true, y_pred):
    return {
        "Model": name,
        "MAE": mean_absolute_error(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "R2": r2_score(y_true, y_pred)
    }

## יצירת מודלים להשוואה

In [27]:
# print("\n=== Training Random Forest ===")
# preprocessor_rf = ColumnTransformer([
#     ("cat", OneHotEncoder(handle_unknown='ignore'), ['Weekday']),
#     ("num", "passthrough", ['Interval', 'IsHoliday', 'IsHolidayEve'])
# ])
# rf_model = Pipeline([
#     ('preprocessor', preprocessor_rf),
#     ('model', RandomForestRegressor(random_state=42))
# ])
# rf_model.fit(X_train, y_train)
# rf_pred = rf_model.predict(X_test)
# rf_results = evaluate_model("Random Forest", y_test, rf_pred)
# print(rf_results)

In [28]:
# print("\n=== Training XGBoost ===")
# xgb_model = Pipeline([
#     ('preprocessor', preprocessor_rf),
#     ('model', XGBRegressor(random_state=42))
# ])
# xgb_model.fit(X_train, y_train)
# xgb_pred = xgb_model.predict(X_test)
# xgb_results = evaluate_model("XGBoost", y_test, xgb_pred)
# print(xgb_results)


In [29]:
# print("\n=== Training Neural Network ===")
# pre_nn = ColumnTransformer([
#     ("cat", OneHotEncoder(handle_unknown='ignore'), ['Weekday']),
#     ("num", StandardScaler(), ['Interval', 'IsHoliday', 'IsHolidayEve'])
# ])
# X_train_nn = pre_nn.fit_transform(X_train)
# X_test_nn = pre_nn.transform(X_test)

# # ארכיטקטורה משופרת
# nn_model = Sequential([
#     Dense(128, activation='relu', input_shape=(X_train_nn.shape[1],)),
#     BatchNormalization(),
#     Dropout(0.3),
#     Dense(64, activation='relu'),
#     BatchNormalization(),
#     Dropout(0.2),
#     Dense(32, activation='relu'),
#     Dense(1)
# ])

# nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')

# early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# nn_model.fit(
#     X_train_nn, y_train,
#     epochs=100, batch_size=64, verbose=0,
#     callbacks=[early_stop]
# )

# nn_pred = nn_model.predict(X_test_nn).flatten()
# nn_results = evaluate_model("Neural Network", y_test, nn_pred)
# print(nn_results)


In [30]:
# print("\n=== Training CNN (Temporal) ===")
# n_lags = 12

# X_seq, y_seq = [], []
# for i in range(n_lags, len(X_full)):
#     X_seq.append(X_full[i-n_lags:i])
#     y_seq.append(y_full[i])

# X_seq, y_seq = np.array(X_seq), np.array(y_seq)
# train_size = int(len(X_seq) * 0.8)
# X_train_cnn, X_test_cnn = X_seq[:train_size], X_seq[train_size:]
# y_train_cnn, y_test_cnn = y_seq[:train_size], y_seq[train_size:]

# cnn_model = Sequential([
#     Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], X_train_cnn.shape[2])),
#     BatchNormalization(),
#     Dropout(0.2),
#     Conv1D(32, kernel_size=2, activation='relu'),
#     BatchNormalization(),
#     Flatten(),
#     Dense(64, activation='relu'),
#     Dropout(0.3),
#     Dense(1)
# ])

# cnn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='mse')

# # EarlyStopping למניעת Overfitting
# early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# cnn_model.fit(
#     X_train_cnn, y_train_cnn,
#     epochs=100, batch_size=64, verbose=0,
#     callbacks=[early_stop]
# )

# cnn_pred = cnn_model.predict(X_test_cnn).flatten()
# cnn_results = evaluate_model("CNN", y_test_cnn, cnn_pred)
# print(cnn_results)

In [31]:
# print("\n=== Training Prophet ===")
# prophet_models = {}
# prophet_preds, prophet_true = [], []
# for interval in X_test['Interval'].unique():
#     df_int = df[df['Interval'] == interval][['QueueStartDate', 'TotalAgents']].copy()
#     if df_int.empty:
#         continue
#     df_int = df_int.rename(columns={'QueueStartDate': 'ds', 'TotalAgents': 'y'})
#     if df_int['y'].nunique() < 2:
#         continue
#     train_df = df_int[df_int['ds'] < X_test['QueueStartDate'].max()]
#     future_df = df_int[df_int['ds'].isin(X_test['QueueStartDate'])]
#     if future_df.empty:
#         continue
#     m = Prophet(daily_seasonality=True, weekly_seasonality=True, yearly_seasonality=True)
#     m.fit(train_df)
#     forecast = m.predict(future_df[['ds']])
#     prophet_preds.extend(forecast['yhat'].values)
#     prophet_true.extend(future_df['y'].values)

#     prophet_models[interval] = m

# if len(prophet_preds) > 0:
#     prophet_results = {
#         "Model": "Prophet",
#         "MAE": mean_absolute_error(prophet_true, prophet_preds),
#         "RMSE": np.sqrt(mean_squared_error(prophet_true, prophet_preds)),
#         "R2": r2_score(prophet_true, prophet_preds)
#     }
# else:
#     prophet_results = {"Model": "Prophet", "MAE": np.nan, "RMSE": np.nan, "R2": np.nan}
# print(prophet_results)

In [51]:
def extract_prophet_config(model):
    return {
        "growth": model.growth,
        "yearly_seasonality": model.yearly_seasonality,
        "weekly_seasonality": model.weekly_seasonality,
        "daily_seasonality": model.daily_seasonality,
        "changepoint_prior_scale": model.changepoint_prior_scale,
        "holidays_prior_scale": model.holidays_prior_scale,
        "seasonality_mode": model.seasonality_mode,
        "seasonality_prior_scale": model.seasonality_prior_scale,
        "n_changepoints": model.n_changepoints,
        "added_seasonalities": list(model.seasonalities.keys()),
        "has_holidays": model.holidays is not None
    }



In [52]:
# שינוי שמות לפורמט Prophet
holidays_df = holidays_df.rename(columns={'Date': 'ds', 'HolidayName': 'holiday'})
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'], errors='coerce')

holidays_df = holidays_df.dropna(subset=['ds', 'holiday'])

# ניתן להוסיף חלון השפעה (ימים לפני/אחרי החג)
holidays_df['lower_window'] = 0
holidays_df['upper_window'] = 0

In [64]:
print("\n=== Training Prophet ===")
prophet_models = {}
prophet_preds_all, prophet_true_all = [], []
results = []

for interval in X_test['Interval'].unique():
    df_int = df[df['Interval'] == interval][['QueueStartDate', 'TotalAgents']].copy()
    if df_int.empty or df_int['TotalAgents'].nunique() < 2:
        continue

    df_int = df_int.rename(columns={'QueueStartDate': 'ds', 'TotalAgents': 'y'})
    train_df = df_int[df_int['ds'] < X_test['QueueStartDate'].max()]
    future_df = df_int[df_int['ds'].isin(X_test['QueueStartDate'])]
    if future_df.empty:
        continue

    # ✅ מודל Prophet עם חגים
    m = Prophet(
        daily_seasonality=True,
        weekly_seasonality=True,
        yearly_seasonality=True,
        seasonality_mode='multiplicative',
        changepoint_prior_scale=0.1,
        holidays_prior_scale=10,
        holidays=holidays_df
    )
    m.fit(train_df)
    forecast = m.predict(future_df[['ds']])

    # ✅ מדדי ביצועים לכל אינטרוול
    mae = mean_absolute_error(future_df['y'], forecast['yhat'])
    rmse = np.sqrt(mean_squared_error(future_df['y'], forecast['yhat']))
    r2 = r2_score(future_df['y'], forecast['yhat'])

    config = extract_prophet_config(m)
    results.append({
        "Interval": interval,
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2,
        # "growth": config["growth"],
        # "yearly_seasonality": config["yearly_seasonality"],
        # "weekly_seasonality": config["weekly_seasonality"],
        # "daily_seasonality": config["daily_seasonality"],
        # "changepoint_prior_scale": config["changepoint_prior_scale"],
        # "holidays_prior_scale": config["holidays_prior_scale"],
        # "seasonality_mode": config["seasonality_mode"],
        "n_changepoints": config["n_changepoints"],
        # "added_seasonalities": config["added_seasonalities"],
        # "has_holidays": config["has_holidays"]
    })


    prophet_preds_all.extend(forecast['yhat'].values)
    prophet_true_all.extend(future_df['y'].values)
    prophet_models[interval] = m



DEBUG:cmdstanpy:input tempfile: /tmp/tmpq1047m6q/11_zcsjb.json



=== Training Prophet ===


DEBUG:cmdstanpy:input tempfile: /tmp/tmpq1047m6q/id2hi6w4.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=70180', 'data', 'file=/tmp/tmpq1047m6q/11_zcsjb.json', 'init=/tmp/tmpq1047m6q/id2hi6w4.json', 'output', 'file=/tmp/tmpq1047m6q/prophet_modelmtjyoo1m/prophet_model-20250716112909.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
11:29:09 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
11:29:10 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmpq1047m6q/c60bntkr.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpq1047m6q/ewuyxdl5.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.b

In [65]:
# ✅ יצירת טבלת תוצאות לכל אינטרוול
results_df = pd.DataFrame(results)
print("\n=== תוצאות Prophet עם חגים לכל אינטרוול ===")
print(results_df.sort_values(by="Interval"))

# ✅ חישוב תוצאות סופיות על כלל האינטרוולים
final_results = {
    "Model": "Prophet + Holidays (All Intervals)",
    "MAE": mean_absolute_error(prophet_true_all, prophet_preds_all),
    "RMSE": np.sqrt(mean_squared_error(prophet_true_all, prophet_preds_all)),
    "R2": r2_score(prophet_true_all, prophet_preds_all)
}

print("\n=== תוצאות סופיות על כלל האינטרוולים ===")
for k, v in final_results.items():
    print(f"{k}: {v}")


=== תוצאות Prophet עם חגים לכל אינטרוול ===
    Interval       MAE       RMSE        R2  growth  n_changepoints
17         1  0.000003   0.000004  1.000000  linear               4
15         7  0.050384   0.158115  0.966216  linear              16
0          8  1.584068   2.335571  0.291025  linear              25
8          9  7.932586  12.570793  0.716511  linear              25
7         10  7.177583  12.392878  0.782905  linear              25
1         11  6.959196  11.696089  0.796159  linear              25
5         12  6.740170  11.514755  0.799865  linear              25
10        13  6.586254  11.045271  0.850570  linear              25
6         14  6.224707  10.241731  0.668308  linear              25
2         15  6.794175  10.229691  0.602452  linear              25
4         16  6.744399  10.414359  0.398048  linear              25
11        17  4.743231   6.609499  0.425446  linear              25
9         18  1.359466   2.075973  0.671546  linear              25
3  

In [35]:
# sarima_models = {}
# sarima_preds, sarima_true = [], []
# for interval in X_test['Interval'].unique():
#     df_int = df[df['Interval'] == interval].sort_values('QueueStartDate')
#     y_int = df_int['TotalAgents'].values
#     if len(y_int) < 20:
#         continue

#     train_len = int(len(y_int) * 0.8)
#     train_y, test_y = y_int[:train_len], y_int[train_len:]
#     if len(test_y) < 3:
#         continue

#     try:
#         # פרמטרים בסיסיים (אפשר לכוונן)
#         model = SARIMAX(train_y, order=(1,1,1), seasonal_order=(1,1,1,7))
#         res = model.fit(disp=False)
#         forecast = res.forecast(steps=len(test_y))
#         sarima_preds.extend(forecast)
#         sarima_true.extend(test_y)

#         sarima_models[interval] = res
#     except Exception as e:
#         print(f"Interval {interval} skipped due to: {e}")
#         continue

# if len(sarima_preds) > 0:
#     sarima_results = {
#         "Model": "SARIMA",
#         "MAE": mean_absolute_error(sarima_true, sarima_preds),
#         "RMSE": np.sqrt(mean_squared_error(sarima_true, sarima_preds)),
#         "R2": r2_score(sarima_true, sarima_preds)
#     }
# else:
#     sarima_results = {"Model": "SARIMA", "MAE": np.nan, "RMSE": np.nan, "R2": np.nan}

# print(sarima_results)

In [66]:
# results_df = pd.DataFrame([
#     rf_results, xgb_results, nn_results, cnn_results, prophet_results, sarima_results
# ])
# print("\n=== Final Model Comparison ===")
# print(results_df)
results_df = pd.DataFrame([
     prophet_results
])
print("\n=== Final Model Comparison ===")
print(results_df)


=== Final Model Comparison ===
     Model       MAE      RMSE        R2
0  Prophet  5.653825  9.901888  0.902802


In [56]:
best_model_name = results_df.sort_values("RMSE").iloc[0]["Model"]
print(f"\nBest Model: {best_model_name}")


Best Model: Prophet


### גרף השוואה של ביצועי המודלים

In [57]:
fig_perf = px.bar(results_df, x='Model', y='RMSE', text='RMSE', title='<b>השוואת RMSE בין המודלים</b>')
fig_perf.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig_perf.update_layout(title={'x': 0.5}, yaxis_title='RMSE', xaxis_title='Model')
fig_perf.show()


## שמירת המודל הטוב ביותר


In [58]:
import os
import glob
model_files = glob.glob('/content/*.pkl') + glob.glob('/content/*.keras') + glob.glob('/content/*.h5')
for f in model_files:
    try:
        os.remove(f)
        print(f"🗑️ נמחק: {f}")
    except Exception as e:
        print(f"⚠️ שגיאה במחיקת {f}: {e}")

🗑️ נמחק: /content/best_model_prophet.pkl


In [59]:
best_model_row = results_df.sort_values(by='RMSE').iloc[0]
best_model_name = best_model_row['Model']
print(f"\n✅ המודל האופטימלי לפי RMSE הוא: {best_model_name}")

saved_file = None

if best_model_name == "Random Forest":
    saved_file = "best_model_rf.pkl"
    joblib.dump(rf_model, saved_file)
elif best_model_name == "XGBoost":
    saved_file = "best_model_xgb.pkl"
    joblib.dump(xgb_model, saved_file)
elif best_model_name == "Improved Neural Network":
    saved_file = "best_model_nn.h5"
    nn_model.save(saved_file)
elif best_model_name == "Improved CNN":
    saved_file = "best_model_cnn.h5"
    cnn_model.save(saved_file)
elif best_model_name == "Prophet":
    saved_file = "best_model_prophet.pkl"
    joblib.dump(prophet_models, saved_file)
elif best_model_name == "SARIMA":
    saved_file = "best_model_sarima.pkl"
    joblib.dump(sarima_models, saved_file)

if saved_file:
    print(f"📦 Model saved successfully as: {saved_file}")
else:
    print("⚠ No model saved (no best model found)")


✅ המודל האופטימלי לפי RMSE הוא: Prophet
📦 Model saved successfully as: best_model_prophet.pkl


# טעינת המודל מהקובץ

In [60]:
import os
from tensorflow.keras.models import load_model
import joblib

def load_best_model():
    """
    מזהה ומטעין את המודל הטוב ביותר מהקובץ השמור.
    מחזיר: (model, model_type)
    """
    files_priority = [
        "best_model_rf.pkl",
        "best_model_xgb.pkl",
        "best_model_nn.h5",
        "best_model_cnn.h5",
        "best_model_prophet.pkl",
        "best_model_sarima.pkl"
    ]

    for file in files_priority:
        if os.path.exists(file):
            print(f"✅ Loaded model: {file}")
            if file.endswith(".h5"):
                return load_model(file), file
            else:
                return joblib.load(file), file

    print("❌ לא נמצא קובץ מודל שמור.")
    return None, None

# פונקציה לאימון מחדש על דאטה חדש


In [61]:
def retrain_best_model(new_data_path):
    """
    מאמן מחדש את המודל הטוב ביותר על דאטה חדש ושומר אותו מחדש.
    :param new_data_path: path ל־CSV חדש עם אותם עמודות כמו final_dataset.csv
    """
    model, model_file = load_best_model()
    if model is None:
        print("❌ אין מודל לשדרוג.")
        return

    # --- טעינת דאטה חדש ---
    new_df = pd.read_csv(new_data_path)
    new_df['QueueStartDate'] = pd.to_datetime(new_df['QueueStartDate'])
    new_df['Weekday'] = new_df['QueueStartDate'].dt.day_name()
    new_df['IsHoliday'] = new_df['IsHoliday'].fillna(0).astype(int)
    new_df['IsHolidayEve'] = new_df['IsHolidayEve'].fillna(0).astype(int)

    X_new = new_df[['Weekday', 'Interval', 'IsHoliday', 'IsHolidayEve']]
    y_new = new_df['TotalAgents']

    print(f"🔄 Retraining model: {model_file} on {len(new_df)} new rows...")

    # --- אימון מחדש בהתאם לסוג המודל ---
    if "rf" in model_file or "xgb" in model_file:
        model.fit(X_new, y_new)
        joblib.dump(model, model_file)

    elif "nn" in model_file:
        X_new_nn = pre_nn.transform(X_new)
        model.fit(X_new_nn, y_new, epochs=10, batch_size=32, verbose=1)
        model.save(model_file)

    elif "cnn" in model_file:
        # יצירת רצפים חדשים עבור CNN
        new_df_sorted = new_df.sort_values(['QueueStartDate', 'Interval']).reset_index(drop=True)
        X_full_new = pre_nn.transform(new_df_sorted[['Weekday', 'Interval', 'IsHoliday', 'IsHolidayEve']])
        y_full_new = new_df_sorted['TotalAgents'].values

        X_seq_new, y_seq_new = [], []
        for i in range(12, len(X_full_new)):
            X_seq_new.append(X_full_new[i-12:i])
            y_seq_new.append(y_full_new[i])

        X_seq_new, y_seq_new = np.array(X_seq_new), np.array(y_seq_new)
        model.fit(X_seq_new, y_seq_new, epochs=10, batch_size=32, verbose=1)
        model.save(model_file)

    elif "prophet" in model_file:
        prophet_models = model
        for interval in X_new['Interval'].unique():
            df_int = new_df[new_df['Interval'] == interval][['QueueStartDate', 'TotalAgents']].rename(
                columns={'QueueStartDate': 'ds', 'TotalAgents': 'y'})
            if df_int.empty:
                continue
            m = Prophet(daily_seasonality=True, weekly_seasonality=True, yearly_seasonality=True)
            m.fit(df_int)
            prophet_models[interval] = m
        joblib.dump(prophet_models, model_file)

    elif "sarima" in model_file:
        sarima_models = model
        for interval in X_new['Interval'].unique():
            df_int = new_df[new_df['Interval'] == interval].sort_values('QueueStartDate')
            y_int = df_int['TotalAgents'].values
            if len(y_int) < 15:
                continue
            sarima_models[interval] = SARIMAX(y_int, order=(1,1,1), seasonal_order=(1,1,1,7)).fit(disp=False)
        joblib.dump(sarima_models, model_file)

    print(f"✅ Model retrained and saved successfully as: {model_file}")

# פונקציית חיזוי לפי קלט חדש

In [62]:
import pandas as pd
import numpy as np

def predict_future(date: str):
    """
    מחזירה תחזית של אינטרוולים עתידיים בין 06:00 ל-19:00 בלבד (14 תחזיות),
    בהתבסס על המודל הטוב ביותר שנשמר.
    - טוענת את החגים ישירות מקובץ Holidays_New.csv
    - התחזית מחזירה מספרים שלמים בלבד וללא ערכים שליליים.
    """

    # --- טעינת רשימת החגים מקובץ ---
    holidays = pd.read_csv("Holidays_New.csv")
    holidays['Date'] = pd.to_datetime(holidays['Date'])

    # --- טוענים את המודל הטוב ביותר ---
    model, model_file = load_best_model()
    if model is None:
        print("❌ לא נמצא מודל שמור לחיזוי.")
        return None

    # --- הכנת תאריך וחג/ערב חג ---
    date = pd.to_datetime(date)
    weekday = date.day_name()
    is_holiday = 1 if date in holidays['Date'].values else 0
    is_holiday_eve = 1 if (date - pd.Timedelta(days=1)) in holidays['Date'].values else 0

    # ✅ הגבלת אינטרוולים לשעות 06:00 עד 19:00
    future_intervals = np.arange(7, 21)
    predictions = []

    print(f"✅ Predicting with model: {model_file}")

    # --- חיזוי לפי סוג המודל ---
    if "rf" in model_file or "xgb" in model_file:
        future_df = pd.DataFrame([
            [weekday, i, is_holiday, is_holiday_eve] for i in future_intervals
        ], columns=['Weekday', 'Interval', 'IsHoliday', 'IsHolidayEve'])
        predictions = model.predict(future_df)

    elif "nn" in model_file:
        future_df = pd.DataFrame([
            [weekday, i, is_holiday, is_holiday_eve] for i in future_intervals
        ], columns=['Weekday', 'Interval', 'IsHoliday', 'IsHolidayEve'])
        future_nn = pre_nn.transform(future_df)
        predictions = model.predict(future_nn).flatten()

    elif "cnn" in model_file:
        last_seq = X_full[-12:]  # n_lags = 12
        seqs = np.array([last_seq for _ in range(len(future_intervals))])
        predictions = model.predict(seqs).flatten()

    elif "prophet" in model_file:
        prophet_models = model
        for interval in future_intervals:
            if interval in prophet_models:
                future_dates = pd.DataFrame({'ds': [date]})
                pred = prophet_models[interval].predict(future_dates)['yhat'].values[0]
                predictions.append(pred)
            else:
                predictions.append(0)

    elif "sarima" in model_file:
        sarima_models = model
        for interval in future_intervals:
            if interval in sarima_models:
                pred = sarima_models[interval].forecast(steps=1)[0]
                predictions.append(pred)
            else:
                predictions.append(0)

    # --- עיבוד תוצאה: מספרים שלמים וללא ערכים שליליים ---
    predictions = [max(0, int(round(p))) for p in predictions]

    # --- המרת Interval לשעת יום (HH:MM) ---
    interval_times = [
        (pd.Timestamp("00:00") + pd.Timedelta(hours=(i - 1))).strftime("%H:%M")
        for i in future_intervals
    ]

    # --- יצירת טבלת תוצאה ---
    result_df = pd.DataFrame({
        "Date": [date.date()] * len(future_intervals),
        "Time": interval_times,
        "IsHoliday": [is_holiday] * len(future_intervals),
        "IsHolidayEve": [is_holiday_eve] * len(future_intervals),
        "Predicted_TotalAgents": predictions
    })

    return result_df



In [63]:
# 🧪 דוגמה לשימוש בפונקציית חיזוי
future_date = "2025-07-01"
future_predictions = predict_future(future_date)
print(future_predictions)

✅ Loaded model: best_model_prophet.pkl
✅ Predicting with model: best_model_prophet.pkl
          Date   Time  IsHoliday  IsHolidayEve  Predicted_TotalAgents
0   2025-07-01  06:00          0             0                      1
1   2025-07-01  07:00          0             0                      8
2   2025-07-01  08:00          0             0                     76
3   2025-07-01  09:00          0             0                     87
4   2025-07-01  10:00          0             0                     85
5   2025-07-01  11:00          0             0                     83
6   2025-07-01  12:00          0             0                     81
7   2025-07-01  13:00          0             0                     74
8   2025-07-01  14:00          0             0                     66
9   2025-07-01  15:00          0             0                     46
10  2025-07-01  16:00          0             0                     21
11  2025-07-01  17:00          0             0                      6
12 

# ממשק אינטראקטיבי לחיזוי


In [45]:
import ipywidgets as widgets
from ipywidgets import interact
import plotly.express as px

def interactive_prediction(date):
    """
    מציג תחזית אינטראקטיבית עבור תאריך נתון (06:00–19:00)
    """
    if date is None:
        print("🔹 אנא בחר תאריך כדי להציג תחזית.")
        return

    df_pred = predict_future(date)

    if df_pred is None or df_pred.empty:
        print("❌ לא נמצאו תחזיות לתאריך זה.")
        return

    display(df_pred)

    # --- גרף תחזית ---
    fig = px.bar(
        df_pred,
        x='Time',
        y='Predicted_TotalAgents',
        title=f"<b>תחזית כמות נציגים – {date}</b>",
        text_auto=True
    )
    fig.update_traces(texttemplate='%{y}', textposition='outside')
    fig.show()

date_picker = widgets.DatePicker(
    description='בחר תאריך:',
    disabled=False
)

interact(interactive_prediction, date=date_picker)

interactive(children=(DatePicker(value=None, description='בחר תאריך:'), Output()), _dom_classes=('widget-inter…

# ממשק Gradio לחיזוי


In [None]:
# !pip install --upgrade gradio



In [None]:
# import gradio
# print(gradio.__version__)

5.37.0


In [None]:
import gradio as gr
from gradio.themes.base import Base

from datetime import datetime

def predict_interface(date_str):
    """
    פונקציית חיזוי לשימוש בממשק Gradio
    קלט: תאריך בפורמט dd/mm/yyyy (לדוגמה: 20/07/2025)
    """
    if not date_str:
        return "❌ אנא הזן תאריך בפורמט dd/mm/yyyy", None

    try:
        # ✅ המרה ל-YYYY-MM-DD עבור predict_future
        date_obj = datetime.strptime(date_str, "%d/%m/%Y")
        date_iso = date_obj.strftime("%Y-%m-%d")

        df_pred = predict_future(date_iso)
    except ValueError:
        return "❌ פורמט שגוי. הזן תאריך כ: dd/mm/yyyy", None
    except Exception as e:
        return f"❌ שגיאה: {e}", None

    if df_pred is None or df_pred.empty:
        return "❌ לא נמצאו תחזיות לתאריך זה.", None

    # --- טבלה כטקסט ---
    table_str = df_pred.to_string(index=False)

    # --- גרף תחזית ---
    fig = px.bar(
        df_pred,
        x='Time',
        y='Predicted_TotalAgents',
        title=f"<b>תחזית כמות נציגים – {date_str}</b>",
        text_auto=True
    )
    fig.update_traces(texttemplate='%{y}', textposition='outside')

    return table_str, fig

# === ממשק Gradio ===
with gr.Blocks() as demo:
    gr.Markdown("## 📊 ממשק תחזית כמות נציגים (06:00–19:00)")

    date_input = gr.Textbox(
        label="תאריך (dd/mm/yyyy)",
        placeholder="לדוגמה: 20/07/2025"
    )
    output_text = gr.Textbox(label="תחזית בטבלה", lines=15)
    output_plot = gr.Plot(label="גרף תחזית")

    date_input.change(
        fn=predict_interface,
        inputs=date_input,
        outputs=[output_text, output_plot]
    )

demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://08b2c0c9fa036cbbd7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


