In [1]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error


In [2]:
train=pd.read_csv(r"/train_cleaned.csv")
test=pd.read_csv(r"/test.csv")

In [None]:
train['Weekend'] = train['Weekend'].apply(
    lambda x: 1 if str(x).lower() == 'Weekend' else 0
)
print("✅ weekend column converted to numeric.")


label_cols = ['Week_day', 'day_type']
for col in label_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    print(f"✅ Encoded '{col}' successfully. Classes: {list(le.classes_)}")


✅ weekend column converted to numeric.
✅ Encoded 'Week_day' successfully. Classes: ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
✅ Encoded 'day_type' successfully. Classes: ['Additional', 'Bridge', 'Event', 'Holiday', 'Transfer', 'Work Day']


In [None]:
test['Weekend'] = test['Weekend'].apply(
    lambda x: 1 if str(x).lower() == 'Weekend' else 0
)
print("✅ weekend column converted to numeric.")


label_cols = ['Week_day', 'day_type']
for col in label_cols:
    le = LabelEncoder()
    test[col] = le.fit_transform(test[col])
    print(f"✅ Encoded '{col}' successfully. Classes: {list(le.classes_)}")


✅ weekend column converted to numeric.
✅ Encoded 'Week_day' successfully. Classes: ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
✅ Encoded 'day_type' successfully. Classes: ['Holiday', 'Work Day']


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 17 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   store_nbr    int64  
 2   family       object 
 3   sales        float64
 4   onpromotion  float64
 5   city         object 
 6   state        object 
 7   store_type   object 
 8   cluster      int64  
 9   dcoilwtico   float64
 10  day_type     int64  
 11  Year         int64  
 12  Month        int64  
 13  Day          int64  
 14  Week_day     int64  
 15  Weekend      int64  
 16  quarter      int64  
dtypes: float64(3), int64(10), object(4)
memory usage: 389.2+ MB


In [None]:
train_date = pd.to_datetime(dict(year=train["Year"], month=train["Month"], day=train["Day"]), errors="coerce")


train["quarter"] = train_date.dt.quarter.fillna(0).astype(int)

# رقم الأسبوع في السنة (مع معالجة القيم الفارغة)
train["weekofyear"] = train_date.dt.isocalendar().week.fillna(0).astype(int)

# بداية ونهاية الشهر
train["is_month_start"] = train_date.dt.is_month_start.fillna(False).astype(int)
train["is_month_end"] = train_date.dt.is_month_end.fillna(False).astype(int)

# أول وأخر أسبوع في الشهر
train["is_early_month"] = (train["Day"] <= 7).astype(int)
train["is_late_month"] = (train["Day"] >= 25).astype(int)

# ترتيب البيانات حسب التاريخ الحقيقي
train = train.sort_values(["Year", "Month", "Day"]).reset_index(drop=True)

# Lag features
train["sales_lag_1"] = train["sales"].shift(1)
train["sales_lag_7"] = train["sales"].shift(7)
train["sales_lag_30"] = train["sales"].shift(30)

# تعويض القيم الفارغة
train = train.fillna(0)




In [None]:
# ✅ استخدم test نفسه مش train
test_date = pd.to_datetime(
    dict(year=test["Year"], month=test["Month"], day=test["Day"]),
    errors="coerce"
)

# رقم الأسبوع في السنة
test["weekofyear"] = test_date.dt.isocalendar().week.fillna(0).astype(int)

# بداية ونهاية الشهر
test["is_month_start"] = test_date.dt.is_month_start.fillna(False).astype(int)
test["is_month_end"] = test_date.dt.is_month_end.fillna(False).astype(int)

# أول أسبوع وآخر أسبوع في الشهر
test["is_early_month"] = (test["Day"] <= 7).astype(int)
test["is_late_month"] = (test["Day"] >= 25).astype(int)

# =====================================================
# 🧩 Lag Features مؤقتة (test مفيهوش sales)
# =====================================================
test["sales_lag_1"] = 0
test["sales_lag_7"] = 0
test["sales_lag_30"] = 0

# =====================================================
#  التعامل مع أي قيم مفقودة
# ===============


In [8]:
# معالجة القيم الغلط في الأعمدة الزمنية
for col in ["Year", "Month", "Day"]:
    train[col] = pd.to_numeric(train[col], errors="coerce")  # يحوّل القيم الغلط لـ NaN
    test[col] = pd.to_numeric(test[col], errors="coerce")

# نملأ القيم المفقودة (NaN أو 0) بالقيم المنطقية
train[["Year", "Month", "Day"]] = train[["Year", "Month", "Day"]].replace(0, np.nan)
train[["Year", "Month", "Day"]] = train[["Year", "Month", "Day"]].fillna(method="ffill")

test[["Year", "Month", "Day"]] = test[["Year", "Month", "Day"]].replace(0, np.nan)
test[["Year", "Month", "Day"]] = test[["Year", "Month", "Day"]].fillna(method="ffill")

# دلوقتي نقدر نحسب التاريخ بأمان 👇
train_date = pd.to_datetime(dict(year=train["Year"], month=train["Month"], day=train["Day"]))
test_date = pd.to_datetime(dict(year=test["Year"], month=test["Month"], day=test["Day"]))


  train[["Year", "Month", "Day"]] = train[["Year", "Month", "Day"]].fillna(method="ffill")
  test[["Year", "Month", "Day"]] = test[["Year", "Month", "Day"]].fillna(method="ffill")


In [9]:
train["season"] = train["Month"].map({
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Autumn", 10: "Autumn", 11: "Autumn"
})
test["season"] = test["Month"].map({
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Autumn", 10: "Autumn", 11: "Autumn"
})

# اليوم رقم كام في السنة
train["day_of_year"] = pd.to_datetime(
    dict(year=train["Year"], month=train["Month"], day=train["Day"])
).dt.dayofyear
test["day_of_year"] = pd.to_datetime(
    dict(year=test["Year"], month=test["Month"], day=test["Day"])
).dt.dayofyear

# تمثيل دوري للشهر واليوم (sin/cos)
for df in [train, test]:
    df["month_sin"] = np.sin(2 * np.pi * df["Month"] / 12)
    df["month_cos"] = np.cos(2 * np.pi * df["Month"] / 12)
    df["day_sin"] = np.sin(2 * np.pi * df["Day"] / 31)
    df["day_cos"] = np.cos(2 * np.pi * df["Day"] / 31)

In [10]:
# تحويل season من نص إلى رقم
season_map = {
    "Winter": 0,
    "Spring": 1,
    "Summer": 2,
    "Autumn": 3
}

# تطبيق التحويل على train و test
train["season"] = train["season"].map(season_map)
test["season"] = test["season"].map(season_map)




In [11]:
# ==========================
# يوم السنة وتمثيله دوريًا
# ==========================
train["day_of_year"] = pd.to_datetime(dict(year=train["Year"], month=train["Month"], day=train["Day"])).dt.dayofyear
test["day_of_year"] = pd.to_datetime(dict(year=test["Year"], month=test["Month"], day=test["Day"])).dt.dayofyear

for df in [train, test]:
    df["day_year_sin"] = np.sin(2*np.pi*df["day_of_year"]/365)
    df["day_year_cos"] = np.cos(2*np.pi*df["day_of_year"]/365)

# ==========================
# Lag & Rolling Features للـ train
# ==========================
lags = [1,7,30]
for lag in lags:
    train[f"lag_{lag}"] = train["sales"].shift(lag)
    train[f"rolling_mean_{lag}"] = train["sales"].shift(1).rolling(window=lag).mean().fillna(0)
    train[f"rolling_std_{lag}"] = train["sales"].shift(1).rolling(window=lag).std().fillna(0)

# ==========================
# Lag Features للـ test (مؤقتًا بالصفر)
# ==========================
for lag in lags:
    test[f"lag_{lag}"] = 0
    test[f"rolling_mean_{lag}"] = 0
    test[f"rolling_std_{lag}"] = 0


In [12]:
# ============ 🧠 تحسينات جديدة على الـ Features ============

# 🔹 1) تمثيل دوري لأيام الأسبوع (يساعد النموذج يفهم الطبيعة الدورية للأيام)
for df in [train, test]:
    df["day_of_week_sin"] = np.sin(2 * np.pi * df["Week_day"] / 7)
    df["day_of_week_cos"] = np.cos(2 * np.pi * df["Week_day"] / 7)

# 🔹 2) حساب ربع السنة لكل من train و test
for df in [train, test]:
    df["quarter"] = df["Month"].apply(lambda m: (m - 1)//3 + 1)

# 🔹 3) اتجاه المبيعات مقارنة بالمتوسط (مؤشر اتجاه trend)
train["sales_trend_7"] = train["sales"] - train["rolling_mean_7"]
train["sales_trend_30"] = train["sales"] - train["rolling_mean_30"]

# 🔹 4) الفرق النسبي عن المتوسط (normalized difference)
train["diff_rolling_7"] = train["sales_trend_7"] / (train["rolling_std_7"] + 1e-6)
train["diff_rolling_30"] = train["sales_trend_30"] / (train["rolling_std_30"] + 1e-6)

# 🔹 5) إحصائيات شهرية (متوسط وانحراف معياري للمبيعات الشهرية)
monthly_stats = train.groupby(["Year", "Month"])["sales"].agg(
    monthly_avg_sales="mean",
    monthly_std_sales="std"
).reset_index()

# دمج الإحصائيات الشهرية مع train
train = train.merge(monthly_stats, on=["Year", "Month"], how="left")
train["monthly_std_sales"] = train["monthly_std_sales"].fillna(0)

# 🔹 6) إضافة نفس الأعمدة لـ test (بقيمة صفرية مؤقتًا)
for col in ["sales_trend_7", "sales_trend_30", "diff_rolling_7", "diff_rolling_30", "monthly_avg_sales", "monthly_std_sales"]:
    test[col] = 0


In [13]:
train.columns = train.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
test.columns = test.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

In [14]:
features = [
    # 🧩 الأساسيات
    'Day', 'Month', 'Year',
    'Week_day', 'day_type', 'Weekend',

    # 🗓️ التحليل الزمني
    'weekofyear', 'is_month_start', 'is_month_end',
    'is_early_month', 'is_late_month',
    'day_of_year', 'quarter',  # ← تمت إضافة ربع السنة

    # 🌤️ الموسمية والتمثيل الدوري
    'season', 'month_sin', 'month_cos', 'day_sin', 'day_cos',
    'day_of_week_sin', 'day_of_week_cos',  # ← تمثيل دوري لأيام الأسبوع

    # 📊 مميزات المبيعات السابقة (Lag & Rolling)
    'lag_1', 'lag_7', 'lag_30',
    'rolling_mean_7', 'rolling_mean_30',
    'rolling_std_7', 'rolling_std_30',

    # 📈 مميزات الاتجاه والانحراف
    'sales_trend_7', 'sales_trend_30',       # ← اتجاه المبيعات
    'diff_rolling_7', 'diff_rolling_30',     # ← الفرق النسبي عن المتوسط

    # 🧮 الإحصائيات الشهرية
    'monthly_avg_sales', 'monthly_std_sales'
]


# 🎯 العمود الهدف (Target)
target = 'sales'

# تجهيز بيانات التدريب
X = train[features]
y = train[target]


In [15]:
train = train.dropna().reset_index(drop=True)

In [16]:
train.describe()

Unnamed: 0,id,store_nbr,sales,onpromotion,cluster,dcoilwtico,day_type,Year,Month,Day,...,rolling_mean_30,rolling_std_30,day_of_week_sin,day_of_week_cos,sales_trend_7,sales_trend_30,diff_rolling_7,diff_rolling_30,monthly_avg_sales,monthly_std_sales
count,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,...,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0,3000858.0
mean,1500458.0,27.50026,2.926397,0.3761537,8.481436,67.92465,4.608092,2014.838,6.207891,15.63019,...,2.926375,2.471825,-0.00010492,-2.39473e-05,6.278958e-06,2.139801e-05,6619.812,2909.366,2.926377,2.633796
std,866273.2,15.58564,2.695119,0.9026774,4.649736,25.66914,1.034243,1.345512,3.385645,8.794711,...,1.023922,0.5653879,0.7077176,0.7064956,2.696335,2.635195,126688.4,82696.02,0.5686188,0.05996036
min,30.0,1.0,0.0,0.0,1.0,26.19,0.0,2013.0,1.0,1.0,...,0.0,0.0,-0.9749279,-0.9009689,-7.601138,-5.555143,-6.084518,-2.452006,2.014049,2.512105
25%,750244.2,14.0,0.0,0.0,4.0,46.37,5.0,2014.0,3.0,8.0,...,2.463007,2.43003,-0.7818315,-0.9009689,-2.035036,-2.291443,-0.8316826,-0.8872881,2.238835,2.587846
50%,1500458.0,28.0,2.484907,0.0,8.0,53.41,5.0,2015.0,6.0,16.0,...,3.037369,2.566249,0.0,-0.2225209,-0.1459511,-0.4547723,-0.06163952,-0.1991811,3.04454,2.629492
75%,2250673.0,41.0,5.282462,0.0,13.0,95.72,5.0,2016.0,9.0,23.0,...,3.585,2.70542,0.7818315,0.6234898,2.277547,2.203982,0.9414162,0.852822,3.420877,2.67278
max,3000887.0,54.0,11.73381,6.609349,17.0,110.62,5.0,2017.0,12.0,31.0,...,5.781908,3.650398,0.9749279,1.0,8.869147,9.295488,8330623.0,8330623.0,3.655008,2.781489


## train model

In [17]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, shuffle=False
)

# =====================================
# 🧩 STEP 2: إعداد بيانات LightGBM
# =====================================
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# =====================================
# ⚙️ STEP 3: إعداد وتحسين البارامترات
# =====================================
params = {
    "objective": "regression",
    "metric": ["rmse", "mae"],      # نضيف MAE كمان للمقارنة
    "learning_rate": 0.02,          # أبطأ لتقليل overfitting
    "num_leaves": 96,               # توازن أفضل بين التعقيد والـ generalization
    "feature_fraction": 0.8,        # تقليل خصائص (تحسين التعميم)
    "bagging_fraction": 0.8,        # تقليل overfitting أكثر
    "bagging_freq": 5,
    "lambda_l1": 1.0,               # L1 regularization
    "lambda_l2": 1.0,               # L2 regularization
    "min_data_in_leaf": 50,         # كل ورقة فيها بيانات كافية
    "verbose": -1,
    "n_jobs": -1,
    "random_state": 42
}

# =====================================
# 🧩 STEP 4: تدريب الموديل
# =====================================
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    valid_names=["Train", "Valid"],
    num_boost_round=20000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=500),  # صبر أكثر
        lgb.log_evaluation(period=500)
    ]
)

# =====================================
# 🧩 STEP 5: التقييم على Validation
# =====================================
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
y_pred_val_clip = np.maximum(y_pred_val, 0)  # إزالة القيم السالبة

# حساب المقاييس
rmse = np.sqrt(mean_squared_error(y_val, y_pred_val_clip))
mae = mean_absolute_error(y_val, y_pred_val_clip)
r2 = r2_score(y_val, y_pred_val_clip)
rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred_val_clip))

# عرض النتائج
print("\n📊 Improved Model Performance Summary")
print("===================================")
print(f" Validation RMSE:  {rmse:.4f}")
print(f" Validation MAE:   {mae:.4f}")
print(f" Validation RMSLE: {rmsle:.4f}")
print(f" Validation R²:    {r2:.4f}")
print("===================================\n")

# =====================================
# 🧩 STEP 6: تحليل أهمية الخصائص
# =====================================
plt.figure(figsize=(10, 6))
lgb.plot_importance(model, max_num_features=25)
plt.title("🔍 Top 25 Important Features (Improved Model)")
plt.show()

# =====================================
# 🧩 STEP 7: توقع على test.csv
# =====================================
X_test = test[features].fillna(0)
test_predictions = model.predict(X_test, num_iteration=model.best_iteration)

# =====================================
# 🧩 STEP 8: إنشاء ملف التسليم
# =====================================
# sample = pd.read_csv("sample_submission.csv")
# sample["sales"] = test_predictions
# sample.to_csv("my_submission.csv", index=False)
# print("✅ Submission file ready: my_submission.csv")

Training until validation scores don't improve for 500 rounds
[500]	Train's rmse: 0.0290281	Train's l1: 0.0157643	Valid's rmse: 0.0291719	Valid's l1: 0.0188971
[1000]	Train's rmse: 0.0243056	Train's l1: 0.0133301	Valid's rmse: 0.0249589	Valid's l1: 0.0162759
[1500]	Train's rmse: 0.0222178	Train's l1: 0.0122559	Valid's rmse: 0.0230932	Valid's l1: 0.015059
[2000]	Train's rmse: 0.0208398	Train's l1: 0.0115971	Valid's rmse: 0.0219155	Valid's l1: 0.0143009
[2500]	Train's rmse: 0.0198231	Train's l1: 0.011148	Valid's rmse: 0.021186	Valid's l1: 0.0138308
[3000]	Train's rmse: 0.0190081	Train's l1: 0.0107745	Valid's rmse: 0.0205633	Valid's l1: 0.0134132
[3500]	Train's rmse: 0.0183374	Train's l1: 0.010454	Valid's rmse: 0.0200923	Valid's l1: 0.0130629
[4000]	Train's rmse: 0.0177825	Train's l1: 0.0101855	Valid's rmse: 0.0196798	Valid's l1: 0.0127663
[4500]	Train's rmse: 0.017319	Train's l1: 0.00998343	Valid's rmse: 0.0194132	Valid's l1: 0.0125722
[5000]	Train's rmse: 0.0168934	Train's l1: 0.0097880

KeyboardInterrupt: 

In [20]:
# حفظ الموديل في ملف
# لو الموديل موجود في الذاكرة
model.save_model("my_partial_model.txt")


NameError: name 'model' is not defined

In [19]:
import lightgbm as lgb

# تحميل الموديل من الملف
model = lgb.Booster(model_file="my_model.txt")


LightGBMError: Could not open my_model.txt

In [None]:
import pandas as pd

# تلخيص الأداء
results = {
    "Metric": ["Train RMSE", "Validation RMSE", "Validation RMSLE", "Validation R²"],
    "Score": [0.4651, 0.3547, 0.2239, 0.9448]  # ضيف هنا قيمة الـ RMSLE اللي حسبتها
}

results_df = pd.DataFrame(results)

print("📊 Model Performance Summary")
print("="*35)
print(results_df.to_string(index=False))
print("\n✅ Model performed strongly with low validation error and high R² score.")
print("⚠️ Reminder: These metrics are from validation set only, not Kaggle test set.")

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# حساب المقاييس
rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
r2 = r2_score(y_val, y_pred_val)
mean_sales = y_val.mean()
error_percent = (rmse / mean_sales) * 100

# تقرير الأداء بشكل منسق
print("\n==============================")
print("📊 Model Performance on Validation Set")
print("==============================")
print(f"✅ RMSE (Root Mean Squared Error): {rmse:.4f}")
print(f"🏆 R² Score: {r2:.4f}")
print("------------------------------")
print(f"💰 Mean of Actual Sales: {mean_sales:.4f}")
print(f"⚠️ RMSE as % of Mean Sales: {error_percent:.2f}%")
print("==============================\n")

# تحليل تلقائي بناءً على النتائج
if r2 > 0.9:
    print("✅ Excellent model! It explains over 90% of the variance in sales.")
elif r2 > 0.75:
    print("👍 Good model. Still room for improvement with feature tuning.")
else:
    print("⚠️ Model performance is moderate. Try feature engineering or tuning parameters.")


In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

plt.figure(figsize=(10, 6))
plt.title("📊 Model Training & Prediction Workflow", fontsize=14, fontweight="bold")

# Step positions
y_pos = [5, 4, 3, 2, 1]
steps = [
    "STEP 1️⃣: Prepare Train Data\n(df_encoding → X_train, X_val, y_train, y_val)",
    "STEP 2️⃣: Train Model\n(LightGBM using X_train, y_train)",
    "STEP 3️⃣: Evaluate Model\n(Validation RMSE, R², Errors)",
    "STEP 4️⃣: Predict on Test Data\n(model → test_encoding)",
    "STEP 5️⃣: Generate Submission File\n(sample_submission.csv → my_submission.csv)"
]

# Colors
colors = ["#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854"]

for i, (step, color) in enumerate(zip(steps, colors)):
    plt.scatter(0.5, y_pos[i], s=1800, color=color, edgecolors="k", alpha=0.9)
    plt.text(0.5, y_pos[i], step, ha="center", va="center", fontsize=10, color="black", fontweight="bold")

# Add arrows
for i in range(len(y_pos)-1):
    plt.arrow(0.5, y_pos[i]-0.4, 0, -0.2, head_width=0.05, head_length=0.15, fc='black', ec='black')

plt.xlim(0, 1)
plt.ylim(0.5, 5.5)
plt.axis("off")
plt.show()


In [None]:
# =====================================
# 📈 Visualization: Train vs Validation vs Predictions
# =====================================

import matplotlib.pyplot as plt
import numpy as np

# إنشاء المحور الزمني لكل جزء
time_train = np.arange(len(y_train))
time_val = np.arange(len(y_train), len(y_train) + len(y_val))

plt.figure(figsize=(14,6))

# القيم الحقيقية في التدريب
plt.plot(time_train, y_train, label="Train (True)", color="blue", linewidth=2)

# القيم الحقيقية في الـ validation
plt.plot(time_val, y_val, label="Validation (True)", color="green", linewidth=2, alpha=0.7)

# القيم المتنبأ بها في الـ validation
plt.plot(time_val, y_pred_val, label="Validation (Predicted)", color="orange", linestyle="--", linewidth=2)

# خط فاصل بين التدريب والتنبؤ
plt.axvline(x=len(y_train), color="red", linestyle="--", label="Forecast Start")

# تحسين المظهر العام
plt.title("📈 Sales Forecasting Timeline (LightGBM)", fontsize=14, weight='bold')
plt.xlabel("Time (Sequential Index)")
plt.ylabel("Sales")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# =====================================
# 🧩 STEP 6: Feature Importance
# =====================================

import matplotlib.pyplot as plt
import seaborn as sns

# نحسب أهمية كل Feature
importance = model.feature_importance(importance_type="gain")
feature_names = X_train.columns

# نحولها إلى DataFrame للترتيب والعرض
feat_imp = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importance
}).sort_values(by="Importance", ascending=False)

# عرض أول 20 Feature مثلاً
plt.figure(figsize=(10, 6))
sns.barplot(data=feat_imp.head(20), x="Importance", y="Feature", palette="viridis")
plt.title("🔥 Top 20 Important Features in LightGBM Model")
plt.xlabel("Feature Importance (Gain)")
plt.ylabel("Feature Name")
plt.tight_layout()
plt.show()




In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import pandas as pd

# =====================================
# 🧩 STEP 1: Prepare train data
# =====================================
date_features = [
    "Weekend",
    "Work_day",
    "Week_day_Monday",
    "Week_day_Saturday",
    "Week_day_Sunday",
    "Week_day_Thursday",
    "Week_day_Tuesday",
    "Week_day_Wednesday"
]

y = df_encoding["sales"]
X = df_encoding[date_features]

# split بطريقة زمنية
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, shuffle=False
)

# =====================================
# 🧩 STEP 2: Train LightGBM model
# =====================================
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 1.0,
    "bagging_fraction": 1.0,
    "verbose": -1,
    "random_state": 42
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    valid_names=["Train", "Valid"],
    num_boost_round=10000,
    callbacks=[lgb.early_stopping(200)]
)

# =====================================
# 🧩 STEP 3: Evaluate
# =====================================
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

rmse = mean_squared_error(y_val, y_pred_val)
r2 = r2_score(y_val, y_pred_val)

print("\n📊 Model Performance Summary")
print("===================================")
print(f"Validation RMSE:  {rmse:.4f}")
print(f"Validation R²:    {r2:.4f}")
print("===================================\n")
