<a href="https://colab.research.google.com/github/Engbasemhamada/NYC-Taxi-Fare-Prediction/blob/main/DEPLOYMENT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# train_and_save.py
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from scipy.stats import randint
import warnings
warnings.filterwarnings("ignore")

# ---------- CONFIG ----------
DATA_PATH = "final_internship_data.csv"   # path to the training CSV
MODEL_OUT = "best_model.joblib"
# ----------------------------

df = pd.read_csv(DATA_PATH)

# Target column: use 'target' when it exists, otherwise the last column.
if 'target' in df.columns:
    target_col = 'target'
else:
    target_col = df.columns[-1]
y = df[target_col]
X = df.drop(columns=[target_col])

# Task detection: a numeric target with more than 20 distinct values is
# treated as regression; everything else is treated as classification.
is_numeric = np.issubdtype(y.dtype, np.number)
unique_vals = y.nunique()
if is_numeric and unique_vals > 20:
    task = 'regression'
else:
    task = 'classification'
print("Task detected:", task, "| target:", target_col, "| unique:", unique_vals)

# Classification targets become integer codes: numeric targets are cast
# directly, non-numeric ones go through a LabelEncoder that is kept so the
# original labels can be recovered at prediction time.
label_encoder = None
if task == 'classification':
    if is_numeric:
        y = y.astype(int)
    else:
        label_encoder = LabelEncoder()
        y = label_encoder.fit_transform(y.astype(str))

# Split the feature columns into numeric and categorical groups.
num_cols = list(X.select_dtypes(include=['number']).columns)
cat_cols = list(X.select_dtypes(exclude=['number']).columns)
print("Num cols:", len(num_cols), "Cat cols:", len(cat_cols))

# Build a dense-output OneHotEncoder that works across scikit-learn versions:
# try the `sparse_output` keyword first and fall back to `sparse` when the
# installed version rejects it with a TypeError.
_ohe_kwargs = {'handle_unknown': 'ignore'}
try:
    ohe = OneHotEncoder(sparse_output=False, **_ohe_kwargs)
except TypeError:
    ohe = OneHotEncoder(sparse=False, **_ohe_kwargs)

# Numeric features: median imputation followed by standardisation.
num_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])
# Categorical features: fill gaps with the literal 'missing', then one-hot encode.
cat_pipe = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', ohe),
])

preproc = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols),
    ],
    remainder='drop',
)

# Model selection: pick the random-forest flavour and the search metric
# that match the detected task.
estimator_cls, scoring = (
    (RandomForestClassifier, 'accuracy')
    if task == 'classification'
    else (RandomForestRegressor, 'r2')
)
model = estimator_cls(random_state=42, n_jobs=-1)

pipe = Pipeline(steps=[('pre', preproc), ('clf', model)])

# Split the data: 20% of the rows are held out for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Quick randomized hyperparameter search over the forest step ('clf').
param_dist = dict(
    clf__n_estimators=randint(80, 300),
    clf__max_depth=randint(4, 40),
    clf__min_samples_split=randint(2, 20),
    clf__min_samples_leaf=randint(1, 10),
    clf__max_features=['sqrt', 'log2', None],
    clf__bootstrap=[True, False],
)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring=scoring,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
print("Running hyperparameter search (short)...")
search.fit(X_train, y_train)

# The search's best estimator is what gets evaluated and saved.
best = search.best_estimator_
print("Best params:", search.best_params_)

# Evaluation: score the best pipeline on the held-out split, then run a
# 3-fold cross-validation of the same configuration over the full dataset.
# The collected numbers end up in `metrics` for the final summary.
y_pred = best.predict(X_test)
if task == 'classification':
    acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    cv = cross_val_score(best, X, y, cv=3, scoring='accuracy', n_jobs=-1)
    print("CV mean acc:", cv.mean(), "std:", cv.std())
    metrics = {
        'test_accuracy': float(acc),
        'cv_mean_accuracy': float(cv.mean()),
        'cv_std': float(cv.std())
    }
else:
    # The `squared=False` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
    # Taking the square root of the MSE gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print("Test RMSE:", rmse)
    print("Test R2:", r2)
    cv = cross_val_score(best, X, y, cv=3, scoring='r2', n_jobs=-1)
    print("CV mean R2:", cv.mean(), "std:", cv.std())
    metrics = {
        'test_rmse': float(rmse),
        'test_r2': float(r2),
        'cv_mean_r2': float(cv.mean()),
        'cv_std': float(cv.std())
    }

# Save the model: the fitted pipeline is stored together with the metadata
# needed to interpret its predictions later (label encoder, task, target name).
bundle = {
    'pipeline': best,
    'label_encoder': label_encoder,
    'task': task,
    'target_col': target_col,
}
joblib.dump(bundle, MODEL_OUT)
print("Saved model to", MODEL_OUT)
print("SUMMARY:", metrics)


In [None]:
Django>=4.0
scikit-learn
joblib
pandas


In [None]:
# mlapp/views.py
from django.shortcuts import render
from django.conf import settings
import joblib
import pandas as pd
import os

# Copy best_model.joblib into BASE_DIR, or point MODEL_PATH at its real location.
MODEL_PATH = os.path.join(settings.BASE_DIR, 'best_model.joblib')

# The bundle is loaded once, when this module is imported.
model_bundle = joblib.load(MODEL_PATH)
pipeline = model_bundle['pipeline']  # required entry: KeyError if absent
# Optional entries: each is None when the bundle does not carry it.
label_encoder, task = (
    model_bundle.get(key) for key in ('label_encoder', 'task')
)

def home(request):
    """Render the upload form and, on POST, predictions for an uploaded CSV.

    A GET request (or a POST without a ``file`` upload) renders the empty
    form. A POST with a CSV runs the loaded pipeline on its rows and renders
    at most the first 50 predictions, mapped back to the original labels
    when the model bundle carries a label encoder.

    Returns:
        The rendered ``mlapp/home.html`` response with ``result`` (a list of
        predictions or ``None``) and ``task`` in the context, or a plain-text
        HTTP 400 response when the upload cannot be parsed or predicted on.
    """
    # Function-scope import: the 400 response class is only needed here.
    from django.http import HttpResponseBadRequest

    max_rows = 50  # cap on predictions shown, and therefore on rows parsed

    result = None
    if request.method == 'POST' and request.FILES.get('file'):
        csvfile = request.FILES['file']
        try:
            # Only the first `max_rows` predictions are displayed, so parse
            # no more than that many rows of the untrusted upload.
            df = pd.read_csv(csvfile, nrows=max_rows)
            preds = pipeline.predict(df)
        except (ValueError, KeyError):
            # pandas parse/empty-file/decoding errors are ValueError
            # subclasses; the pipeline is expected to raise ValueError (or
            # KeyError) when feature columns are missing or hold bad values.
            # The message is fixed and served as plain text so that nothing
            # from the upload is echoed back into the page.
            return HttpResponseBadRequest(
                "Could not generate predictions: the uploaded file must be a "
                "valid CSV containing the feature columns used for training.",
                content_type="text/plain; charset=utf-8",
            )
        if label_encoder is not None:
            preds = label_encoder.inverse_transform(preds.astype(int))
        result = preds.tolist()[:max_rows]  # safety cap on rendered output
    return render(request, 'mlapp/home.html', {'result': result, 'task': task})


In [None]:
# mlapp/urls.py
from django.urls import path
from . import views

# Route the app root to the CSV-upload / prediction view.
urlpatterns = [path(route='', view=views.home, name='home')]


In [None]:
<!doctype html>
<html>
<head>
  <meta charset="utf-8">
  <title>ML Predictor</title>
  <style>
    body{font-family: Arial; max-width:900px;margin:20px auto;padding:20px;}
    .card{box-shadow:0 4px 12px rgba(0,0,0,0.08);padding:18px;border-radius:10px;}
    input[type=file]{display:block;margin:12px 0}
    button{padding:10px 14px;border-radius:8px;border:0;background:#2563eb;color:white;cursor:pointer}
    .result{margin-top:16px}
  </style>
</head>
<body>
  <div class="card">
    <h2>Upload CSV (features only)</h2>
    <form method="post" enctype="multipart/form-data">{% csrf_token %}
      <input type="file" name="file" accept=".csv" required>
      <button type="submit">Predict</button>
    </form>

    {% if result %}
      <div class="result">
        <h3>Predictions (first rows):</h3>
        {# Rendered with Django's default autoescaping: predictions may be label strings taken from data, so they must not be marked safe. #}
        <pre>{{ result }}</pre>
        <p>Task: {{ task }}</p>
      </div>
    {% endif %}
  </div>
</body>
</html>
