In [16]:
# Load the data from MongoDB and convert it into a pandas DataFrame.
# Instead of reading from the CSV again, pull the records directly from MongoDB.
 
from pymongo import MongoClient
import pandas as pd
import os
from dotenv import load_dotenv



In [17]:
# Load connection settings from a local .env file (if present) into the environment.
load_dotenv()

# Fail fast when the URI is missing: MongoClient(None) would otherwise fall back
# to its default host and silently read from whatever server runs on localhost.
mongo_uri = os.getenv("MONGO_URI")
if not mongo_uri:
    raise RuntimeError(
        "MONGO_URI is not set; define it in the environment or in a .env file."
    )

client = MongoClient(mongo_uri)
collection = client["phishing_db"]["raw_data"]

# Fetch every document, excluding MongoDB's auto-generated "_id" field at query
# time so the DataFrame never has to be mutated in place afterwards.
df = pd.DataFrame(list(collection.find({}, {"_id": 0})))

# An empty result would only surface later as an opaque KeyError; report it here.
if df.empty:
    raise ValueError("No documents were returned from phishing_db.raw_data.")

In [18]:
# Separate the target column from the predictor columns.
y = df["CLASS_LABEL"]                 # target: the label column
X = df.drop(columns=["CLASS_LABEL"])  # features: every remaining column

In [19]:
# scaling

from sklearn.preprocessing import StandardScaler
# Fit a StandardScaler on the full feature matrix, then apply it to the same data.
# NOTE(review): the scaler is fitted on all rows before the train/test split in
# the next cell, so the test rows contribute to the fitted statistics. Consider
# splitting first and fitting on the training rows only; confirm how the
# persisted scaler is meant to be fitted before changing this.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [20]:
# train-test-split

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. `stratify=y` keeps the class proportions
# of the target in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [31]:
import joblib
import os

# Persist each partition of the split under artifacts/ so later steps can
# reload them without re-querying the database.
split_artifacts = {
    "artifacts/X_train.pkl": X_train,
    "artifacts/X_test.pkl": X_test,
    "artifacts/y_train.pkl": y_train,
    "artifacts/y_test.pkl": y_test,
}

os.makedirs("artifacts", exist_ok=True)

# joblib.dump returns the list of file names it wrote; collect them per artifact.
written = [joblib.dump(obj, path) for path, obj in split_artifacts.items()]

# Show the result of the final dump as the cell output.
written[-1]

['artifacts/y_test.pkl']

In [21]:
import sys
import os

# Make the directory two levels above the current working directory importable
# (presumably the repository root that holds the `src` package — verify).
# NOTE: this depends on the kernel's working directory, not on the notebook's path.
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)


In [22]:
from src.utils.logger import get_logger

# Obtain the project logger and record that the split step has finished.
# NOTE(review): this message is emitted unconditionally; it does not itself
# verify that the split succeeded.
split_done_message = "Train-test split completed successfully."
logger = get_logger()
logger.info(split_done_message)


In [23]:
# Run the split with the real inputs (the original passed a literal `...`
# placeholder, which always failed) and log the outcome either way.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42, stratify=y
    )
    logger.info("✅ Train-test split completed successfully.")
except Exception as e:
    logger.error(f"❌ Train-test split failed: {e}")
    # Re-raise so later cells never run against missing or stale split variables.
    raise


In [24]:
import sys
import os
# Make the directory two levels above the current working directory importable
# (presumably the repository root that holds the `src` package — verify).
project_root = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))

# Only append when absent, so repeated runs of this cell leave sys.path unchanged.
if project_root not in sys.path:
    sys.path.append(project_root)

In [25]:
from src.utils.preprocessing import scale_features, save_object

# Scale the features with the project helper and persist the resulting scaler.
# Presumably `scale_features` returns (scaled features, fitted scaler) and
# `save_object` stores the object under the given name — verify in
# src/utils/preprocessing.
# NOTE(review): this rebinds `X_scaled` and `scaler` from the earlier scaling
# cell; the train/test partitions created before this point are not refreshed.
scaling_output = scale_features(X)
X_scaled, scaler = scaling_output
save_object(scaler, "scaler")
