3. Model Development
Train-Test Split:

In [116]:
import pandas as pd

In [117]:
# Read FAANG_clean.csv (resolved relative to the working directory) into the
# DataFrame `df_m`, which the following modelling cells operate on.
df_m = pd.read_csv("FAANG_clean.csv")

Normalize Numerical Features:

1. Min-Max Scaling

In [118]:
# Positional 80/20 split: the leading 80% of rows form the training frame and
# the trailing 20% the testing frame. Row order is left exactly as loaded.
train_size = int(len(df_m) * 0.8)
train, test = df_m.iloc[:train_size], df_m.iloc[train_size:]

# Show both partitions (pandas truncates the printed frames on its own).
for heading, partition in (("Training Set:\n", train), ("Testing Set:\n", test)):
    print(heading, partition)

Training Set:
            Open      High       Low     Close  Adj Close     Volume  \
0      1.156786  1.162679  1.117857  1.130179   0.954409  293751500   
1      1.139107  1.169107  1.124464  1.141786   0.964210  293751500   
2      1.151071  1.165179  1.143750  1.151786   0.972655  293751500   
3      1.154821  1.159107  1.130893  1.152679   0.973409  293751500   
4      1.160714  1.243393  1.156250  1.236607   1.044284  293751500   
...         ...       ...       ...       ...        ...        ...   
18439  3.951429  3.960000  3.847143  3.874286   3.874286    8561700   
18440  3.905714  4.024286  3.865714  4.002857   4.002857    8585500   
18441  3.948571  3.962857  3.827143  3.868571   3.868571   16476600   
18442  3.888571  3.897143  3.791429  3.881429   3.881429   10173100   
18443  4.007143  4.070000  3.895714  3.952857   3.952857    9859500   

       Stock_Amazon  Stock_Apple  Stock_Facebook  Stock_Google  ...  month_8  \
0             False         True           False    

In [119]:


from sklearn.preprocessing import MinMaxScaler
import pickle

# Min-Max scale the numerical columns of `df_m` in place and persist the scaler.

# Select numerical columns for scaling
numerical_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'year_encoded']

# Learn the per-column min/max from the training rows only (the first 80% of
# the frame, the same positional cut used for the train/test split). Fitting on
# the whole frame would let held-out rows influence the scaling (data leakage).
n_train_rows = int(len(df_m) * 0.8)
scaler = MinMaxScaler()
scaler.fit(df_m.iloc[:n_train_rows][numerical_columns])

# Apply the training-derived scaling to every row. Held-out rows may therefore
# fall slightly outside [0, 1]; that is expected and harmless.
df_m[numerical_columns] = scaler.transform(df_m[numerical_columns])

# Save the scaler
# NOTE: the model-comparison cell further down also writes 'scaler.pkl', so
# this file is replaced once that cell runs.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [120]:
df_m.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Stock_Amazon,Stock_Apple,Stock_Facebook,Stock_Google,...,month_8,month_9,month_10,month_11,month_12,day_of_week_Friday,day_of_week_Monday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,5.6e-05,1.1e-05,0.0,0.0,0.0,1.0,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
1,0.0,3.1e-05,2.1e-05,3.6e-05,3.1e-05,1.0,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
2,3.8e-05,1.9e-05,8.2e-05,6.8e-05,5.7e-05,1.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,True
3,4.9e-05,0.0,4.1e-05,7.1e-05,6e-05,1.0,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
4,6.8e-05,0.000261,0.000122,0.000334,0.000283,1.0,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False


In [121]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Compare four regressors on an ordered 80/20 split and pickle the one with the
# highest test R^2. Also persists the split and the fitted feature scaler.

# Split data into feature columns (X) and target column (y)
X = df_m.drop('Close', axis=1)  # Features
y = df_m['Close']  # Target variable

# Step 1: Train-Test Split (80% training, 20% testing).
# shuffle=False keeps the row order, so the test set is the trailing 20%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Save the train-test split for future reuse
with open('train_test_split.pkl', 'wb') as f:
    pickle.dump((X_train, X_test, y_train, y_test), f)

# Step 2: Initialize and apply Min-Max Scaler (fitted on training rows only)
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform on training data
X_test_scaled = scaler.transform(X_test)  # Transform the test data using the same scaler

# Save the fitted scaler.
# NOTE: this replaces the 'scaler.pkl' written by the earlier scaling cell;
# this scaler covers every feature column of X, not only the numerical ones.
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Step 3: Initialize models.
# Stochastic estimators are seeded (42, as in the cross-validation cells) so
# the comparison and the selected "best" model are reproducible across runs.
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

# Step 4: Train models, evaluate, and save the best model
best_model = None
best_model_name = None  # bound up front so the save step cannot hit a NameError
best_r2 = float('-inf')  # To keep track of the best R^2 score
for name, model in models.items():
    model.fit(X_train_scaled, y_train)  # Train the model
    y_pred = model.predict(X_test_scaled)  # Make predictions on test data

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name} - Evaluation Metrics:")
    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R^2 Score: {r2}\n")

    # Update the best model if this one performs better
    if r2 > best_r2:
        best_r2 = r2
        best_model = model
        best_model_name = name

# A NaN R^2 never compares greater, so guard against "no winner" explicitly.
if best_model is None:
    raise RuntimeError("No model produced a comparable R^2 score; nothing to save.")

# Save the best model separately
best_model_filename = f"best_{best_model_name}.pkl"
with open(best_model_filename, 'wb') as f:
    pickle.dump(best_model, f)

print(f"Best model ({best_model_name}) saved as '{best_model_filename}'.")

LinearRegression - Evaluation Metrics:
Mean Absolute Error (MAE): 0.0011585753479498302
Mean Squared Error (MSE): 2.550111696421176e-06
R^2 Score: 0.9999854878974527

DecisionTree - Evaluation Metrics:
Mean Absolute Error (MAE): 0.0005580101035084823
Mean Squared Error (MSE): 1.2666905594079237e-06
R^2 Score: 0.999992791553672

RandomForest - Evaluation Metrics:
Mean Absolute Error (MAE): 0.0004968380056403004
Mean Squared Error (MSE): 9.125305289525944e-07
R^2 Score: 0.9999948069974219

XGBoost - Evaluation Metrics:
Mean Absolute Error (MAE): 0.0032935696803984725
Mean Squared Error (MSE): 4.2782394422733736e-05
R^2 Score: 0.9997565351761045

Best model (RandomForest) saved as 'best_RandomForest.pkl'.


Code for Model Training with Cross-Validation:

1. Linear Regression cross validation:

In [122]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Linear Regression: hold-out evaluation plus 5-fold cross-validation.

# Step 1: Split the data into training and testing sets BEFORE scaling, so the
# scaler never sees test rows (fitting it on all of X leaks test statistics).
# NOTE(review): this split shuffles rows (random_state=42) whereas the model
# comparison cell above uses shuffle=False; confirm which is intended.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Fit the scaler on the training rows only, then apply it to both splits
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole feature matrix under the train-fitted scaler; kept because this cell
# has always defined `X_scaled`.
X_scaled = scaler.transform(X)

# Step 3: Initialize the Linear Regression model
lr_model = LinearRegression()

# Step 4: Train the model on the training data
lr_model.fit(X_train, y_train)

# Step 5: Evaluate using cross-validation on the training set.
# The scorer returns negated MAE, hence the sign flip when printing.
cv_scores_lr = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

# Step 6: Make predictions on the test set
y_pred_lr = lr_model.predict(X_test)

# Step 7: Compute evaluation metrics on the test data
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Step 8: Display results
print(f"Linear Regression - Cross-validation MAE (Training Data): {-cv_scores_lr.mean()}")
print(f"Linear Regression - Test MAE: {mae_lr}")
print(f"Linear Regression - Test MSE: {mse_lr}")
print(f"Linear Regression - Test R² Score: {r2_lr}")

Linear Regression - Cross-validation MAE (Training Data): 0.0008203980680303941
Linear Regression - Test MAE: 0.0007926570871144738
Linear Regression - Test MSE: 1.5041531236535586e-06
Linear Regression - Test R² Score: 0.9999824139129193


2. Decision Tree Regressor cross validation: 

In [123]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Decision Tree: hold-out evaluation plus 5-fold cross-validation.

# Step 1: Train-Test Split BEFORE scaling, so the scaler never sees test rows
# (fitting it on all of X leaks test statistics into preprocessing).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Fit the scaler on the training rows only, then apply it to both splits
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole feature matrix under the train-fitted scaler; kept because this cell
# has always defined `X_scaled`.
X_scaled = scaler.transform(X)

# Step 3: Initialize the Decision Tree model
dt_model = DecisionTreeRegressor(random_state=42)

# Step 4: Train the model on the training data
dt_model.fit(X_train, y_train)

# Step 5: Cross-validation on the training set.
# The scorer returns negated MAE, hence the sign flip when printing.
cv_scores_dt = cross_val_score(dt_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

# Step 6: Make predictions on the test set
y_pred_dt = dt_model.predict(X_test)

# Step 7: Compute evaluation metrics on the test data
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)

# Step 8: Display results
print(f"Decision Tree - Cross-validation MAE (Training Data): {-cv_scores_dt.mean()}")
print(f"Decision Tree - Test MAE: {mae_dt}")
print(f"Decision Tree - Test MSE: {mse_dt}")
print(f"Decision Tree - Test R² Score: {r2_dt}")

Decision Tree - Cross-validation MAE (Training Data): 0.0004584667623752983
Decision Tree - Test MAE: 0.0004349774470855973
Decision Tree - Test MSE: 8.906397624169871e-07
Decision Tree - Test R² Score: 0.9999895869189293


3. Random Forest Regressor cross validation:

In [124]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Random Forest: hold-out evaluation plus 5-fold cross-validation.

# Step 1: Train-Test Split BEFORE scaling, so the scaler never sees test rows
# (fitting it on all of X leaks test statistics into preprocessing).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Fit the scaler on the training rows only, then apply it to both splits
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole feature matrix under the train-fitted scaler; kept because this cell
# has always defined `X_scaled`.
X_scaled = scaler.transform(X)

# Step 3: Initialize the Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# Step 4: Train the model on the training data
rf_model.fit(X_train, y_train)

# Step 5: Cross-validation on the training set.
# The scorer returns negated MAE, hence the sign flip when printing.
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

# Step 6: Make predictions on the test set
y_pred_rf = rf_model.predict(X_test)

# Step 7: Compute evaluation metrics on the test data
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Step 8: Display results
print(f"Random Forest - Cross-validation MAE (Training Data): {-cv_scores_rf.mean()}")
print(f"Random Forest - Test MAE: {mae_rf}")
print(f"Random Forest - Test MSE: {mse_rf}")
print(f"Random Forest - Test R² Score: {r2_rf}")


Random Forest - Cross-validation MAE (Training Data): 0.00037165285642920055
Random Forest - Test MAE: 0.0003468186042685503
Random Forest - Test MSE: 4.766474912382188e-07
Random Forest - Test R² Score: 0.9999944271868629


4. XGBoost Regressor:

In [125]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# XGBoost: hold-out evaluation (no cross-validation in this cell).

# Step 1: Train-Test Split BEFORE scaling, so the scaler never sees test rows
# (fitting it on all of X leaks test statistics into preprocessing).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Fit the scaler on the training rows only, then apply it to both splits.
# X_train / X_test from this cell are the arrays the MLflow logging cell reuses.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole feature matrix under the train-fitted scaler; kept because this cell
# has always defined `X_scaled`.
X_scaled = scaler.transform(X)

# Step 3: Initialize the XGBRegressor model
xgb_model = XGBRegressor(n_estimators=100, max_depth=6, random_state=42)

# Step 4: Train the model on the training data
xgb_model.fit(X_train, y_train)

# Step 5: Make predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Step 6: Compute evaluation metrics on the test data
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# Step 7: Display results
print(f"XGBoost - MAE: {mae_xgb}")
print(f"XGBoost - MSE: {mse_xgb}")
print(f"XGBoost - R² Score: {r2_xgb}")

XGBoost - MAE: 0.0010679946534014567
XGBoost - MSE: 5.019785669528343e-06
XGBoost - R² Score: 0.9999413102386169


 MLflow Integration


In [126]:
# Install MLflow into the environment of the running kernel.
# NOTE(review): no version is pinned, so the installed release depends on when
# this cell is run; consider pinning once the working version is known.
%pip install mlflow

Note: you may need to restart the kernel to use updated packages.


In [None]:
import mlflow
import mlflow.sklearn
import pickle
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# MLflow configuration: where runs are sent and which experiment they belong to.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"  # Set to your MLflow tracking server URI
MLFLOW_EXPERIMENT_NAME = "FAANG Stock Price Predictions"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Define function to train a model, log it and its metrics to MLflow, and pickle it
def log_model(model, model_name, X_train, X_test, y_train, y_test, params=None, pickle_path=None):
    """Fit ``model``, log it to MLflow under a new run, and pickle it locally.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``X_train`` / ``y_train``.
    model_name : str
        Used as the MLflow run name and, by default, to derive the pickle file name.
    X_train, X_test, y_train, y_test
        Training and test features / targets passed straight to ``fit``,
        ``predict`` and the metric functions.
    params : dict, optional
        Parameters to record on the run; skipped when empty or ``None``.
    pickle_path : str, optional
        Where to pickle the fitted model. Defaults to ``"<model_name>.pkl"``
        with spaces replaced by underscores, so each model gets its own file.
        (Previously every call wrote to 'best_RandomForest.pkl', so the last
        model logged silently replaced all earlier ones under that name.)

    Returns
    -------
    str
        The ID of the MLflow run that was created.
    """
    if pickle_path is None:
        pickle_path = f"{model_name.replace(' ', '_')}.pkl"

    with mlflow.start_run(run_name=model_name) as run:
        # Train the model
        model.fit(X_train, y_train)

        # Predict on the test set
        y_test_pred = model.predict(X_test)

        # Calculate evaluation metrics on the test set
        test_metrics = {
            "test_r2": r2_score(y_test, y_test_pred),
            "test_mse": mean_squared_error(y_test, y_test_pred),
            "test_mae": mean_absolute_error(y_test, y_test_pred),
            "test_mape": mean_absolute_percentage_error(y_test, y_test_pred),
        }

        # Log metrics
        for metric_name, metric_value in test_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Log parameters (if provided)
        if params:
            mlflow.log_params(params)

        # Log the model as a run artifact
        mlflow.sklearn.log_model(model, artifact_path="model")

        # Pickle the fitted model to its own file
        with open(pickle_path, 'wb') as f:
            pickle.dump(model, f)

        # Print the successful log message with run ID
        print(f"Model {model_name} logged and saved as '{pickle_path}' successfully! Run ID: {run.info.run_id}")

    # Return the run ID for further reference
    return run.info.run_id

2025/01/13 00:45:23 INFO mlflow.tracking.fluent: Experiment with name 'FAANG Stock Price Predictions' does not exist. Creating a new experiment.


In [97]:
# import mlflow
# import mlflow.sklearn

# from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# # Set MLflow tracking URI and experiment
# mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.set_experiment("FAANG Stock Price Predictions ")


# def log_model(model, model_name, X_train, X_test, y_train, y_test, params=None):
#     with mlflow.start_run(run_name=model_name) as run:
#         # Train the model
#         model.fit(X_train, y_train)

#         # Predict on the test set
#         y_test_pred = model.predict(X_test)

#         # Calculate metrics
#         test_r2 = r2_score(y_test, y_test_pred)
#         test_mse = mean_squared_error(y_test, y_test_pred)
#         test_mae = mean_absolute_error(y_test, y_test_pred)
#         test_mape = mean_absolute_percentage_error(y_test, y_test_pred)

#         # Log metrics
#         mlflow.log_metric("test_r2", test_r2)
#         mlflow.log_metric("test_mse", test_mse)
#         mlflow.log_metric("test_mae", test_mae)
#         mlflow.log_metric("test_mape", test_mape)

#         # Log parameters
#         if params:
#             mlflow.log_params(params)

#         # Log the model
#         mlflow.sklearn.log_model(model, artifact_path="model")

#         print(f"Model {model_name} logged successfully! Run ID: {run.info.run_id}")

#     return run.info.run_id

In [128]:
# # model registration
# model_n ='rfr_model'
# run_id = input("Enter the runID: ")
# model_uri = f'runs:/{run_id}/model'

# with mlflow.start_run(run_id=run_id):
#     mlflow.register_model(model_uri= model_uri , name= model_n)
def register_model(run_id, model_name):
    """Register the "model" artifact of an MLflow run under ``model_name``.

    Builds the ``runs:/<run_id>/model`` URI, registers it with MLflow and
    prints the version number reported back. Returns nothing.
    """
    version_info = mlflow.register_model(
        model_uri="runs:/{}/model".format(run_id),
        name=model_name,
    )
    print(f"Model {model_name} registered successfully! Version: {version_info.version}")


In [129]:
# Step 1: Log each estimator as its own MLflow run, in this order:
# Random Forest, Linear Regression, Decision Tree, XGBoost.
# Each call passes the estimator's own get_params() as the run parameters.
estimators_to_log = [
    ("Random Forest Regression", rf_model),
    ("Linear Regression", lr_model),
    ("Decision Tree", dt_model),
    ("XGBoost Regression", xgb_model),
]
logged_run_ids = [
    log_model(estimator, run_name, X_train, X_test, y_train, y_test, params=estimator.get_params())
    for run_name, estimator in estimators_to_log
]

# Only the Random Forest run (the first one logged) is carried forward.
run_id = logged_run_ids[0]

# Step 2: Register the Random Forest run's model in the registry
register_model(run_id, "rf_model")



Model Random Forest Regression logged and saved as 'best_model.pkl' successfully! Run ID: 7a79f95435e745d3a5cf4061f44da66f
🏃 View run Random Forest Regression at: http://127.0.0.1:5000/#/experiments/383408949645555200/runs/7a79f95435e745d3a5cf4061f44da66f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/383408949645555200




Model Linear Regression logged and saved as 'best_model.pkl' successfully! Run ID: 9e0482cf55b04f7b96cce4e5ec6c67fc
🏃 View run Linear Regression at: http://127.0.0.1:5000/#/experiments/383408949645555200/runs/9e0482cf55b04f7b96cce4e5ec6c67fc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/383408949645555200




Model Decision Tree logged and saved as 'best_model.pkl' successfully! Run ID: 8eb5f0c7ec714fd4a3f07a5912b9c3ae
🏃 View run Decision Tree at: http://127.0.0.1:5000/#/experiments/383408949645555200/runs/8eb5f0c7ec714fd4a3f07a5912b9c3ae
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/383408949645555200


Successfully registered model 'rf_model'.
2025/01/13 00:46:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_model, version 1


Model XGBoost Regression logged and saved as 'best_model.pkl' successfully! Run ID: 2d69a49dc15841edb0148d385f3d81fd
🏃 View run XGBoost Regression at: http://127.0.0.1:5000/#/experiments/383408949645555200/runs/2d69a49dc15841edb0148d385f3d81fd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/383408949645555200
Model rf_model registered successfully! Version: 1


Created version '1' of model 'rf_model'.


In [130]:
from mlflow.tracking import MlflowClient

# Client used to query the MLflow model registry.
client = MlflowClient()

# Check registered model details.
# Dedicated names are used here so the notebook-level `models` (the estimator
# dict from the training cell) and `model` are not rebound to registry objects.
registered_models = client.search_registered_models(filter_string="name='rf_model'")
for registered in registered_models:
    print(f"Model Name: {registered.name}")
    for version_info in registered.latest_versions:
        print(f"Version: {version_info.version}, Stage: {version_info.current_stage}, Source: {version_info.source}")

Model Name: rf_model
Version: 1, Stage: None, Source: mlflow-artifacts:/383408949645555200/7a79f95435e745d3a5cf4061f44da66f/artifacts/model


In [131]:
from mlflow.tracking import MlflowClient

# Client for talking to the MLflow model registry.
client = MlflowClient()

# Which registered model version to move, and the stage to move it to.
# model_version is hard-coded: replace with your model version.
model_name, model_version, stage = "rf_model", 1, "Staging"

# NOTE(review): the recorded output of this cell echoes this call the way a
# Python warning does — possibly a deprecation notice; confirm against the
# installed MLflow version.
client.transition_model_version_stage(name=model_name, version=model_version, stage=stage)

print(f"Model {model_name} version {model_version} transitioned to stage {stage}.")

Model rf_model version 1 transitioned to stage Staging.


  client.transition_model_version_stage(


In [132]:
import mlflow.pyfunc

# Address the registered model by explicit version number rather than by stage,
# then load it through MLflow's generic pyfunc interface.
model_uri = "models:/rf_model/1"  # specify the version number you want to load
model = mlflow.pyfunc.load_model(model_uri=model_uri)

print("Model loaded successfully!")

Downloading artifacts: 100%|██████████| 5/5 [00:06<00:00,  1.25s/it]


Model loaded successfully!


In [133]:
# Launch the Streamlit app in appstock.py through a shell command.
# NOTE(review): the recorded output of this cell is `^C`, i.e. the command was
# interrupted — presumably it keeps the cell busy until stopped; confirm.
! streamlit run appstock.py

^C
