In [3]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Iris: features and labels used by the classification experiment ---
iris = datasets.load_iris()
X = iris.data
y = iris.target

# --- Global Superstore: workbook preview ---
# NOTE(review): absolute, machine-specific path; this cell only runs where the
# workbook exists at exactly this location.
file_path = r"D:\data science QT\notes\DataSets\Global Superstore\Global Superstore.xls"

# Open the workbook through a context manager so its file handle is closed as
# soon as the sheet has been read, instead of staying open for the lifetime of
# the kernel. `excel_file` is closed once the `with` block exits.
with pd.ExcelFile(file_path) as excel_file:
    # Display the sheet names
    print(excel_file.sheet_names)

    # Load the data from the first sheet (or specify the sheet you want)
    df = excel_file.parse(excel_file.sheet_names[0])

# Display the first few rows of the dataset
print(df.head())

# --- Iris: hold out 20% of the rows for testing (fixed seed for a repeatable split) ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the Iris features. The scaler is fitted on the training rows only
# and then applied to the test rows, so test data never influences the scaling.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Data preprocessing for Iris dataset completed.")


['Orders', 'Returns', 'People']
   Row ID         Order ID Order Date  Ship Date     Ship Mode Customer ID  \
0   32298   CA-2012-124891 2012-07-31 2012-07-31      Same Day    RH-19495   
1   26341    IN-2013-77878 2013-02-05 2013-02-07  Second Class    JR-16210   
2   25330    IN-2013-71249 2013-10-17 2013-10-18   First Class    CR-12730   
3   13524  ES-2013-1579342 2013-01-28 2013-01-30   First Class    KM-16375   
4   47221     SG-2013-4320 2013-11-05 2013-11-06      Same Day     RH-9495   

      Customer Name      Segment           City            State  ...  \
0       Rick Hansen     Consumer  New York City         New York  ...   
1     Justin Ritter    Corporate     Wollongong  New South Wales  ...   
2      Craig Reiter     Consumer       Brisbane       Queensland  ...   
3  Katherine Murray  Home Office         Berlin           Berlin  ...   
4       Rick Hansen     Consumer          Dakar            Dakar  ...   

         Product ID    Category Sub-Category  \
0   TEC-AC-1

In [4]:
pip install mlflow




In [5]:
import mlflow

In [6]:
import os

# Environment configuration read later by MLflow and the libraries it loads.
# NOTE(review): 'quiet' presumably suppresses GitPython's message about a
# missing git executable -- confirm against the GitPython documentation.
os.environ['GIT_PYTHON_REFRESH'] = 'quiet'

# Keep MLflow runs in a local, file-based store.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
os.environ['MLFLOW_TRACKING_URI'] = 'file:///D:/data science QT/Mlops'


In [7]:
# Make "iris-analysis" the active experiment, then attach the experiment-level tags.
experiment_name = "iris-analysis"
experiment_tags = {"iris-tag": "ex1"}

mlflow.set_experiment(experiment_name)
for tag_key, tag_value in experiment_tags.items():
    mlflow.set_experiment_tag(tag_key, tag_value)

In [8]:
# Train a random forest and a gradient-boosting classifier on the scaled Iris
# split, then record hyperparameters, train/test accuracy and both fitted
# models in a single MLflow run.
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hyperparameters are declared once and used for both the estimators and the
# MLflow log, so the logged values cannot drift from the ones actually trained.
model_seed = 42  # fixed seed so the logged scores can be reproduced
rf_n_estimators = 5000
rf_params = {
    "n_estimators": rf_n_estimators,
    "max_depth": 6,
    "random_state": model_seed,
}
gbm_params = {
    "n_estimators": 3000,
    "max_depth": 6,
    "learning_rate": 0.01,
    "min_samples_split": 5,
    "random_state": model_seed,
}

# End any active run before starting a new one
if mlflow.active_run():
    mlflow.end_run()

# Start a new run
with mlflow.start_run():
    # Model object initialization
    RandFor = RandomForestClassifier(**rf_params)
    GBM = GradientBoostingClassifier(**gbm_params)

    # Logging parameters. Every key carries its model prefix; the forest's tree
    # count was previously logged under the ambiguous key "n_estimators".
    mlflow.log_params({
        "rf_n_estimators": rf_params["n_estimators"],
        "rf_max_depth": rf_params["max_depth"],
        "gbm_n_estimators": gbm_params["n_estimators"],
        "gbm_max_depth": gbm_params["max_depth"],
        "gbm_learning_rate": gbm_params["learning_rate"],
        "gbm_minimum_samples_split": gbm_params["min_samples_split"],
        "random_state": model_seed,
    })

    # Training models (fit returns the estimator itself)
    rf = RandFor.fit(X_train, y_train)
    gbm = GBM.fit(X_train, y_train)

    # Training scores (mean accuracy on the training rows)
    training_score_rf = rf.score(X_train, y_train)
    training_score_gbm = gbm.score(X_train, y_train)
    mlflow.log_metrics({
        "rf_training_score": training_score_rf,
        "gbm_training_score": training_score_gbm,
    })

    # Value predictions on the held-out rows
    y_pred_rf = rf.predict(X_test)
    y_pred_gbm = gbm.predict(X_test)

    # Prediction evaluations; accuracy_score expects (y_true, y_pred)
    testing_score_rf = accuracy_score(y_test, y_pred_rf)
    testing_score_gbm = accuracy_score(y_test, y_pred_gbm)
    mlflow.log_metrics({
        "rf_testing_score": testing_score_rf,
        "gbm_testing_score": testing_score_gbm,
    })

    # Input example: the first training row as a {feature_i: value} mapping.
    # The feature_i names are generated here from the column count; both
    # models are fitted on the same data, so they share the same example.
    feature_names = [f'feature_{i}' for i in range(X_train.shape[1])]
    input_example_rf = dict(zip(feature_names, X_train[0]))
    input_example_gbm = dict(input_example_rf)

    # Logging models with input examples
    mlflow.sklearn.log_model(rf, "random-forest-model", input_example=input_example_rf)
    mlflow.sklearn.log_model(gbm, "gradient-boosting-model", input_example=input_example_gbm)

    # Setting tags
    mlflow.set_tag("iteration_name", "test_rf_gb_sklearn")
    # NOTE(review): "loan default" does not describe the Iris data used in this
    # run; it looks like a leftover from another notebook -- confirm the value.
    mlflow.set_tag("feature_set_version", "loan default")




# To browse these runs, start the UI against the same store as MLFLOW_TRACKING_URI:
#   mlflow ui --backend-store-uri "file:///D:/data science QT/Mlops"

In [7]:
#!pip install mlflow

In [9]:
!python -m pip install --upgrade pip



In [10]:
!python -m pip install mlflow


