In [38]:
import pandas as pd
import numpy as np
import sys
sys.path.append('/anaconda/envs/azureml_py38/lib/python3.8/site-packages')
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Path of the input CSV. Layout, as consumed by this cell:
#   line 1: parameter names
#   line 2: parameter values
#   line 3: skipped
#   line 4: column header, followed by the data rows
file_path = 'your_data.csv'

# Read the first two rows for parameters.
# 'utf-8-sig' strips a leading UTF-8 byte-order mark if present, so the first
# parameter name is not silently turned into '\ufeffDateGranularity'.
with open(file_path, 'r', encoding='utf-8-sig') as file:
    first_line = [cell.strip() for cell in file.readline().strip().split(',')]
    second_line = [cell.strip() for cell in file.readline().strip().split(',')]

# Create dictionaries for standard and custom parameters.
# Empty names (e.g. from trailing commas padding the row) are ignored so they
# do not show up as a bogus '' custom parameter.
standard_params = ['DateGranularity', 'PeriodStartDate', 'PeriodEndDate', 'PredictionTimeWindow', 'MeasureColumn', 'TimeColumn']
params = {name: value for name, value in zip(first_line, second_line) if name}
custom_params = [param for param in params if param not in standard_params]

# Fail early, with context, when a required parameter is absent instead of
# hitting a bare KeyError somewhere further down the notebook.
missing_params = [name for name in standard_params if name not in params]
if missing_params:
    raise KeyError(f"Missing required parameter(s) in the first two rows of '{file_path}': {missing_params}")

# Validate and convert standard parameter values
params['PeriodStartDate'] = pd.to_datetime(params['PeriodStartDate'])
params['PeriodEndDate'] = pd.to_datetime(params['PeriodEndDate'])
params['PredictionTimeWindow'] = int(params['PredictionTimeWindow'])

# Define the TimeColumn and MeasureColumn based on standard parameters
TimeColumn = params['TimeColumn']
MeasureColumn = params['MeasureColumn']

# Read the dimensions data: skip the two parameter lines and the line after
# them; the next line is used as the column header.
data = pd.read_csv(file_path, skiprows=[0, 1, 2], header=0)
print("Raw Dimensions Data (first 5 rows):")
print(data.head())

# Convert TimeColumn to datetime (unparseable values become NaT) BEFORE looking
# at the range, so min/max compare timestamps rather than raw strings.
data[TimeColumn] = pd.to_datetime(data[TimeColumn], errors='coerce')
print(f"Date range in the file: {data[TimeColumn].min()} to {data[TimeColumn].max()}")

# Filter data based on the date range (both bounds inclusive; NaT rows never match).
# If nothing falls inside the period the data is deliberately left unfiltered
# and only a warning is printed.
in_period = data[TimeColumn].between(params['PeriodStartDate'], params['PeriodEndDate'])
if in_period.any():
    data = data[in_period]
else:
    print("No data within the specified date range. Please check the PeriodStartDate and PeriodEndDate.")



In [40]:
# Relies on `params`, `standard_params`, `custom_params` and `data` produced by
# the file-reading cell.

# Each section is a heading plus its (name, value) pairs; standard parameters
# are looked up with .get(), custom ones by direct key access.
parameter_sections = (
    ("Standard Parameters:", [(name, params.get(name)) for name in standard_params]),
    ("Custom Parameters:", [(name, params[name]) for name in custom_params]),
)

for heading, entries in parameter_sections:
    print(heading)
    for name, value in entries:
        print(f"{name}: {value}")
    # Blank separator after every section
    print("\n")

# Snapshot of the dimensions data
print("Dimensions Data (first 5 rows):")
print(data.head())


Standard Parameters:
DateGranularity: M
PeriodStartDate: 2023-12-07 00:00:00
PeriodEndDate: 2025-12-06 00:00:00
PredictionTimeWindow: 24
MeasureColumn: _Value
TimeColumn: _Time


Custom Parameters:
P1: Value1
P2: Value 2


Dimensions Data (first 5 rows):
       _Time  _Value       ProductVariantName WarehouseLocationName  \
0 2018-01-01     420  Car Audio Unit-65-Black               Store 2   
1 2018-01-01     275       Car Audio Unit-500               Store 4   
2 2018-01-01     239  Car Audio Unit-65-Black               Store 4   
3 2018-01-01     401       Car Audio Unit-500   Distribution center   
4 2018-01-01     346       Car Audio Unit-500               Store 1   

   Unnamed: 4  Unnamed: 5  Unnamed: 6  Unnamed: 7  
0         NaN         NaN         NaN         NaN  
1         NaN         NaN         NaN         NaN  
2         NaN         NaN         NaN         NaN  
3         NaN         NaN         NaN         NaN  
4         NaN         NaN         NaN         NaN  


In [35]:
# Snap timestamps to the start of their period according to DateGranularity and
# replace the time column with Year / Month / Day feature columns.

# Confirm the configured time column name
print(f"Time column as per parameters: '{TimeColumn}'")

# Print the actual column names from the DataFrame for verification
print("Actual column names in DataFrame:")
print(data.columns.tolist())

# The existence check comes BEFORE any access to data[TimeColumn]; otherwise a
# missing column (for instance when this cell is run a second time, after the
# column has already been dropped) raises KeyError instead of printing the hint.
if TimeColumn in data.columns:
    timestamps = pd.to_datetime(data[TimeColumn], errors='coerce')

    # Only monthly ('M') and weekly ('W') granularities are snapped to the
    # period start; any other value leaves the timestamps unchanged.
    # Add conditions for other granularities if required.
    granularity = params['DateGranularity']
    if granularity in ('M', 'W'):
        timestamps = timestamps.dt.to_period(granularity).dt.to_timestamp()

    # Build a new frame rather than mutating in place: `data` may be a filtered
    # slice of the raw frame. The original time column is dropped because the
    # calendar features replace it.
    data = data.assign(
        Year=timestamps.dt.year,
        Month=timestamps.dt.month,
        Day=timestamps.dt.day,
    ).drop(columns=[TimeColumn])
else:
    print(f"Column '{TimeColumn}' not found in the data. Please check the DataFrame columns.")

Time column as per parameters: '_Time'
Actual column names in DataFrame:
['_Time', '_Value', 'ProductVariantName', 'WarehouseLocationName', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7']


In [36]:
# Build the feature matrix / target vector and train the regressor.
# NOTE(review): the target column is hard-coded as '_Value' here while the
# forecast cell labels predictions with MeasureColumn — confirm they always match.

# Columns that are not used as model features (the target plus un-encoded
# dimension / parameter columns).
columns_to_drop = ['_Value', 'ProductVariantName', 'WarehouseLocationName', 'P1', 'P2']

# Coerce the target to numeric; values that cannot be parsed become NaN.
target = pd.to_numeric(data['_Value'], errors='coerce')

# Remove rows without a usable target BEFORE deriving X and y, so that both are
# built from the same rows and no NaN label reaches model.fit(). (Dropping rows
# from `data` after X and y were already extracted had no effect on them.)
has_target = target.notna()
if not has_target.any():
    raise ValueError("No rows with a numeric '_Value' are available for training.")
data = data.loc[has_target]

# Drop only the columns that actually exist in the DataFrame
columns_to_drop = [col for col in columns_to_drop if col in data.columns]
X = data.drop(columns=columns_to_drop)
y = target.loc[has_target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = XGBRegressor()
model.fit(X_train, y_train)


In [37]:
# Generate the future timestamps to forecast: PredictionTimeWindow steps at the
# configured granularity, starting the day after PeriodEndDate.
granularity = params['DateGranularity']
future_dates = pd.date_range(
    start=params['PeriodEndDate'] + pd.Timedelta(days=1),
    periods=params['PredictionTimeWindow'],
    freq=granularity,
)

# The training data had its timestamps snapped to the START of each month/week
# before Year/Month/Day were extracted, whereas date_range with 'M'/'W' yields
# period-END dates. Derive the calendar features from period starts as well, so
# the model sees the same kind of 'Day' values it was trained on. The exported
# timestamps (future_dates) are left unchanged.
if granularity in ('M', 'W'):
    feature_dates = pd.Series(future_dates.to_period(granularity).to_timestamp())
else:
    feature_dates = pd.Series(future_dates)

# Create a dataframe for future predictions
future_data = pd.DataFrame({TimeColumn: future_dates})
future_data['Year'] = feature_dates.dt.year
future_data['Month'] = feature_dates.dt.month
future_data['Day'] = feature_dates.dt.day

# Match the training columns and their order. Columns the model was trained on
# but that are absent here are created and filled with 0; columns the model
# does not know (such as the time column) are discarded. This also makes
# explicit zero-filled placeholder columns unnecessary.
future_data = future_data.reindex(columns=X_train.columns, fill_value=0)

# Predict future values
future_predictions = model.predict(future_data)

# Prepare the forecast dataframe
forecast = pd.DataFrame({TimeColumn: future_dates, MeasureColumn: future_predictions})
forecast['ProductVariantName'] = 'default_variant'  # Replace with actual values or logic
forecast['WarehouseLocationName'] = 'default_location'  # Replace with actual values or logic

# Export the forecast
forecast.to_csv('forecasted_output.csv', index=False)

In [4]:
# Deployment setup; usually run from a Python script or a Jupyter Notebook.
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment

# Build the environment named "sklearn-env" from the conda specification file.
env = Environment.from_conda_specification(
    file_path="./dependencies/conda.yaml",
    name="sklearn-env",
)

# Inference configuration: the scoring entry script together with that environment.
inference_config = InferenceConfig(
    environment=env,
    entry_script="./src/score.py",
)


