In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


Linear Regression (baseline; poor fit: training R² ≈ 0.20, validation R² ≈ 0.42)

In [30]:
# Baseline: one-hot encode every feature, fit an ordinary least-squares
# regression, and report R-squared on a held-out 20% validation split.

# Load the training dataset and discard incomplete rows. `dropna()` returns a
# new frame, so re-running this cell always starts from the same state.
train_data = pd.read_excel("/kaggle/input/maersk-dataset/Training Dataset.xlsx").dropna()

# Separate features and target variable
X = train_data.drop(columns=['Sourcing Cost'])
y = train_data['Sourcing Cost']

# Every feature is treated as categorical in this baseline, so the numerical
# branch of the ColumnTransformer receives no columns.
categorical_features = ['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month of Sourcing']
numerical_features = []

# Preprocessing pipeline.
# handle_unknown='ignore': a category that shows up only in the validation
# split (or in later data) is encoded as all zeros instead of raising a
# ValueError when the pipeline scores or predicts.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])

# Define the model: preprocessing and regression in a single estimator so the
# encoder is fitted on the training split only.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split the data into training and validation sets (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model (Pipeline.score delegates to LinearRegression.score, i.e. R-squared)
train_score = model.score(X_train, y_train)
val_score = model.score(X_val, y_val)

print(f"Training R-squared score: {train_score:.4f}")
print(f"Validation R-squared score: {val_score:.4f}")

Training R-squared score: 0.1988
Validation R-squared score: 0.4187


Random Forest Regression (Also bad)

In [None]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [18]:
# Read the raw training spreadsheet into a DataFrame
train_data = pd.read_excel(io="/kaggle/input/maersk-dataset/Training Dataset.xlsx")


In [None]:

# Encode the training features for the random-forest experiment and persist
# them, together with the target, to "preprocessed_data.xlsx".

# Drop any rows with missing values. A new frame is created so `train_data`
# itself is not mutated and this cell can be re-run safely.
train_clean = train_data.dropna()

# Separate features and target variable
X = train_clean.drop(columns=['Sourcing Cost'])
y = train_clean['Sourcing Cost']

# Define categorical and numerical features.
# 'Product Type' holds text labels (e.g. "Powder"), so it has to be one-hot
# encoded; StandardScaler cannot scale strings.
categorical_features = ['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month of Sourcing']
numerical_features = []

# Preprocessing pipeline
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ],
    remainder='passthrough',  # to keep any remaining columns not specified above
    sparse_threshold=0  # always return a dense array so it can go straight into a DataFrame
)

# Combine preprocessing steps into a Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing pipeline
pipeline.fit(X)

# Transform the features
X_processed = pipeline.transform(X)

# Extract column names after preprocessing.
# get_feature_names_out returns a NumPy array; convert it to a list before
# concatenating, otherwise `+` broadcasts element-wise instead of appending.
onehot_encoder = (pipeline.named_steps['preprocessor']
                  .named_transformers_['cat']
                  .named_steps['onehot'])
passthrough_columns = [col for col in X.columns
                       if col not in categorical_features + numerical_features]
# Order mirrors the ColumnTransformer output: 'cat', then 'num', then remainder.
processed_columns = (list(onehot_encoder.get_feature_names_out(categorical_features))
                     + numerical_features
                     + passthrough_columns)

# Create DataFrame with preprocessed features
processed_data = pd.DataFrame(X_processed, columns=processed_columns)

# Add target variable to the DataFrame.
# `y` keeps the original row labels (with gaps where rows were dropped) while
# `processed_data` has a fresh 0..n-1 index, so assign by position, not label.
processed_data['Sourcing Cost'] = y.to_numpy()

# Save the preprocessed features and target variable to a file
processed_data.to_excel("preprocessed_data.xlsx", index=False)


In [None]:
# Training

# Reload the feature table written by the preprocessing step
processed_data = pd.read_excel("preprocessed_data.xlsx")

# 'Sourcing Cost' is the regression target; every other column is a feature
y = processed_data['Sourcing Cost']
X = processed_data.drop(columns=['Sourcing Cost'])

# Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training partition
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# R-squared on both partitions
train_score = model.score(X_train, y_train)
val_score = model.score(X_val, y_val)

print(f"Training R-squared score: {train_score:.4f}")
print(f"Validation R-squared score: {val_score:.4f}")


Gradient

Proper Preprocessing

In [37]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression

# Fit the one-hot + linear-regression pipeline on the complete training set
# (no validation split in this cell).

# Load the training dataset and drop rows with missing values. `dropna()`
# returns a new frame, so the cell is safe to re-run.
train_data = pd.read_excel("/kaggle/input/maersk-dataset/Training Dataset.xlsx").dropna()

# Separate features and target variable
X_train = train_data.drop(columns=['Sourcing Cost'])
y_train = train_data['Sourcing Cost']

# All features are treated as categorical, so the numerical branch is empty.
categorical_features = ['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type', 'Month of Sourcing']
numerical_features = []

# Preprocessing pipeline.
# handle_unknown='ignore': the default ('error') makes predict() raise a
# ValueError for any category not present in the training data; with 'ignore'
# such values are encoded as all zeros instead.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
model.fit(X_train, y_train)



Neural Network

In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.neural_network import MLPRegressor

# Train a two-hidden-layer MLP regressor on the one-hot encoded features and
# report R-squared on a held-out 20% test split.

# Load the dataset
data = pd.read_excel("/kaggle/input/maersk-dataset/Training Dataset.xlsx")

# Separate features and target variable
X = data.drop(columns=['Sourcing Cost'])
y = data['Sourcing Cost']

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define categorical and numerical features.
# 'Month of Sourcing' is the only column sent through the scaler here.
categorical_features = ['ProductType', 'Manufacturer', 'Area Code', 'Sourcing Channel', 'Product Size', 'Product Type']
numerical_features = ['Month of Sourcing']

# Preprocessing pipeline.
# handle_unknown='ignore' keeps predict() from raising when the test split
# (or later input) contains a category that was absent from the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define the neural network regressor.
# random_state pins the weight initialisation and batch shuffling; without it
# the reported score changes on every run.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=500, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Predict the sourcing cost for the test dataset
y_pred = model.predict(X_test)

# Calculate the R-squared score
r_squared = r2_score(y_test, y_pred)
print(f"R-squared score for the test data: {r_squared:.4f}")

R-squared score for the test data: 0.5276


In [45]:
import joblib

# Destination for the serialized model
model_path = "/kaggle/working/model.pkl"

# Persist the current `model` object to disk with joblib
joblib.dump(value=model, filename=model_path)

['/kaggle/working/model.pkl']

In [None]:
# import pandas as pd

# # Define the combinations
# combinations = {
#     'ProductType': ['NTM1', 'NTM2', 'NTM3'],
#     'Manufacturer': ['Manufacturer_X', 'Manufacturer_Y'],
#     'Area Code': ['Area_1', 'Area_2'],
#     'Sourcing Channel': ['Channel_1', 'Channel_2'],
#     'Product Size': ['Size_S', 'Size_M'],
#     'Product Type': ['Type_X', 'Type_Y']
# }

# # Create DataFrame for combinations
# test_combinations = pd.DataFrame(combinations)

# # Set 'Month of Sourcing' to 'June 2021'
# test_combinations['Month of Sourcing'] = 'June 2021'

# # Predict the sourcing cost for the test combinations
# y_pred_combinations = model.predict(test_combinations)

# # Add predicted sourcing cost to the DataFrame
# test_combinations['Predicted Sourcing Cost'] = y_pred_combinations

# # Display the DataFrame
# print(test_combinations)


In [53]:
import pandas as pd

# Load the test spreadsheet into a DataFrame
df = pd.read_excel("/kaggle/input/maersk-dataset/Test Dataset.xlsx")

# Columns to show, in display order
columns_to_print = [
    "ProductType",
    "Manufacturer",
    "Area Code",
    "Sourcing Channel",
    "Product Size",
    "Product Type",
    "Month of Sourcing",
]
print(df.loc[:, columns_to_print])


   ProductType Manufacturer Area Code Sourcing Channel Product Size  \
0         NTM1           X1        A1           DIRECT        Small   
1         NTM1           X1       A10           DIRECT        Large   
2         NTM1           X1       A10             ECOM        Large   
3         NTM1           X1       A11           DIRECT        Large   
4         NTM1           X1        A2           DIRECT        Large   
..         ...          ...       ...              ...          ...   
91        NTM3           X1       A44           DIRECT        Small   
92        NTM3           X1        A8           DIRECT        Large   
93        NTM3           X1        A8           DIRECT        Small   
94        NTM3           X2       A20           DIRECT        Large   
95        NTM3           X3       A22           RETAIL        Large   

   Product Type Month of Sourcing  
0        Powder        2021-06-21  
1        Powder        2021-06-21  
2        Powder        2021-06-21  
3  

In [63]:
# Debugging aid: show what `model` is currently bound to in the kernel.
# NOTE(review): the recorded output is numpy.ndarray, so when this ran `model`
# was not a fitted Pipeline — presumably it had been rebound by a cell that is
# no longer in the notebook; confirm after Restart & Run All.
type(model)

numpy.ndarray

In [None]:
# Prediction and Comparison

import pandas as pd
import numpy as np
import pickle

import joblib  # imported here so this cell does not rely on an earlier cell's import

# Load the trained model.
# The file was written with joblib.dump, so it is read back with joblib.load;
# plain pickle.load is not guaranteed to reconstruct a joblib file.
# NOTE: like pickle, joblib.load can execute arbitrary code — only load model
# files you produced yourself.
model = joblib.load("/kaggle/working/model.pkl")

# Define the features for prediction (a single row)
data = {
    'ProductType': ['NTM1'],
    'Manufacturer': ['X1'],
    'Area Code': ['A1'],
    'Sourcing Channel': ['DIRECT'],
    'Product Size': ['Small'],
    'Product Type': ['Powder'],
    # The neural-network pipeline scales this column numerically, so it needs
    # a real timestamp rather than the display string 'Jun-21'.
    # NOTE(review): 2021-06-21 matches the dates printed from the test
    # spreadsheet — confirm it is the intended value.
    'Month of Sourcing': [pd.Timestamp('2021-06-21')]
}

# Create a DataFrame from the data
input_data = pd.DataFrame(data)

# Predict the sourcing cost.
# The DataFrame is passed as-is: the pipeline's ColumnTransformer selects
# columns by name, which is not possible on a bare NumPy array.
predicted_cost = model.predict(input_data)

# Print the predicted sourcing cost
print("Predicted Sourcing Cost:", predicted_cost[0])
