<a href="https://colab.research.google.com/github/BianchiLuca28/FHNW-BI-LMS/blob/main/notebooks/notebook1_luca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Importing dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
folder_path = "/content/drive/MyDrive/__Shared/BI"

In [4]:
df = pd.read_csv(folder_path + "/flattened_dataset.csv")
df.head()

Unnamed: 0,shipment_id,customer_price,expected_carrier_price,final_carrier_price,weight,shipment_type,insurance_type,domain_name,booking_state,lms_plus,...,month_delivery_date,quarter_delivery_date,full_date_real_pickup_date,year_real_pickup_date,month_real_pickup_date,quarter_real_pickup_date,full_date_real_delivery_date,year_real_delivery_date,month_real_delivery_date,quarter_real_delivery_date
0,4499940,8.64,5.92,4.96,1.5,Parcel,No_Insurance,DE,12,0,...,11.0,4.0,2023-01-10,2023,1,1,2023-01-11,2023,1,1
1,4505367,19.24,10.93,8.47,18.0,Parcel,No_Insurance,DE,12,0,...,12.0,4.0,2023-01-03,2023,1,1,2023-01-04,2023,1,1
2,4536217,19.16,10.89,8.46,18.0,Parcel,No_Insurance,DE,12,0,...,12.0,4.0,2023-01-03,2023,1,1,2023-01-04,2023,1,1
3,4554860,99.04,62.09,59.15,3.2,Parcel,No_Insurance,DE,12,0,...,12.0,4.0,2023-01-05,2023,1,1,2023-01-28,2023,1,1
4,4557691,6.7,5.88,4.63,2.0,Parcel,No_Insurance,DE,12,0,...,1.0,1.0,2022-12-20,2022,12,4,2022-12-21,2022,12,4


# Preprocessing

## Handling missing values

In [5]:
# columns with NAs (with more than 0)
df.isna().sum()[df.isna().sum() > 0]

Unnamed: 0,0
final_carrier_price,104174
margin,104174
industry,205983
segmentation,109702
delivery_postal_code,14071
name_pickup,27321
iso_country_code_pickup,27341
continent_pickup,27321
EU_pickup,27321
pickup_postal_code,1116


In [6]:
# Drop columns with too many missing values (more than 70% missing)
threshold = len(df) * 0.7
df = df.dropna(axis=1, thresh=threshold)

# Fill numeric columns with median and categorical columns with 'missing'
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
df[categorical_cols] = df[categorical_cols].fillna('missing')

## Feature selection

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Step 1: Load and Prepare Dataset
# Assuming your dataset is loaded as 'df'
target_column = 'service_type'
df = df.drop(['shipment_id'], axis=1)  # Drop unnecessary identifier

# Step 2: Encode Categorical Target
le_target = LabelEncoder()
df[target_column] = le_target.fit_transform(df[target_column])

# Step 3: Split Features and Target
X = df.drop([target_column], axis=1)
y = df[target_column]

# Step 4: Simplified Encoding Strategy for Features
# Split categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Encode categorical features:
# Apply Label Encoding to high-cardinality categorical features
for col in categorical_cols:
    if X[col].nunique() > 10:  # High cardinality threshold (e.g., >10 unique values)
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
    else:
        X = pd.get_dummies(X, columns=[col], drop_first=True)  # One-Hot Encoding for low-cardinality columns

# Step 5: Train-Test Split for Feature Selection Purposes
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Random Forest Feature Selection
# Train a RandomForestClassifier to determine feature importance
rf = RandomForestClassifier(n_estimators=50, random_state=42)  # Reduced number of estimators for simplicity
rf.fit(X_train, y_train)

# Get feature importances from the RandomForest model
feature_importances = pd.Series(rf.feature_importances_, index=X_train.columns)

# Set threshold for importance and select only the most important features
important_features = feature_importances[feature_importances > 0.01].index.tolist()  # Adjust threshold as needed

# Step 7: Create Final Dataset with Selected Features
X_selected_train = X_train[important_features]
X_selected_test = X_test[important_features]

# Output the selected features for further analysis
print("Selected Features:", important_features)

Selected Features: ['customer_price', 'expected_carrier_price', 'final_carrier_price', 'margin', 'iso_country_code_pickup', 'iso_country_code_pickup_country', 'created_date_service', 'service_name', 'carrier_name', 'full_date_delivery_date', 'continent_pickup_Europe', 'continent_pickup_country_Europe', 'transport_type_Standard']


## Feature scaling

In [13]:
# Step 1: Identify numerical features from the selected features
numerical_cols_train = X_selected_train.select_dtypes(include=['int64', 'float64']).columns
numerical_cols_test = X_selected_test.select_dtypes(include=['int64', 'float64']).columns

# Step 2: Apply Scaling to Numerical Columns
scaler = StandardScaler()

# Fit the scaler on the training set numerical features and transform both training and test set
X_selected_train[numerical_cols_train] = scaler.fit_transform(X_selected_train[numerical_cols_train])
X_selected_test[numerical_cols_test] = scaler.transform(X_selected_test[numerical_cols_test])

# Output to verify that scaling is done
print("Feature scaling applied to the following columns:", numerical_cols_train.tolist())

Feature scaling applied to the following columns: ['customer_price', 'expected_carrier_price', 'final_carrier_price', 'margin', 'iso_country_code_pickup', 'iso_country_code_pickup_country', 'created_date_service', 'service_name', 'carrier_name', 'full_date_delivery_date']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_selected_train[numerical_cols_train] = scaler.fit_transform(X_selected_train[numerical_cols_train])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_selected_test[numerical_cols_test] = scaler.transform(X_selected_test[numerical_cols_test])


# Exploratory Data Analysis

Unnamed: 0_level_0,count
service_type,Unnamed: 1_level_1
3,338463
0,300029
1,18038
4,265
2,7


# Model Training & Evaluation

In [15]:
# List of models to train
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced', multi_class='multinomial'),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, scale_pos_weight=1)  # scale_pos_weight used to handle imbalance
}

# Prepare training and test data
X_train, X_test = X_selected_train, X_selected_test
y_train, y_test = y_train, y_test

# Initialize a dictionary to store the results
results = {}

# Train each model in a loop and evaluate its performance
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=True)
    confusion = confusion_matrix(y_test, y_pred)

    # Store the results for comparison
    results[model_name] = {
        "Accuracy": accuracy,
        "Classification Report": class_report,
        "Confusion Matrix": confusion
    }

    # Print the performance metrics for the model
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion)
    print("\n" + "="*60 + "\n")

# Compare results across models
# Convert the accuracy scores to a DataFrame for easier visualization
accuracy_scores = {model: results[model]["Accuracy"] for model in results}
accuracy_df = pd.DataFrame(list(accuracy_scores.items()), columns=["Model", "Accuracy"])
print("Comparison of Model Accuracy:")
print(accuracy_df)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Random Forest
Accuracy: 0.9999
Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59994
           1       1.00      1.00      1.00      3624
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00     67694
           4       1.00      0.98      0.99        47

    accuracy                           1.00    131361
   macro avg       0.80      0.80      0.80    131361
weighted avg       1.00      1.00      1.00    131361

Confusion Matrix:
[[59992     0     0     2     0]
 [    3  3620     0     1     0]
 [    2     0     0     0     0]
 [    2     0     0 67692     0]
 [    1     0     0     0    46]]






Model: Logistic Regression
Accuracy: 0.9167
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.92      0.95     59994
           1       0.23      0.72      0.35      3624
           2       0.00      1.00      0.01         2
           3       0.99      0.93      0.96     67694
           4       0.35      1.00      0.52        47

    accuracy                           0.92    131361
   macro avg       0.51      0.91      0.56    131361
weighted avg       0.96      0.92      0.94    131361

Confusion Matrix:
[[54981  4365   481   126    41]
 [  323  2595     7   699     0]
 [    0     0     2     0     0]
 [  706  4145     2 62794    47]
 [    0     0     0     0    47]]




Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: XGBoost
Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     59994
           1       1.00      1.00      1.00      3624
           2       0.00      0.00      0.00         2
           3       1.00      1.00      1.00     67694
           4       1.00      1.00      1.00        47

    accuracy                           1.00    131361
   macro avg       0.80      0.80      0.80    131361
weighted avg       1.00      1.00      1.00    131361

Confusion Matrix:
[[59994     0     0     0     0]
 [    0  3623     0     1     0]
 [    2     0     0     0     0]
 [    1     0     0 67693     0]
 [    0     0     0     0    47]]


Comparison of Model Accuracy:
                 Model  Accuracy
0        Random Forest  0.999916
1  Logistic Regression  0.916703
2              XGBoost  0.999970


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
