In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [None]:
def _read_csv_with_missing_markers(csv_path):
    """Read a CSV and replace the '-1' string and the -1.0 float markers with NaN."""
    frame = pd.read_csv(csv_path)
    frame = frame.replace('-1', np.nan)
    return frame.replace(np.float64(-1.0), np.nan)


# Train and submission files share the same -1 -> NaN clean-up.
train_df = _read_csv_with_missing_markers('../data/raw/train_data.csv')
submission_df = _read_csv_with_missing_markers('../data/raw/submission_data.csv')
# The template is read as-is; no marker replacement is applied to it.
template_df = pd.read_csv('../data/processed/submission_template.csv')

In [None]:
# Parse each date column in place (unparseable values become NaT) and add
# `<col>_year`, `<col>_month`, `<col>_day` columns next to it.
date_cols = ["launch_date", "date"]  # Adjust as necessary
for date_col in date_cols:
    parsed = pd.to_datetime(train_df[date_col], errors='coerce')
    train_df[date_col] = parsed
    for part in ("year", "month", "day"):
        train_df[f"{date_col}_{part}"] = getattr(parsed.dt, part)

Implementaré que el conjunt de test siguin el 20% de llançaments més recents.

In [None]:
# Fraction of clusters (ordered by launch date) that goes to the training side.
perc_train_samples = 0.8

# One row per cluster with the first launch_date found in its group,
# ordered by that date.
launches = (
    train_df.groupby('cluster_nl')['launch_date']
    .first()
    .reset_index()
    .sort_values('launch_date')
)
print(launches)

# Position of the first cluster that belongs to the held-out side.
cutoff = int(len(launches) * perc_train_samples)
cutoff_launch_date = launches['launch_date'].iloc[cutoff]
print(f"Train cutoff: {cutoff_launch_date}")

# Clusters before the cutoff position train the model; the rest are held out.
train_cluster_nls = launches['cluster_nl'].iloc[:cutoff]
test_cluster_nls = launches['cluster_nl'].iloc[cutoff:]

In [None]:
print(f"All data shape: {train_df.shape}")

# Rows are assigned to a side according to the cluster they belong to.
in_train = train_df['cluster_nl'].isin(train_cluster_nls)
train_data = train_df[in_train]
print(f"Train data shape: {train_data.shape}")

in_test = train_df['cluster_nl'].isin(test_cluster_nls)
test_data = train_df[in_test]
print(f"Test data shape: {test_data.shape}")

# Features are everything except the target column.
X_train, y_train = train_data.drop(columns='target'), train_data['target']
X_test, y_test = test_data.drop(columns='target'), test_data['target']

# Independent copy of the held-out cluster/date/target columns, kept for
# computing the custom metric later on.
metric_df = test_data.loc[:, ['cluster_nl', 'date', 'target']].copy()


In [None]:
# Select every numeric dtype instead of only int64/float64. Since pandas 2.0
# the `.dt.year/.month/.day` accessors return int32 when a column has no NaT,
# so the `*_year`, `*_month`, `*_day` columns would otherwise be matched by
# neither list and silently dropped by the ColumnTransformer below.
numeric_features = X_train.select_dtypes(include="number").columns
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Columns in neither list (e.g. the raw datetime columns) are dropped, which
# is ColumnTransformer's default `remainder` behaviour.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [None]:
# Base XGBoost regressor with a squared-error objective and a fixed seed;
# no other hyperparameters are set here.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
)

In [None]:
# Chain preprocessing and the regressor so they are fitted together.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", xgb_model),
    ]
)

# Hyperparameter grid: only learning rate and column subsampling vary;
# tree count and depth are pinned to a single value each.
param_grid = dict(
    regressor__n_estimators=[300],
    regressor__learning_rate=[0.03, 0.1, 0.3],
    regressor__max_depth=[7],
    regressor__colsample_bytree=[0.7, 0.9, 1.0],
)

# NOTE(review): an integer `cv` splits rows, not clusters, so rows from the
# same cluster_nl can end up in both the fitting and validation folds.
# Consider a group-aware splitter -- confirm whether that matters here.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Pipeline with the best-scoring parameter combination.
best_model = grid_search.best_estimator_

In [None]:
# Raw cross-validation results and their mean test scores
# (negated MSE, per the `scoring` given to the search).
results = grid_search.cv_results_
mean_losses = results['mean_test_score']

# Tabular view of every parameter combination that was evaluated.
results_df = pd.DataFrame(results)

# Rank of each combination: rows = learning rate, columns = colsample_bytree.
pivot_table = pd.pivot_table(
    results_df,
    values='rank_test_score',
    index='param_regressor__learning_rate',
    columns='param_regressor__colsample_bytree',
)
results_df

In [None]:
from helper import compute_metric, _metrics, unaveraged_CYME
def predict_and_measure_performance(model, X_test, y_test, metric_frame=None):
    """Predict on ``X_test`` and report MSE plus the CYME metric.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : features passed to ``model.predict``.
    y_test : true targets compared against the predictions for the MSE.
    metric_frame : optional frame handed to ``unaveraged_CYME`` together with
        the predictions. When omitted, the notebook-level ``metric_df`` is
        used, which matches the previous behaviour. Pass it explicitly when
        evaluating any other split so the metric is not computed against the
        wrong rows.
        NOTE(review): presumably its rows must line up with ``X_test`` --
        confirm against ``helper.unaveraged_CYME``.

    Returns
    -------
    tuple
        ``(mse, cyme)``; both values are also printed.
    """
    if metric_frame is None:
        # Backward-compatible fallback to the global built from the held-out data.
        metric_frame = metric_df

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE:  {mse}")

    cyme = unaveraged_CYME(metric_frame, y_pred)
    print(f"CYME: {cyme}")
    return mse, cyme


mse, cyme = predict_and_measure_performance(best_model, X_test, y_test, metric_frame=metric_df)


In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_importance


# Pull the fitted regressor and preprocessor out of the best pipeline.
xgb_regressor = best_model.named_steps["regressor"]
preprocessor = best_model.named_steps["preprocessor"]

# One importance value per column produced by the preprocessor.
importance = xgb_regressor.feature_importances_

# Names of the transformed columns; fall back to generic indices when the
# preprocessor cannot report them.
if hasattr(preprocessor, "get_feature_names_out"):
    feature_names = preprocessor.get_feature_names_out()
else:
    feature_names = [f"Feature_{i}" for i in range(importance.shape[0])]

# Full table sorted by importance. The previous version sliced the first 50
# columns *before* sorting, so the chart showed whichever features happened
# to come first in the transformer output rather than the most important ones.
importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importance
}).sort_values(by="Importance", ascending=False)

# Only the 50 most important features are drawn, to keep the chart readable.
top_importance_df = importance_df.head(50)

plt.figure(figsize=(12, 8))
plt.barh(top_importance_df["Feature"], top_importance_df["Importance"], color="skyblue")
plt.xlabel("Importance", fontsize=14)
plt.ylabel("Features", fontsize=8)
plt.title("Feature Importance (Sorted)", fontsize=16)
plt.gca().invert_yaxis()  # Invert y-axis for highest importance at the top
plt.grid(axis="x", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Keep only the columns produced by the "num" branch of the ColumnTransformer,
# whose output names are prefixed with "num__". A plain substring match on
# 'num' would also pick up one-hot columns whose category text happens to
# contain "num".
df_num = importance_df[importance_df.Feature.str.startswith('num__')]

plt.figure(figsize=(12, 8))
plt.barh(df_num["Feature"], df_num["Importance"], color="skyblue")
plt.xlabel("Importance", fontsize=14)
plt.ylabel("Features", fontsize=8)
plt.title("Feature Importance (Sorted)", fontsize=16)
plt.gca().invert_yaxis()  # Invert y-axis for highest importance at the top
plt.grid(axis="x", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Number of importance values reported by the regressor.
importance.shape[0]

In [None]:
# Split the submission frame into features and its target column.
features_test = submission_df.drop(columns=['target'])
y = submission_df.loc[:, 'target']

# Same date expansion as applied to the training frame: parse each column
# (bad values become NaT) and add year / month / day columns.
date_cols = ["launch_date", "date"]  # Adjust as necessary
for date_col in date_cols:
    parsed = pd.to_datetime(features_test[date_col], errors='coerce')
    features_test[date_col] = parsed
    for part in ("year", "month", "day"):
        features_test[f"{date_col}_{part}"] = getattr(parsed.dt, part)

# The raw date columns are removed once their parts have been extracted.
features_test = features_test.drop(columns=date_cols)

In [None]:
# Any training column absent from the submission features is added as a
# constant 0 so the frame can be laid out exactly like X_train.
missing_cols = set(X_train.columns).difference(features_test.columns)
for absent_col in missing_cols:
    features_test[absent_col] = 0

# Same columns, same order as the training features.
new_data = features_test.loc[:, X_train.columns]
predictions = best_model.predict(new_data)

In [None]:
# Work on a copy of the model input with the predictions attached.
new_data_with_predictions = new_data.assign(prediction=predictions)

# Reassemble each listed date column from its year / month / day parts.
date_cols = ["date"]
for col in date_cols:
    year_col, month_col, day_col = (f"{col}_{part}" for part in ("year", "month", "day"))
    part_cols = [year_col, month_col, day_col]

    # Skip columns whose parts are not all present.
    if not set(part_cols).issubset(new_data_with_predictions.columns):
        continue

    # Invalid or missing part combinations become NaT.
    new_data_with_predictions[col] = pd.to_datetime(
        {
            "year": new_data_with_predictions[year_col],
            "month": new_data_with_predictions[month_col],
            "day": new_data_with_predictions[day_col],
        },
        errors='coerce',
    )
    # The separate parts are no longer needed once the date is rebuilt.
    new_data_with_predictions = new_data_with_predictions.drop(columns=part_cols)

In [None]:
# Feature columns removed before the predictions are merged into the template.
_feature_columns_to_drop = [
    'brand', 'che_pc_usd', 'che_perc_gdp', 'corporation',
    'country', 'drug_id', 'ind_launch_date', 'indication',
    'insurance_perc_che', 'population', 'prev_perc', 'price_month',
    'price_unit', 'public_perc_che', 'therapeutic_area',
    'launch_date_year', 'launch_date_month', 'launch_date_day',
    'launch_date',
]
final_df = new_data_with_predictions.drop(columns=_feature_columns_to_drop)

In [None]:

# Both sides need a datetime `date` so the join keys compare equal.
for frame in (template_df, final_df):
    frame["date"] = pd.to_datetime(frame["date"], errors="coerce")

# Left join keeps every template row; clashing columns coming from final_df
# receive the "_pred" suffix.
filled_df = pd.merge(
    template_df,
    final_df,
    how="left",
    on=["date", "cluster_nl"],
    suffixes=("", "_pred"),
)

# Fill empty template predictions with the model output, then discard the helper column.
filled_df["prediction"] = filled_df["prediction"].fillna(filled_df["prediction_pred"])
filled_df = filled_df.drop(columns=["prediction_pred"])

In [None]:
# Display the template after it has been filled with the model predictions.
filled_df

In [None]:
# Display the template frame (its `date` column was converted to datetime in the merge cell).
template_df

In [None]:
# Write the filled template to disk without the index column.
filled_df.to_csv(
    '../data/outputs/try1.csv',
    index=False,
)