In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
file_path = r'data_output\df_lav1.csv'
df_lav1 = pd.read_csv(file_path)

In [4]:
df_lav1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104376 entries, 0 to 104375
Data columns (total 15 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   Unnamed: 0                            104376 non-null  int64  
 1   Anag_oc_sintesi_progetto              104376 non-null  object 
 2   Prog_OC_COD_CATEGORIA_SPESA           104376 non-null  object 
 3   Altr_CUP_COD_NATURA                   104376 non-null  int64  
 4   Altr_CUP_COD_SETTORE                  104376 non-null  int64  
 5   Altr_OC_COD_TIPO_AIUTO                104376 non-null  object 
 6   Sogg_OC_DENOM_PROGRAMMATORE           104376 non-null  object 
 7   Sogg_OC_DESCR_FORMA_GIU_BENEFICIARIO  104245 non-null  object 
 8   New_prj_duration_pred                 104376 non-null  int64  
 9   New_marginal_cost                     104376 non-null  float64
 10  New_DEN_REGIONE_new                   104376 non-null  object 
 11  

In [5]:
df_lav1 = df_lav1.drop(columns=["Unnamed: 0","New_DEN_PROVINCIA_new", "New_marginal_cost", "Altr_CUP_COD_NATURA", "Altr_CUP_COD_SETTORE","New_DEN_COMUNE_new"])

In [6]:
#categorical_columns_to_force = ['Altr_CUP_COD_NATURA', 'Altr_CUP_COD_SETTORE', 'New_Risk']
categorical_columns_to_force = ['New_Risk']

df_lav1[categorical_columns_to_force] = df_lav1[categorical_columns_to_force].astype('category')

In [7]:
# Identifica i tipi di colonne
text_cols = ['Anag_oc_sintesi_progetto', 'Sogg_OC_DESCR_FORMA_GIU_BENEFICIARIO','Sogg_OC_DENOM_PROGRAMMATORE']
numeric_cols = ['New_prj_duration_pred', 'New_marginal_cost']
#categorical_cols = list(set(X.columns) - set(text_cols) - set(numeric_cols))
categorical_cols = list(set(df_lav1.columns) - set(numeric_cols)-set(text_cols))

In [8]:
nan_or_empty = df_lav1[text_cols].apply(lambda col: col.isna() | col.str.strip().eq(""))
df_lav1 = df_lav1[~nan_or_empty.any(axis=1)].copy()

### Dont consider invalid string for project description
We consider that project description is an important features (confirmed by feature importance permutation) so because we have a big dataset we consider dont loosing information if we don consider rows with invalid string for project description, we made this applying the following funcion. From the following code we see that we have about 15 k record dont acceptable but all in class 0 --- the most unbalaced, so we accept to delete.

In [9]:
import re

# Funzione per identificare stringhe non accettabili
def is_invalid_string(text):
    if pd.isna(text):  # Controllo per NaN
        return True
    text = text.strip()  # Rimuove spazi iniziali e finali
    # Controlla se il testo è vuoto, troppo corto o ha solo caratteri non significativi
    return len(text) < 4 or re.fullmatch(r"[.]+", text) is not None

# Applica il filtro alla colonna 'Anag_oc_sintesi_progetto'
invalid_rows = df_lav1["Anag_oc_sintesi_progetto"].apply(is_invalid_string)

# Seleziona i record con stringhe non accettabili
invalid_records = df_lav1[invalid_rows]

# Conta il numero di categorie target impattate
categories_impacted = invalid_records["New_Risk"].nunique()  # Sostituisci 'target' con il nome della tua colonna target

# Conta le occorrenze per ogni categoria
categories_count = invalid_records["New_Risk"].value_counts()

# Stampa i risultati
print("Record con stringhe non accettabili nella colonna 'Anag_oc_sintesi_progetto':")
print(invalid_records)
print(f"Numero di record con stringhe non accettabili: {len(invalid_records)}")
print(f"Numero di categorie target impattate: {categories_impacted}")
print("Conteggio dei record impattati per categoria target:")
print(categories_count)

Record con stringhe non accettabili nella colonna 'Anag_oc_sintesi_progetto':
      Anag_oc_sintesi_progetto Prog_OC_COD_CATEGORIA_SPESA  \
3116                       M+M                         055   
5275                       XXX                         001   
34002                     ....                         051   
34003                     ....                         051   
34004                     ....                         051   
...                        ...                         ...   
74141                       SA                         067   
74226                       SS                         067   
75074                        G                         067   
77539                        N                         067   
85222                      VGM                         067   

      Altr_OC_COD_TIPO_AIUTO Sogg_OC_DENOM_PROGRAMMATORE  \
3116                       F             REGIONE TOSCANA   
5275                       D             REGIONE TOSCANA 

In [10]:
df_lav1 = df_lav1[~invalid_rows]

In [11]:
#df_lav1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88852 entries, 0 to 104375
Data columns (total 9 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   Anag_oc_sintesi_progetto              88852 non-null  object  
 1   Prog_OC_COD_CATEGORIA_SPESA           88852 non-null  object  
 2   Altr_OC_COD_TIPO_AIUTO                88852 non-null  object  
 3   Sogg_OC_DENOM_PROGRAMMATORE           88852 non-null  object  
 4   Sogg_OC_DESCR_FORMA_GIU_BENEFICIARIO  88852 non-null  object  
 5   New_prj_duration_pred                 88852 non-null  int64   
 6   New_DEN_REGIONE_new                   88852 non-null  object  
 7   New_Risk                              88852 non-null  category
 8   Cluster_Comune                        84408 non-null  object  
dtypes: category(1), int64(1), object(7)
memory usage: 6.2+ MB


In [12]:
X = df_lav1.drop('New_Risk', axis=1)
y = df_lav1['New_Risk']

In [13]:
# repeat it here to apply on the X with no target in
text_cols = ['Anag_oc_sintesi_progetto', 'Sogg_OC_DESCR_FORMA_GIU_BENEFICIARIO','Sogg_OC_DENOM_PROGRAMMATORE']
numeric_cols = ['New_prj_duration_pred']
#categorical_cols = list(set(X.columns) - set(text_cols) - set(numeric_cols))
categorical_cols = list(set(X.columns) - set(numeric_cols)-set(text_cols))

In [14]:
#X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88852 entries, 0 to 104375
Data columns (total 8 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   Anag_oc_sintesi_progetto              88852 non-null  object
 1   Prog_OC_COD_CATEGORIA_SPESA           88852 non-null  object
 2   Altr_OC_COD_TIPO_AIUTO                88852 non-null  object
 3   Sogg_OC_DENOM_PROGRAMMATORE           88852 non-null  object
 4   Sogg_OC_DESCR_FORMA_GIU_BENEFICIARIO  88852 non-null  object
 5   New_prj_duration_pred                 88852 non-null  int64 
 6   New_DEN_REGIONE_new                   88852 non-null  object
 7   Cluster_Comune                        84408 non-null  object
dtypes: int64(1), object(7)
memory usage: 6.1+ MB


### Preprocessing

In [15]:
text_transformers = ColumnTransformer(
    transformers=[
        ('text1', TfidfVectorizer(), 'Anag_oc_sintesi_progetto'),
        ('text2', TfidfVectorizer(), 'Sogg_OC_DESCR_FORMA_GIU_BENEFICIARIO'),
        ('text3', TfidfVectorizer(), 'Sogg_OC_DENOM_PROGRAMMATORE')
    ],
    remainder='passthrough'  # Keep numeric and categorical data untouched
)

# Preprocessor for numeric and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='drop'  # Drop columns not explicitly mentioned
)

# Combine preprocessing for numeric/categorical and text columns
final_preprocessor = ColumnTransformer(
    transformers=[
        ('num_cat', preprocessor, numeric_cols + categorical_cols),
        ('text', text_transformers, text_cols),
    ],
    remainder='drop'  # Drop other columns
)

In [16]:
y.info()

<class 'pandas.core.series.Series'>
Index: 88852 entries, 0 to 104375
Series name: New_Risk
Non-Null Count  Dtype   
--------------  -----   
88852 non-null  category
dtypes: category(1)
memory usage: 781.1 KB


In [17]:

# all X and y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [18]:
y_train.info()

<class 'pandas.core.series.Series'>
Index: 62196 entries, 97842 to 28320
Series name: New_Risk
Non-Null Count  Dtype   
--------------  -----   
62196 non-null  category
dtypes: category(1)
memory usage: 546.8 KB


In [19]:
#X_train.head(50)

# Apply Esemble stacking Model with:
- weight balancing: function of frequences
- thresholds = {0: 0.65, 1: 0.3, 2: 0.3, 3: 0.45} -- selected after several test

In [22]:
import joblib
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight

rf_model_path = r'C:\Users\gsepe\Capstone_Prj\data_output\Lav_1_work_3.3-RF_W_BM.pkl'
lr_model_path = r'C:\Users\gsepe\Capstone_Prj\data_output\Lav_1_work_3.3-LR_W_BM.pkl'


# Load the best estimators from pickled files
with open(rf_model_path, 'rb') as rf_file:
    best_rf_model = joblib.load(rf_file)

with open(lr_model_path, 'rb') as lr_file:
    best_lr_model = joblib.load(lr_file)

# Define thresholds for each class
thresholds = {0: 0.65, 1: 0.3, 2: 0.3, 3: 0.45}

# Compute class weights dynamically using compute_class_weight
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
weights_dict = {i: weight for i, weight in enumerate(class_weights)}

print(f"Computed Class Weights: {weights_dict}")

# Modify predictions based on thresholds
def apply_thresholds(probabilities, thresholds):

    adjusted_preds = np.zeros(probabilities.shape[0], dtype=int)
    for class_idx, threshold in thresholds.items():
        adjusted_preds[probabilities[:, class_idx] >= threshold] = class_idx
    return adjusted_preds

# Base models
base_models = [
    ('rf', best_rf_model),  
    ('lr', best_lr_model)   
]

# Meta-model 
meta_model = LogisticRegression(random_state=42, class_weight=weights_dict)

# Stacking classifier
stacking_classifier = StackingClassifier(
    estimators=base_models,        
    final_estimator=meta_model,    
    cv=5,                          
    n_jobs=-1                      
)

# Fit the stacking classifier
stacking_classifier.fit(X_train, y_train)

# Predict probabilities for the test set
probabilities = stacking_classifier.predict_proba(X_test)

# Apply thresholds to get adjusted class predictions
y_pred = apply_thresholds(probabilities, thresholds)

# Evaluate overall performance
accuracy = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')

# Evaluate per-class metrics
precision_per_class = precision_score(y_test, y_pred, average=None)
recall_per_class = recall_score(y_test, y_pred, average=None)
f1_per_class = f1_score(y_test, y_pred, average=None)

# Display the overall results
print("Stacking Classifier Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (macro): {f1_macro:.4f}\n")

# Display per-class results
print("Per-Class Metrics:")
for class_idx in range(len(precision_per_class)):
    print(f"Class {class_idx}:")
    print(f"  Precision: {precision_per_class[class_idx]:.4f}")
    print(f"  Recall:    {recall_per_class[class_idx]:.4f}")
    print(f"  F1-Score:  {f1_per_class[class_idx]:.4f}")
    print("-" * 30)

# Full classification report for reference
print("\nFull Classification Report:")
print(classification_report(y_test, y_pred))



Computed Class Weights: {0: 0.32178555907369466, 1: 4.613946587537092, 2: 5.367276492923714, 3: 2.043769716088328}
Stacking Classifier Results:
Accuracy: 0.6535
F1 Score (macro): 0.5103

Per-Class Metrics:
Class 0:
  Precision: 0.9680
  Recall:    0.6264
  F1-Score:  0.7606
------------------------------
Class 1:
  Precision: 0.2153
  Recall:    0.5308
  F1-Score:  0.3064
------------------------------
Class 2:
  Precision: 0.2463
  Recall:    0.7720
  F1-Score:  0.3735
------------------------------
Class 3:
  Precision: 0.4691
  Recall:    0.8350
  F1-Score:  0.6007
------------------------------

Full Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.63      0.76     20710
           1       0.22      0.53      0.31      1445
           2       0.25      0.77      0.37      1241
           3       0.47      0.83      0.60      3260

    accuracy                           0.65     26656
   macro avg       0.47      0.69      

In [26]:
ensemble_model_bm_path = r"data_output\Lav_1_work_3.3-Esemble_W_BM.pkl"
joblib.dump(stacking_classifier, ensemble_model_bm_path)

# Save the results (metrics) to a dictionary
results = {
    "accuracy": accuracy,
    "f1_macro": f1_macro,
    "precision_per_class": precision_per_class.tolist(),
    "recall_per_class": recall_per_class.tolist(),
    "f1_per_class": f1_per_class.tolist(),
    "classification_report": classification_report(y_test, y_pred, output_dict=True)
}

# Save results as a pickle file
results_path = r"data_output\Lav_1_work_3.3-Esemble_W.pkl"
with open(results_path, 'wb') as results_file:
    joblib.dump(results, results_file)