In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras
from tensorflow.keras import Input
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Dropout, Dense, Flatten, BatchNormalization, LSTM
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, fbeta_score, accuracy_score, precision_recall_curve, auc
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import shap
import gc
import json
import pickle
import random

# Dataset families; each name is a component of the pickle file names and of
# the `dataframes` keys built in the loading cell.
names = [
    'Attività e Passività',
    'Costi e Ricavi',
    'Entrate',
    'Spese',
]
# Years 2008..2015 inclusive (the stop value of np.arange is exclusive).
years = np.arange(2008, 2016, 1)

# Functions

In [None]:
def create_data(dataframes, comuni, pop, category, features, lag, pred, mode, add = None, zone_df = None):
    """Build sliding-window samples and binary labels from the yearly frames.

    Every window of `lag` consecutive years found in the `dataframes` keys
    yields one block of samples (one per row of the yearly frames); the blocks
    of all windows are concatenated along the first axis.

    Parameters
    ----------
    dataframes : dict
        Maps ``f'{feature}_{year}_{category}'`` to a DataFrame. The year is
        parsed from the second ``'_'``-separated field of each key.
    comuni : DataFrame
        Must contain the ``f'Target {year}'`` columns read for the labels.
    pop : DataFrame
        Must contain the ``f'pop_{year}'`` columns (used for ``mode='pop'``
        and for the ``'pop'`` extra feature).
    category : str
        Category suffix of the keys to read.
    features : sequence of str
        Feature prefixes of the keys to read; their frames are concatenated
        column-wise for each year.
    lag : int
        Number of consecutive years in each window.
    pred : int
        Number of years after the window whose targets are summed; the label
        is 1 when that sum is positive.
    mode : {'pop', 'abs'}
        'pop' divides every column by that year's population; 'abs' divides
        each feature frame by its own row sum (NaN results are set to 0).
    add : sequence of str, optional
        Extra per-year columns: 'pop' appends log(population), 'zone' appends
        every column of the zone frame. Other entries are ignored.
    zone_df : DataFrame, optional
        Zone frame used for the 'zone' extra. When omitted, the notebook-level
        `zone` variable is used.

    Returns
    -------
    X : ndarray of shape (n_samples, lag, n_columns)
    y : ndarray of 0/1 ints, shape (n_samples,)

    Raises
    ------
    ValueError
        If `mode` is unknown, `dataframes` is empty, or `lag` is larger than
        the number of available years.
    """
    # Fail early: an unknown mode previously left `temp` unbound (or, worse,
    # silently reused the frame of a previous iteration).
    if mode not in ('pop', 'abs'):
        raise ValueError(f"mode must be 'pop' or 'abs', got {mode!r}")

    extras = list(add) if add is not None else []
    if 'zone' in extras and zone_df is None:
        # Backward compatible fallback to the notebook-level `zone` frame.
        zone_df = zone

    years = sorted(set(int(col.split('_')[1]) for col in dataframes.keys()))
    if not years:
        raise ValueError('dataframes is empty: no years to build windows from')

    window_starts = np.arange(years[0], years[-1] - lag + 2)
    if len(window_starts) == 0:
        raise ValueError(f'lag={lag} exceeds the available year span {years[0]}-{years[-1]}')

    data = []
    y = []

    for start in window_starts:
        year_range = np.arange(start, start + lag)
        yearly = []
        for year in year_range:
            frames = [dataframes[f'{feature}_{year}_{category}'] for feature in features]
            if mode == 'pop':
                # Column vector so that each row is divided by its own population.
                pop_temp = pop[f'pop_{year}'].values[:, np.newaxis]
                temp = pd.concat(frames, axis=1) / pop_temp
            else:
                temp = pd.concat(
                    [frame.div(frame.sum(axis=1), axis=0).fillna(0) for frame in frames],
                    axis=1,
                )

            for extra in extras:
                if extra == 'pop':
                    temp = pd.concat([temp, np.log(pop[f'pop_{year}'])], axis=1)
                elif extra == 'zone':
                    # One concat for all zone columns (same columns, same order).
                    temp = pd.concat([temp, zone_df], axis=1)
            yearly.append(temp.values)

        # Stack the years on axis 1 -> (rows, lag, columns).
        data.append(np.stack(yearly, axis=1))

        # Label: any positive target in the `pred` years following the window.
        temp_y = sum(comuni[f'Target {year_range[-1] + i + 1}'].values for i in range(pred))
        y.append((temp_y > 0).astype(int))

    X = np.concatenate(data)
    y = np.concatenate(y)
    print(X.shape)

    return X, y

In [None]:
def run_training(dataframes, comuni, pop, category, features, lag, pred, mode, epochs, batch_size, verbose, add):
    """Train the CNN and the tabular baselines on one (lag, pred) configuration.

    The samples come from `create_data`; 20% are held out with a stratified
    split (random_state=42). The CNN is trained and scored through the
    notebook's `train_model` / `evaluate_model` helpers on the 3-D array,
    while the scikit-learn baselines are fitted on a flattened view of the
    same train/test rows.

    Parameters
    ----------
    dataframes, comuni, pop, category, features, lag, pred, mode, add
        Forwarded unchanged to `create_data`.
    epochs, batch_size, verbose
        Forwarded to `train_model` (`verbose` also to `evaluate_model`).

    Returns
    -------
    model_objects : dict
        Model name -> fitted model ("CNN", "Logistic Regression",
        "Decision Tree", "Random Forest", "XGBoost").
    results : dict
        Model name -> metrics dict. For the baselines the keys are
        precision/recall/f1/confusion_matrix/roc_auc/roc_curve/pr_auc/pr_curve,
        each suffixed with ``_test`` or ``_train``; the CNN entry is whatever
        `evaluate_model` returns.
    """
    X, y = create_data(dataframes, comuni, pop, category, features, lag, pred, mode, add)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    classes = np.unique(y_train)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weight_dict = {int(label): weight for label, weight in zip(classes, class_weights)}
    class_sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)

    cnn_model = train_model(X_train, y_train, X_test, y_test, epochs, batch_size, class_weight_dict, verbose)
    cnn_metrics, _, _ = evaluate_model(cnn_model, X_train, y_train, X_test, y_test, verbose)

    # Flatten the very same train/test rows used by the CNN instead of running
    # a second train_test_split that only matches y_train by sharing the seed.
    X_train_flat = X_train.reshape(X_train.shape[0], -1)
    X_test_flat = X_test.reshape(X_test.shape[0], -1)

    # Drop the trailing (lag - 1) flattened columns, as the original
    # `[:, :-lag+1]` did for lag > 1. An explicit column count keeps lag == 1
    # from turning into `[:, :0]`, which selected no features at all.
    # NOTE(review): the flattening is time-major, so these trailing columns are
    # the last features of the last time step — confirm this trimming is the
    # intended feature selection.
    n_keep = X_train_flat.shape[1] - (lag - 1)
    X_train_flat = X_train_flat[:, :n_keep]
    X_test_flat = X_test_flat[:, :n_keep]

    # NOTE(review): the "XGBoost" entry is scikit-learn's
    # GradientBoostingClassifier, not xgboost.XGBClassifier. The key is kept
    # because stored results are indexed by it.
    models = {
        "Logistic Regression": LogisticRegression(penalty = 'l2', C = 5, solver='lbfgs', class_weight='balanced',max_iter = 100000, random_state=42),
        "Decision Tree": DecisionTreeClassifier(max_depth=None, random_state=42, class_weight='balanced'),
        "Random Forest": RandomForestClassifier(n_estimators=500, random_state=42, class_weight='balanced'),
        "XGBoost": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    }

    model_objects = {"CNN": cnn_model}
    results = {"CNN": cnn_metrics}

    for model_name, model in models.items():
        # Balance the classes exactly once. Estimators that expose
        # `class_weight` are already balanced through their constructor;
        # scikit-learn multiplies class_weight by sample_weight, so passing
        # both (as before for the tree models) re-weighted the classes twice.
        # Only estimators without `class_weight` receive the balanced
        # sample weights.
        fit_kwargs = {}
        if 'class_weight' not in model.get_params():
            fit_kwargs['sample_weight'] = class_sample_weight
        model.fit(X_train_flat, y_train, **fit_kwargs)
        model_objects[model_name] = model

        metrics = {}
        metrics.update(_split_metrics(model, X_test_flat, y_test, 'test'))
        metrics.update(_split_metrics(model, X_train_flat, y_train, 'train'))
        results[model_name] = metrics

    return model_objects, results


def _split_metrics(model, X_split, y_split, split):
    """Score a fitted classifier on one split; keys are suffixed with `split`."""
    predictions = model.predict(X_split)
    # Use class-1 probabilities for the curves when the model provides them.
    scores = model.predict_proba(X_split)[:, 1] if hasattr(model, "predict_proba") else predictions

    fpr, tpr, _ = roc_curve(y_split, scores)
    precision, recall, _ = precision_recall_curve(y_split, scores)

    return {
        f'precision_{split}': precision_score(y_split, predictions),
        f'recall_{split}': recall_score(y_split, predictions),
        f'f1_{split}': f1_score(y_split, predictions),
        f'confusion_matrix_{split}': confusion_matrix(y_split, predictions),
        f'roc_auc_{split}': roc_auc_score(y_split, scores),
        f'roc_curve_{split}': (fpr, tpr),
        f'pr_auc_{split}': auc(recall, precision),
        f'pr_curve_{split}': (precision, recall),
    }

# Load Data

In [None]:
# Columns kept from the registry file (renamed further down).
columns = ['Denominazione', 'Dizione_Provincia', 'Dizione_Regione', 'Dizione_zona']

# Read the registry and, in order: parse the institution date, keep entities
# instituted before 2008, keep those with no cessation date, keep rows whose
# type code is 'ELCOMU', then select and rename the columns.
final = (
    pd.read_csv('Data//Anagrafe_comuni.csv', sep = ';', encoding = 'latin_1', low_memory = False)
    .set_index('Id_Ente')
    .assign(Data_Istituzione=lambda reg: pd.to_datetime(reg['Data_Istituzione'], format='%Y-%m-%d'))
    .loc[lambda reg: reg['Data_Istituzione'].dt.year < 2008]
    .loc[lambda reg: reg['Data_Cessazione'].isna()]
    .loc[lambda reg: reg['Codice_Tipologia_DLGS_118_2011'] == 'ELCOMU'][columns]
    .rename(columns={
        'Denominazione': 'Comune',
        'Dizione_Provincia': 'Provincia',
        'Dizione_Regione': 'Regione',
        'Dizione_zona': 'Zona',
    })
    .rename_axis('BDAP')
)

final

In [None]:
names = ['Attività e Passività', 'Costi e Ricavi', 'Entrate', 'Spese']
years = np.arange(2008, 2016)

# One pickled frame per (name, year, category); stored under the key
# '<name>_<year>_<category>'.
dataframes = {}
for name in names:
    for year in years:
        for category in ("CAT I", "CAT II"):
            key = f'{name}_{year}_{category}'
            dataframes[key] = pd.read_pickle(f'dati final//{name} {year} {category}.pkl')

In [None]:
comuni = pd.read_pickle('data//comuni.pkl')
zone = pd.read_pickle('data//zona.pkl')
pop = pd.read_pickle('data//pop.pkl')
critici = pd.read_csv('data//CriticitàComuni.csv', sep = ';', low_memory = False)

# One 0/1 target column per year, 2008..2019, initialised to 0.
target_years = np.arange(2008, 2020)
for year in target_years:
	comuni[f'Target {year}'] = 0

# Flag the municipalities listed in `critici` for each year. A name matches
# either as written or with '-' replaced by a space, prefixed by 'COMUNE DI '.
# One vectorised `isin` per year replaces the former year x row `iterrows`
# scan (12 full passes with one `.loc` write per matching row); the resulting
# columns are identical.
critici_anno = critici['Anno'].map(int)
for year in target_years:
	flagged = critici.loc[critici_anno == year, 'Comune']
	labels = set()
	for comune in flagged:
		labels.add(f'COMUNE DI {comune}')
		labels.add(f'COMUNE DI {comune.replace("-", " ")}')
	comuni.loc[comuni['Comune'].isin(labels), f'Target {year}'] = 1

# Training

## CAT I

In [None]:
# Experiment grid for category 'CAT I': run_training is called for every
# pred in 1..3 and every lag in 5..8.
category = 'CAT I'
features = names[2:4]  # with the `names` list defined above: 'Entrate' and 'Spese'
epochs = 500
batch_size = 1024
verbose = 0
mode = 'pop'  # create_data divides each yearly frame by that year's population
results = {}  # results[pred][lag] -> metrics dict returned by run_training
models = {}  # models[pred][lag] -> model-objects dict returned by run_training
add = ['pop', 'zone']  # extra columns appended by create_data

for pred in range(1,4):
	print(f'#####PRED: {pred}#####')
	temp = {}
	model = {}
	for lag in range(5,9):
		model[lag], temp[lag] = run_training(dataframes, comuni, pop, category, features, lag, pred, mode, epochs, batch_size, verbose, add)
		print(lag)
	results[pred] = temp
	models[pred] = model
	
# NOTE(review): `save` is not defined in any cell visible here — confirm it is
# set before this cell runs, otherwise this line raises NameError after the
# whole grid has been trained.
# NOTE(review): `models` holds the fitted objects, including the CNN returned
# by train_model — confirm those objects can be pickled.
if save:	
	with open('results_CAT I_final.pickle', 'wb') as handle:
	    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
	with open('models_CAT I_final.pickle', 'wb') as handle:
	    pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)

## CAT II

In [None]:
# Experiment grid for category 'CAT II': run_training is called for every
# pred in 1..3 and every lag in 5..8. `results` and `models` are rebound here,
# so the dictionaries built for the previous category are no longer reachable
# under these names.
category = 'CAT II'
features = names[2:4]  # with the `names` list defined above: 'Entrate' and 'Spese'
epochs = 500
batch_size = 1024
verbose = 0
mode = 'pop'  # create_data divides each yearly frame by that year's population
results = {}  # results[pred][lag] -> metrics dict returned by run_training
models = {}  # models[pred][lag] -> model-objects dict returned by run_training
add = ['pop', 'zone']  # extra columns appended by create_data

for pred in range(1,4):
	print(f'#####PRED: {pred}#####')
	temp = {}
	model = {}
	for lag in range(5,9):
		model[lag], temp[lag] = run_training(dataframes, comuni, pop, category, features, lag, pred, mode, epochs, batch_size, verbose, add)
		print(lag)
	results[pred] = temp
	models[pred] = model
	
# NOTE(review): `save` is not defined in any cell visible here — confirm it is
# set before this cell runs, otherwise this line raises NameError after the
# whole grid has been trained.
# NOTE(review): `models` holds the fitted objects, including the CNN returned
# by train_model — confirm those objects can be pickled.
if save:	
	with open('results_CAT II_final.pickle', 'wb') as handle:
	    pickle.dump(results, handle, protocol=pickle.HIGHEST_PROTOCOL)
	with open('models_CAT II_final.pickle', 'wb') as handle:
	    pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)