# Federico Campanozzi - Progetto Data Intensive con Relazione
                                                                                                             a.a. 2021/2022

## 1 - Descrizione del problema
Il problema è di classificazione e devo determinare il valore di una variabile binaria.
Significato dei dati :
- step: rappresenta un'unita di tempo 1 = 1 ora.
- type: tipo di transazione.
- amount: somma totatle di denaro spostato.
- nameOrig: codice del cliente che ha fatto la transazione.
- oldbalanceOrg: somma totale nel cc prima della transazione.
- newbalanceOrig: somma totale nel cc dopo la transazione.
- nameDest: beneficiario.
- oldbalanceDest: somma totale nel cc del beneficiario prima della transazione.
- newbalanceDest: somma totale nel cc del beneficiario dopo la transazione.
- isFraud: se è stata classificata come fraudolenta da un esperto nel settore.
- isFlaggedFraud: se è stata classificata come fraudolenta da un algoritmo di ML.

## 2 - Analisi esplorativa
Il dataset è molto vasto vasto quindi l'analisi esplorativa è stata condotta su un numero ristretto di record, giusto per capire
la dimensionalità del problema

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
transactions = pd.read_csv("../data/data.csv", nrows=20_000)

In [None]:
transactions[:10]

In [None]:
plt.figure(figsize=(16, 10))
transactions["type"].value_counts().plot.pie(ax=plt.subplot(1, 3, 1));
transactions["isFraud"].value_counts().plot.pie(ax=plt.subplot(1, 3, 2));
transactions[transactions["isFraud"] == 1]["type"].value_counts().plot.pie(ax=plt.subplot(1, 3,3));

## 3 - Modelli Predittivi

import delle librerie

In [94]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import plot_tree
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression


Suddivisione del training set e validation set con il metodo holdout

In [96]:
from sklearn.model_selection import train_test_split

f_numeric = ["amount","oldbalanceOrg","newbalanceOrig","oldbalanceDest","newbalanceDest"]
f_categoric = ["type"]

X = transactions[f_numeric+f_categoric]
y = transactions[["isFraud"]]

preproc = ColumnTransformer([
            ("numeric",StandardScaler(),f_numeric),
            ("categorical",OneHotEncoder(),f_categoric)
        ],remainder="drop")

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 1/3, random_state = 42)

definiamo alcune funzioni di utiliotà per la valutazione degli alberi di regressione 

In [None]:
def rmspe(y_real, y_pred):
    return np.sqrt(np.mean((y_pred /y_real - 1) ** 2))

In [None]:
def rmspe(y_real, y_pred):
    return np.sqrt(np.mean((y_pred /y_real - 1) ** 2))
def print_eval(X, y, model):
    y_pred = model.predict(X)
    print(f"MSE       : {mean_squared_error(y, y_pred):12.4f}")
    print(f"R-squared : {r2_score(y, y_pred):12.4f}")
    print(f"RMSPE     : {rmspe(y, y_pred):12.4f}")

In [None]:
def print_eval_2(X, y, model):
    y_pred = model.predict(X)    
    cm = confusion_matrix(y, y_pred)
    print("Confusion Matrix : ")
    print(cm)
    print(f"PRECISION  : {precision_score(y, y_pred):12.4f}")
    print(f"RECALL     : {recall_score(y, y_pred):12.4f}")
    print(f"F1-MEASURE : {f1_score(y, y_pred, average='macro'):12.4f}")

## 3.1 - Alberi di Regressione

### 3.1.1 - XGBRegressor

In [None]:
model_XGBoost = Pipeline([
    ("preproc", preproc),
    ("XGBoost", XGBClassifier(objective='reg:squarederror', 
                    learning_rate = 0.01577, 
                    reg_lambda = 0.008, 
                    reg_alpha = 0.0001, 
                    n_estimators = 1025, 
                    verbose_eval = False))
])

model_XGBoost.fit(X_train, y_train)
print(f"R^2 = {model_XGBoost.score(X_val, y_val)}")
print_eval_2(X_val, y_val,model_XGBoost)

In [None]:
pd.DataFrame(model_XGBoost.named_steps["XGBoost"].feature_importances_).T

In [None]:
model_XGBoost.named_steps["XGBoost"]

In [None]:
model_XGBoost = Pipeline([
    ("preproc", preproc),
    ("XGBoost", XGBRegressor(objective='reg:squarederror', 
                    learning_rate = 0.01577, 
                    reg_lambda = 0.008, 
                    reg_alpha = 0.0001, 
                    n_estimators = 1025, 
                    verbose_eval = False))
])

model_XGBoost.fit(X_train, y_train)
print(f"R^2 = {model_XGBoost.score(X_val, y_val)}")
print_eval(X_val, y_val,model_XGBoost)

In [None]:
model_XGBoost.named_steps["preproc"].transformers_[1][1].get_feature_names(f_categoric)

In [None]:
model_XGBoost.named_steps["preproc"].transformers_[0][1]

In [None]:
model_XGBoost.named_steps["preproc"].transformers_[1][1].get_feature_names(f_categoric)

## 3.1.2 Logistic Loss

In [107]:
model_RegLos = Pipeline([
    ("preproc", preproc),
    ("RegLoss", LogisticRegression(solver="saga", random_state=11,class_weight={1:45}))
])
model_RegLos.fit(X_train, y_train)
print(f"R^2 = {model_svm.score(X_val, y_val)}")
print_eval_2(X_val, y_val,model_RegLos)

  return f(*args, **kwargs)


R^2 = 0.9955002249887506
Confusion Matrix : 
[[6443  194]
 [  14   16]]
PRECISION  :       0.0762
RECALL     :       0.5333
F1-MEASURE :       0.5587


## 3.3 - SVM 

In [None]:
model_svm = Pipeline([
    ("preproc", preproc),
    ("SVM", SVC(kernel='rbf',random_state=42))
])
model_svm.fit(X_train, y_train)
print(f"R^2 = {model_svm.score(X_val, y_val)}")

## 3.4 - Neural Network

In [None]:
X = transactions[["type","amount","oldbalanceOrg","newbalanceOrig"]]
y = transactions[["isFraud"]]
X_train, x_val, y_train, y_val = train_test_split(X, y, test_size=1/3, random_state=42)

X_scaler = ColumnTransformer([
        ("numeric",StandardScaler(),["amount", "oldbalanceOrg", "newbalanceOrig"]),
        ("categorical",OneHotEncoder(),["type"])
    ],remainder="drop")

Y_scaler = StandardScaler()

X_train = X_scaler.fit_transform(X_train)
x_val = X_scaler.transform(x_val)
y_train = Y_scaler.fit_transform(y_train)
y_val = Y_scaler.transform(y_val.values)

model_nn = Sequential([
    Dense(8, activation="relu", input_dim=8),
    Dense(1)
])
model_nn.summary()
model_nn.compile(optimizer="adam",loss="mean_squared_error")
fit_history = model_nn.fit(X_train, y_train, batch_size=2, epochs=10)

In [None]:
plt.plot(fit_history.history["loss"], "ro-")
plt.legend(["Loss (Mean Square Error)"])
plt.xlabel("Epochs");

grazie alla classe KerasRegressor possiamo usare le potenzialità di Kerar unite al concetto di Pipeline, GridSearh ecc..
di scikit-lean

In [None]:
def build_nn(nodes, inpDim):
    model = Sequential([
        Dense(nodes, activation="relu", input_dim=inpDim),
        Dense(256, activation="relu"),
        Dense(128, activation="relu"),
        Dense(64, activation="relu"),
        Dense(1)
    ])
    model.compile(optimizer="adam", loss="mean_squared_error")
    model.summary()
    return model

In [None]:
X = transactions[["type","amount","oldbalanceOrg","newbalanceOrig"]]
y = transactions[["isFraud"]]

X_train, x_val, y_train, y_val = train_test_split(X, y, test_size=1/3, random_state=42)

model_nn = Pipeline([
    ("preproc", ColumnTransformer([
        ("numeric",StandardScaler(),["amount", "oldbalanceOrg", "newbalanceOrig"]),
        ("categorical",OneHotEncoder(),["type"])
    ],remainder="drop")),
    ("NN", KerasRegressor(build_fn=build_nn, nodes=32, inpDim=8, epochs=3, batch_size=1000))
])
model_nn.fit(X_train, y_train)
y_pred = model_nn.predict(x_val)
print(f"R^2 = {r2_score(y_val, y_pred)}")

## 4 - Valutazione
i modelli migliori si sono rilevati .... .
Su questi implementerò una gridsearch per la ricerca degli iperparametri migliori

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
kf = KFold(3, shuffle=True, random_state=42)

In [None]:
grid = {
    "XGBoost__learning_rate": [0.01577,0.001577,0.01477,0.04577]
}

gs = GridSearchCV(model_XGBoost, grid, cv=kf)
gs.fit(X_train, y_train);

In [None]:
pd.DataFrame(gs.cv_results_).sort_values("rank_test_score")

In [None]:
grid = {
    "SVM__degree": [3, 4],
    "SVM__kernel": ['linear', 'poly', 'rbf']
}

gs = GridSearchCV(model_svm, grid, cv=kf)
gs.fit(X_train, y_train);

In [None]:
pd.DataFrame(gs.cv_results_).sort_values("rank_test_score")

## 5 - Conclusioni 
In conclusione le features più rilevanti sono ... 
Il modello migliore è
Gli iperparametri che non danno overfitting sono ...

## 6. Link alle risorse
#### Link al dataset di kaggle
https://www.kaggle.com/datasets/rupakroy/online-payments-fraud-detection-dataset