# Issues (task) covered in this feature notebook :

*  explore data a bit w/modules, functions and plots
*  baseline model test
    - Logistic regression or Dummy Classifier
    - or Clustering algorithm
    - or tree-based
* base metrics 
    - precision_recall
    - f1_score
    - classification report
    - confusion matrix
    - area under the curve

* train_test_split


[Project Issue Link](https://github.com/users/Cazta/projects/1/views/2?pane=issue&itemId=32899160)



# Importing Packages

In [None]:
# import necessary libraries

import pandas as pd
import numpy as np
import missingno as msno 
import seaborn as sns
import matplotlib.pyplot as plt 


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, fbeta_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, balanced_accuracy_score


import warnings
# NOTE(review): this silences every warning category for the rest of the notebook,
# so any warning raised by later cells (model fitting, plotting, pandas) stays hidden.
warnings.filterwarnings('ignore')

# Global Variables

In [None]:
# global variables

# seed passed as `random_state` to the train/test split and to SGDClassifier, for reproducibility
RSEED = 42

# relative path of the csv holding version 1 of the processed fraud data
data_path = 'data/fraud_data_processed_V1.csv'




# Loading Data

In [None]:
# load, get information on dataset and display descriptive summary



def get_data_summary(data_path=None, data=None, desc_sm=False, no_unq=False, *args, **kwargs):
    """Load a dataset if needed, print/display an overview of it and return it.

    Args:
        data_path (str, optional): path of a csv file; read with ``pd.read_csv``
            only when ``data`` is not supplied.
        data (pandas.DataFrame, optional): an already loaded dataset. Takes
            precedence over ``data_path`` when both are given.
        desc_sm (bool): when True, also display ``data.describe()`` transposed
            and formatted with two decimals.
        no_unq (bool): when True, also display the number of unique values
            per column.
        *args, **kwargs: accepted for backward compatibility; not used.

    Returns:
        pandas.DataFrame: the dataset that was summarised (the very object
        passed as ``data``, or the frame read from ``data_path``).

    Raises:
        ValueError: if neither ``data`` nor ``data_path`` is provided.
    """
    separator = '_____' * 10

    if data is None:
        if data_path is None:
            raise ValueError('''Either enter a data path or a dataset (dataframe)
                    
                        'data' : a dataset (dataframe)
                        'data_path' : a data path used to load a csv data file
                    
                        ''')
        data = pd.read_csv(data_path)

    print (f"Dataset shape: {data.shape}")

    print(separator)

    print(f''' 
    Number of observations : {data.shape[0]}
    Number of features : {data.shape[1]}
        ''')

    print(separator)

    print ("Dataset sample: ")
    print(separator)

    # `display` is the rich-output helper provided by the IPython/Jupyter kernel
    display(data.head())

    print(separator)

    if desc_sm:
        print ("Dataset descriptive summary: ")
        print(separator)

        display(data.describe().T.style.format('{:.2f}'))

    # this separator is printed whether or not the descriptive summary was shown
    print(separator)

    if no_unq:
        print ("Unique values/classes for dataset features: ")
        print(separator)

        display(data.nunique())

    return data

# load version 1 of the processed fraud data from `data_path` and show its full summary
# (descriptive statistics and per-column unique counts both enabled)

df = get_data_summary(data_path = data_path, desc_sm = True, no_unq = True)



---

# Brief Exploration

* Stakeholder : ZINDI, STEG (Tunisian Company of Electricity and Gas)
* Business objective : To use clients' billing history to detect which clients are fraudulently manipulating their energy (electricity and gas) meters
* open questions : 
    - What do the consumption (`consommation`) levels tell us about fraudulent activities? 
    - What does the recorded reading remark tell us about fraudulent activities? 
    - etc

* hypothesis statements :


* aggregating some data to gain insight
* basic plots to investigate the data visually
    - countplots
    - 

    

In [None]:
# group by client id and count the non-null values of every other column,
# then turn `client_id` back into a regular column

df_clients = df.groupby('client_id').count().reset_index()

# print the summary of the per-client counts; get_data_summary returns the frame
# it was given, so `df_clients_sm` refers to the same object as `df_clients`

df_clients_sm = get_data_summary(data = df_clients)




In [None]:

# countplots of specific features by target

cat_features = ['disrict', 'client_catg', 'region',  'tarif_type', 
                    'counter_statue', 'reading_remarque', 'counter_coefficient']

# one figure per feature; columns are referenced by name because `data=df` is given,
# and each figure is titled and closed so the loop does not accumulate open figures
for feature in cat_features:
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.countplot(data=df, x=feature, hue="target", ax=ax)
    ax.set_title(f"{feature} by target")
    plt.show()
    plt.close(fig)

In [None]:
# group by reading remark and count the non-null values of the selected features

# columns counted per reading remark (includes the target)
feat_1 =  ['target', 'counter_number', 'counter_statue', 'counter_code', 
        'counter_coefficient', 'consommation_level_1', 'consommation_level_2',
        'consommation_level_3', 'consommation_level_4', 'old_index',
        'new_index']


read_mark = df.groupby("reading_remarque")[feat_1].count().reset_index()
# bare expression: displayed as the cell output
read_mark

In [None]:
# columns counted per target class (includes the reading remark)
feat_2 =  ['reading_remarque', 'counter_number', 'counter_statue', 'counter_code', 
        'counter_coefficient', 'consommation_level_1', 'consommation_level_2',
        'consommation_level_3', 'consommation_level_4', 'old_index',
        'new_index']


# group by target and count the non-null values of the selected features
target_count = df.groupby("target")[feat_2].count().reset_index()
# bare expression: displayed as the cell output
target_count

In [None]:
# consumption levels, counter details and target
# NOTE(review): neither list is referenced again in the visible cells — confirm they are still needed

cons_count =  ['target', 'counter_number',
        'counter_statue', 'counter_code', 'reading_remarque',
        'counter_coefficient', 'consommation_level_1', 'consommation_level_2',
        'consommation_level_3', 'consommation_level_4', 'old_index',
        'new_index']

# date related feature variables

date_features = ['target', 'creation_date',  'invoice_date',  'months_number']


# Baseline Modelling

* state baseline features
* define x (input data) and y (target)
* train_test_split 
* run baseline model : logistic reg?  sgd? knn? 


# Evaluate the baseline model

* get evaluation scores using features from the initial data
    - Area under the curve score (roc_auc_score)
    - precision score
    - recall score
    - accuracy score
    - balanced accuracy score
    - confusion matrix
    - classification report


In [None]:
# feature variables for version 1 (df) of processed fraud data for baseline model 
# (the target and the date related columns are left out of this list)

base_features = ['disrict', 'client_catg', 'region',  
                'tarif_type', 'counter_number','counter_statue', 
                'counter_code', 'reading_remarque','counter_coefficient', 
                'consommation_level_1', 'consommation_level_2', 'consommation_level_3', 
                'consommation_level_4', 'old_index', 'new_index']


In [None]:
# defining the base x (input data) features and y (target) feature

# input matrix restricted to the baseline feature columns
x_base = df[base_features]
# target column
y = df["target"]


# sanity check: both should report the same number of rows
print(f"shape of baseline input data: {x_base.shape}")
print(f"shape of target data: {y.shape}")



In [None]:
# train test split for base
# 75% train / 25% test, seeded with RSEED; stratified on the target so both splits
# keep the same class ratio (the notebook treats the target as imbalanced)
# NOTE(review): rows are split individually; if one client has several rows they can
# land in both train and test — consider a group-aware split on client_id


x_train, x_test, y_train, y_test = train_test_split(
    x_base, y, test_size=0.25, random_state=RSEED, stratify=y
)




In [None]:
# baseline logistic regression with default hyperparameters, fitted on the training split
log_reg = LogisticRegression().fit(x_train, y_train)

# class predictions for the held-out test rows
y_pred_lr = log_reg.predict(x_test)

In [None]:
# evaluation metrics : confusion matrix, accuracy, balanced accuracy, classification report

def eval_metrics(y_test, y_pred): 
    """
    Summary:
        Print the confusion matrix, the accuracy and balanced accuracy (both as
        percentages rounded to two decimals) and the classification report of an
        ML model, based on the predictions and true target values for the test set.

    Args:
        y_test (numpy.ndarray): test target data
        y_pred (numpy.ndarray): predictions based on test data

    Returns:
        None: the metrics are only printed.
    """    
    separator = "-----" * 15

    # Scale to percent first and use the builtin round(): depending on the
    # scikit-learn version the scores can be plain Python floats, which have no
    # `.round` method, and rounding before multiplying by 100 leaves float
    # artefacts such as 56.99999999999999.
    accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
    balanced_accuracy_pct = round(balanced_accuracy_score(y_test, y_pred) * 100, 2)

    print(separator)
    print(f'''Confusion Matrix: 
    {confusion_matrix(y_test, y_pred)} ''') 
    
    print(separator)
    print (f''' Accuracy : 
    {accuracy_pct} ''')

    print(separator)
    print (f''' Balanced Accuracy : 
    {balanced_accuracy_pct} ''')
    
    print(separator)
    print(f'''Report :  
    {classification_report(y_test, y_pred)} ''') 



# print the evaluation metrics of the logistic regression predictions on the test set
eval_metrics(y_test, y_pred_lr)

In [None]:
# eval scoring metrics : recall, precision, f1_score, roc_auc_score, fpr, tpr


def evaluate_model(predictions, probs, train_predictions, train_probs, y_true=None):
    """Compare machine learning model to baseline performance.

    Prints recall, precision, f1 score and ROC AUC of the model next to those of a
    naive baseline that predicts the positive class for every observation, then
    plots the ROC curves of both.

    Args:
        predictions: class predictions for the test set.
        probs: scores/probabilities of the positive class for the test set.
        train_predictions: accepted for backward compatibility; not used.
        train_probs: accepted for backward compatibility; not used.
        y_true: true test labels. When omitted, the notebook-level ``y_test``
            is used, which is what the original implementation always did.

    Returns:
        None: metrics are printed and the ROC figure is drawn.
    """
    if y_true is None:
        # backward compatible default: the test target defined by the split cell
        y_true = y_test

    # the naive baseline predicts class 1 for every observation; built once and reused
    all_positive = [1] * len(y_true)

    baseline = {
        'recall': recall_score(y_true, all_positive),
        'precision': precision_score(y_true, all_positive),
        'f1_score': f1_score(y_true, all_positive),
        # ROC AUC of the constant baseline is fixed at 0.5, as in the original code
        'roc': 0.5,
    }

    results = {
        'recall': recall_score(y_true, predictions),
        'precision': precision_score(y_true, predictions),
        'f1_score': f1_score(y_true, predictions),
        'roc': roc_auc_score(y_true, probs),
    }

    # Train metrics are intentionally not reported: the previous (commented-out)
    # attempt scored the train predictions against the test labels.
    for metric in ['recall', 'precision', 'f1_score', 'roc']:
        print(f'{metric.capitalize()} Baseline: {round(baseline[metric], 2)} Test: {round(results[metric], 2)} ')

    # Calculate false positive rates and true positive rates
    base_fpr, base_tpr, _ = roc_curve(y_true, all_positive)
    model_fpr, model_tpr, _ = roc_curve(y_true, probs)

    plt.figure(figsize = (8, 6))
    # note: this changes the font size globally, for every later plot as well
    plt.rcParams['font.size'] = 16

    # Plot both curves
    plt.plot(base_fpr, base_tpr, 'b', label = 'baseline')
    plt.plot(model_fpr, model_tpr, 'r', label = 'model')
    plt.legend()
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')

In [None]:
# logistic regression model 

# second column of predict_proba (presumably the positive / fraud class) for train and test
train_probs_lr = log_reg.predict_proba(x_train)[:, 1]
test_probs_lr = log_reg.predict_proba(x_test)[:, 1]

# hard class predictions for train and test
train_preds_lr = log_reg.predict(x_train)
test_preds_lr = log_reg.predict(x_test)

# ROC AUC on train and test; the baseline scores every test row with the same constant value
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs_lr)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, test_probs_lr)}')
print(f'Baseline ROC AUC: {roc_auc_score(y_test, [1 for _ in range(len(y_test))])}')

In [None]:


# logistic regression: recall / precision / f1 / ROC AUC against the baseline, plus ROC curves
evaluate_model(test_preds_lr, test_probs_lr, train_preds_lr, train_probs_lr)



In [None]:
# K-neighbours classifier as baseline?


# initialize and fit/train model on data
# (5 neighbours, euclidean distance; the features are used as-is, without scaling)

knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
# np.ravel flattens the training target to a 1-D array before fitting
knn.fit(x_train, np.ravel(y_train))

# predict on test

y_pred_knn = knn.predict(x_test)





In [None]:
# Make probability predictions
# second column of predict_proba (presumably the positive / fraud class) for train and test
# NOTE(review): same steps as the logistic regression cell — candidate for a shared helper
train_probs_knn = knn.predict_proba(x_train)[:, 1]
test_probs_knn = knn.predict_proba(x_test)[:, 1]

# hard class predictions for train and test
train_preds_knn = knn.predict(x_train)
test_preds_knn = knn.predict(x_test)

# ROC AUC on train and test; the baseline scores every test row with the same constant value
print(f'Train ROC AUC Score: {roc_auc_score(y_train, train_probs_knn)}')
print(f'Test ROC AUC  Score: {roc_auc_score(y_test, test_probs_knn)}')
print(f'Baseline ROC AUC: {roc_auc_score(y_test, [1 for _ in range(len(y_test))])}')


In [None]:
# print the evaluation metrics of the KNN predictions on the test set
eval_metrics(y_test, y_pred_knn)



In [None]:

# KNN: recall / precision / f1 / ROC AUC against the baseline, plus ROC curves
evaluate_model(test_preds_knn, test_probs_knn, train_preds_knn, train_probs_knn)


In [21]:
# sgdclassifier as baseline?

# Fit and evaluate model without hyperparameter tuning using cross validation and unscaled data 
# 5-fold cross validation on the training split only; the test split is not touched here
# NOTE(review): no `scoring` is passed, so the estimator's default scorer is used
# (presumably plain accuracy — confirm it is the intended metric for this target)
sgd_classifier = SGDClassifier(random_state=RSEED)
scores = cross_val_score(sgd_classifier, x_train, y_train, cv=5, n_jobs=-1)

# Evaluation 
# mean of the five fold scores, rounded to four decimals
print('Score (unscaled):', round(scores.mean(), 4))


In [None]:
# save current version of processed data for use later

# df_processed.to_csv('data/fraud_data_processed_V1.csv', index=False)

