In [None]:
# imports
import os

import numpy as np
import pandas as pd
from lib.ml_helper import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from graphdatascience import GraphDataScience

import warnings
# Notebook-wide display configuration.
# All warnings are silenced to keep the rendered output compact.
# NOTE(review): this also hides deprecation/convergence warnings - consider narrowing the filter.
warnings.filterwarnings(action='ignore')
warnings.simplefilter(action='ignore')

# Let wide DataFrames show up to 500 columns instead of being truncated.
pd.set_option('display.max_columns', 500)

## Machine Learning

### Preparation of the data for model building

#### Data loading

In [None]:
# Cleaned dataset published in the Graphathon 2023 repository; it is read straight from GitHub,
# so this cell needs network access.
DATA_URL = "https://raw.githubusercontent.com/EY-Tech-Consulting-Denmark/Graphathon_2023-04-14/main/Data/clean_data/data.csv"

data = pd.read_csv(DATA_URL)

# Preview the first rows to check that the file parsed as expected.
data.head()

In [None]:
# (rows, columns) of the raw dataset loaded above.
data.shape

The dataset is on the claim level, while the fraud labels are provided on the provider level.  
The dataset is therefore aggregated to the provider level and aggregated features are calculated.  

#### Data aggregation

In [None]:
# Provider-level aggregation.
# To capture the behaviour of providers, the listed columns are averaged per provider,
# which turns each of them into a per-claim ratio for that provider.
chronic_condition_columns = [
    'ChronicCond_Alzheimer',
    'ChronicCond_Heartfailure',
    'ChronicCond_KidneyDisease',
    'ChronicCond_Cancer',
    'ChronicCond_ObstrPulmonary',
    'ChronicCond_Depression',
    'ChronicCond_Diabetes',
    'ChronicCond_IschemicHeart',
    'ChronicCond_Osteoporasis',
    'ChronicCond_rheumatoidarthritis',
    'ChronicCond_stroke',
]
annual_amount_columns = [
    'IPAnnualReimbursementAmt',
    'IPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
]
claim_detail_columns = [
    'DaysAdmitted',
    'DaysClaimLasted',
    'ClaimEndAfterDischarged',
    'Age',
    'TotalDiagnosis',
    'TotalProcedures',
    'TotalPhysicians',
]
# 'Provider' is kept in the list: it is the grouping key here and the identifier/merge key later on.
features = chronic_condition_columns + annual_amount_columns + claim_detail_columns + ['Provider']

# Mean of every feature column per provider
provider_means = data[features].groupby('Provider').mean()
aggregated_data_means = provider_means.reset_index()

# Number of distinct beneficiaries and distinct claims per provider
provider_id_columns = ['BeneID', 'ClaimID', 'Provider']
aggregated_data_counts = data[provider_id_columns].groupby('Provider').nunique().reset_index()

# One label per provider (minimum of the claim-level labels).
# NOTE(review): presumably the label is constant within a provider, so min() just picks it - confirm.
provider_labels = data[['Provider', 'PotentialFraud']].groupby('Provider').min()

# Combine means, unique counts and labels into one row per provider
aggregated_data = (
    aggregated_data_means
    .merge(aggregated_data_counts, on='Provider')
    .merge(provider_labels, on='Provider')
)

aggregated_data.head()

In [None]:
# Derive the claim-volume features on the provider level.
#
# The cell is written to be safely re-runnable:
#   * rename() returns a new frame and ignores missing keys by default, so running the cell
#     again (when 'ClaimID' has already become 'Total_claims') is a no-op instead of a KeyError;
#   * feature names are only appended when absent, so `features` never collects duplicates
#     (duplicates would yield duplicated columns in the feature matrix).

# Renaming 'ClaimID' (unique claim count per provider) to 'Total_claims'
aggregated_data = aggregated_data.rename(columns={'ClaimID': 'Total_claims'})

# Average number of claims per distinct beneficiary of the provider
aggregated_data['claims_per_beneficiary'] = aggregated_data['Total_claims'] / aggregated_data['BeneID']

# Register the two new columns as model features exactly once
for new_feature in ('Total_claims', 'claims_per_beneficiary'):
    if new_feature not in features:
        features.append(new_feature)

aggregated_data.head()

#### Features and target split

In [None]:
# Split the provider-level table into the feature matrix and the target.
# X still contains the 'Provider' identifier; it is dropped/excluded right before model fitting
# and is needed later as the key for joining the graph features.
target_column = 'PotentialFraud'
X, y = aggregated_data[features], aggregated_data[target_column]

#### Train-test split

In [None]:
# Hold out 30% of the providers for testing. The split is stratified on the label so both
# parts keep the same class ratio, and seeded so it is reproducible.
split_options = {
    'test_size': 0.3,
    'random_state': 1,
    'stratify': y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


### Model building

In [None]:
# Baseline model: a shallow, regularised random forest (unfitted at this point).
rf_params = {
    'random_state': 42,       # fixed seed for reproducible forests
    'max_depth': 4,
    'min_samples_leaf': 30,
    'max_features': 4,
}
rf_classifier = RandomForestClassifier(**rf_params)

In [None]:
# Backward sequential feature selection for the baseline model.
# 'Provider' is only an identifier, so it is removed before the selector sees the data.
X_train_features = X_train.drop('Provider', axis=1)

sfs = SFS(
    rf_classifier,
    direction="backward",
    scoring='roc_auc',  # metric to optimise when deciding which features to keep
    cv=2,
)
sfs = sfs.fit(np.array(X_train_features), y_train)

# Names of the columns kept by the selector (support_ is the boolean keep-mask)
keep_mask = list(sfs.support_)
selected_columns = X_train_features.columns[keep_mask]
print(selected_columns)

In [None]:
# Cross-validated ROC AUC of the baseline model on the selected features.
# NOTE(review): `cross_validate` is not imported explicitly, so it presumably comes from
# lib.ml_helper; the original note says it runs 5 folds - confirm in the helper.
X_train_selected = X_train[selected_columns]
rf_classifier_roc_auc_scores = cross_validate(rf_classifier, X_train_selected, y_train, metric='roc_auc')

In [None]:
# Fit the baseline model on the training data, restricted to the selected features.
# NOTE(review): `fit_model` presumably comes from lib.ml_helper - confirm whether it fits in place or on a copy.
X_train_selected = X_train[selected_columns]
fitted_rf_classifier = fit_model(rf_classifier, X_train_selected, y_train)

In [None]:
# Feature importances of the fitted baseline model, labelled with the selected column names.
baseline_feature_names = X_train[selected_columns].columns
get_features_importance(fitted_rf_classifier, baseline_feature_names)

In [None]:
# Evaluation on the train set (confusion matrix of the baseline model)
X_train_selected = X_train[selected_columns]
plot_confusion_matrix(fitted_rf_classifier, X_train_selected, y_train)

In [None]:
# Evaluation on the held-out test set (confusion matrix of the baseline model)
X_test_selected = X_test[selected_columns]
plot_confusion_matrix(fitted_rf_classifier, X_test_selected, y_test)

## Model building utilizing graph features

In [None]:
# Connect to the Neo4j DBMS.
#
# Connection settings are read from environment variables so that real credentials never have
# to be written into (or committed with) the notebook:
#   NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD
# The fallbacks are the original values for a local Neo4j Desktop test instance and must not
# be used for any shared or production database.
DB_URL = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
DB_ULR = DB_URL  # alias kept for the original (misspelt) variable name
DB_USER = os.environ.get("NEO4J_USER", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "1234")
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS))

### Get graph features
Let's create a few features for Providers

### Using simple queries

In [None]:
# For every Provider, count the IS_SIMILAR_TO matches of the claims it submitted and store the
# count on the node as `similar_claims` (0 means none of its claims has a similar claim).
# NOTE(review): the pattern is undirected and does not restrict who submitted the other claim,
# so a pair submitted by the same provider is presumably counted twice - confirm this is intended.
similar_claims_query = '''
    match (p:Provider)
    with p, size([(c2)-[:IS_SIMILAR_TO]-(c1:Claim)-[:SUBMITTED_BY]->(p) | 1] ) as  similar_claims 
    set p.similar_claims = similar_claims
'''
gds.run_cypher(similar_claims_query)

In [None]:
# Reuse the claim communities (based on diagnosis codes): for every Provider, store the number
# of distinct `community_id` values found on the nodes that were SUBMITTED_BY it.
# Requires `community_id` to have been written on those nodes beforehand.
claim_communities_query = ''' 
    match (p:Provider)<-[:SUBMITTED_BY]-(c)
    with p, count(distinct c.community_id) as number_of_claim_communities
    set p.number_of_claim_communities = number_of_claim_communities
'''
gds.run_cypher(claim_communities_query)

### Using graph algorithms

#### Pagerank

In [None]:
# Project a provider-to-provider graph for PageRank: two providers are linked when claims
# submitted by each of them point to the same Physician or Beneficiary node.
# `weight` is the number of distinct shared Physician/Beneficiary nodes.
# NOTE(review): the PageRank calls in the following cells do not pass a relationship weight
# property, so this `weight` is presumably unused - confirm whether weighted PageRank was intended.
provider_nodes_query = ''' 
        match (n:Provider) return id(n) as id
    '''
provider_links_query = '''
        match (p1:Provider)<-[:SUBMITTED_BY]-()-->(n:Physician|Beneficiary)<--()-[:SUBMITTED_BY]->(p2)
        where p1<>p2
        return id(p1) as source, id(p2) as target, count(distinct n) as weight
    '''

g_pagerank, project_stats = gds.graph.project.cypher(
    'g_pagerank',
    provider_nodes_query,
    provider_links_query,
)
project_stats

In [None]:
# PageRank in stats mode: returns summary statistics only, nothing is written to the database.
pagerank_stats_config = {'maxIterations': 100}
gds.pageRank.stats(g_pagerank, **pagerank_stats_config)

In [None]:
# PageRank in write mode: persists each provider's score on its node as the `pagerank` property.
pagerank_write_config = {
    'maxIterations': 100,
    'writeProperty': 'pagerank',
}
gds.pageRank.write(g_pagerank, **pagerank_write_config)

In [None]:
# Drop the 'g_pagerank' projection now that the scores are written; this also frees the graph
# name so the projection cell can be re-run.
g_pagerank.drop()

#### Community (take 2)

In [None]:
# Native projection for provider community detection.
# SUBMITTED_BY is reversed so that, inside the projection, it points from Provider to Claim and
# can be chained with the Claim -> Diagnosis relationships in the next step.
community_node_labels = ['Provider', 'Claim', 'Diagnosis']
community_relationships = [
    {
        'SUBMITTED_BY': {'orientation': 'REVERSE'},
        'HAS_DIAGNOSIS_CODE_OF': {'orientation': 'NATURAL'},
        'HAS_GROUP_CODE_OF': {'orientation': 'NATURAL'},
    }
]

g_provider_community, project_stats = gds.graph.project(
    'g_provider_community',
    community_node_labels,
    community_relationships,
)
project_stats

In [None]:
# Collapse the two-hop paths Provider -> Claim -> Diagnosis into a single in-memory
# relationship PROVIDER_FOR_DIAGNOSIS (added to the projection only, not to the database).
provider_to_diagnosis_paths = [
    ['SUBMITTED_BY', 'HAS_DIAGNOSIS_CODE_OF'],
    ['SUBMITTED_BY', 'HAS_GROUP_CODE_OF'],
]
gds.beta.collapsePath.mutate(
    g_provider_community,
    pathTemplates=provider_to_diagnosis_paths,
    mutateRelationshipType='PROVIDER_FOR_DIAGNOSIS',
)

In [None]:
# Louvain in stats mode on the Provider/Diagnosis subgraph connected by the collapsed
# relationship: returns summary statistics only, nothing is written.
louvain_scope = {
    'nodeLabels': ['Provider', 'Diagnosis'],
    'relationshipTypes': ['PROVIDER_FOR_DIAGNOSIS'],
}
gds.louvain.stats(g_provider_community, **louvain_scope)

In [None]:
# Louvain in write mode on the same subgraph: persists the community of every included node
# as the `providerCommunityId` property.
louvain_write_config = {
    'nodeLabels': ['Provider', 'Diagnosis'],
    'relationshipTypes': ['PROVIDER_FOR_DIAGNOSIS'],
    'writeProperty': 'providerCommunityId',
}
gds.louvain.write(g_provider_community, **louvain_write_config)

In [None]:
# Drop the 'g_provider_community' projection now that the community ids are written; this also
# frees the graph name so the projection cell can be re-run.
g_provider_community.drop()

In [None]:
# Loading providers that submitted 2 similar claims
graph_features = gds.run_cypher('''
    match (p:Provider)
    return  p.id as Provider,
            p.similar_claims as similar_claims,
            p.number_of_claim_communities as number_of_claim_communities,
            p.providerCommunityId as providerCommunityId,
            p.pagerank as pagerank
''')

In [None]:
# Add the graph features to the train and test sets.
#
# * how='left' keeps every provider of the split, in its original order, so the rows stay
#   aligned with y_train / y_test.
# * validate='many_to_one' fails fast if a Provider occurs more than once in graph_features;
#   without it, duplicates would silently multiply rows and break that alignment.
# * Providers without a value for a graph feature get 0.
#   NOTE(review): 0 is also used for a missing providerCommunityId, which could coincide with
#   a real community id - confirm this is acceptable.
merge_options = {'on': 'Provider', 'how': 'left', 'validate': 'many_to_one'}

X_train_enhanced = X_train.merge(graph_features, **merge_options).fillna(0)
X_test_enhanced = X_test.merge(graph_features, **merge_options).fillna(0)

### Model building

In [None]:
# Model for the graph-enhanced feature set: same hyper-parameters as the baseline random
# forest, so any difference in performance comes from the additional features.
rf_enhanced_params = {
    'random_state': 42,
    'max_depth': 4,
    'min_samples_leaf': 30,
    'max_features': 4,
}
rf_classifier_enhanced = RandomForestClassifier(**rf_enhanced_params)

In [None]:
# Backward sequential feature selection for the graph-enhanced model.
# The selector wraps `rf_classifier_enhanced` (not the baseline estimator) and is stored under
# its own name, so the baseline selector `sfs` from the earlier cell is left untouched.
# 'Provider' is only an identifier and is removed before the selector sees the data.
X_train_enhanced_features = X_train_enhanced.drop('Provider', axis=1)

sfs_enhanced = SFS(rf_classifier_enhanced,
                   direction="backward",
                   scoring='roc_auc',  # metric to optimise when deciding which features to keep
                   cv=2)

sfs_enhanced = sfs_enhanced.fit(np.array(X_train_enhanced_features), y_train)

# Selected columns (support_ is the boolean keep-mask of the fitted selector)
selected_columns_enhanced = X_train_enhanced_features.columns[list(sfs_enhanced.support_)]
print(selected_columns_enhanced)

In [None]:
# Cross-validated ROC AUC of the graph-enhanced model.
# The scores are computed on the *selected* enhanced columns, i.e. the same columns the
# enhanced model is fitted on and the same procedure as the baseline, so the final
# cross-validation comparison is like-for-like.
# NOTE(review): `cross_validate` presumably comes from lib.ml_helper; the original note says
# it runs 5 folds - confirm in the helper.
rf_classifier_enhanced_roc_auc_scores = cross_validate(
    rf_classifier_enhanced,
    X_train_enhanced[selected_columns_enhanced],
    y_train,
    metric='roc_auc',
)

In [None]:
# Fit the graph-enhanced model on the training data, restricted to its selected features.
X_train_enhanced_selected = X_train_enhanced[selected_columns_enhanced]
fitted_rf_classifier_enhanced = fit_model(rf_classifier_enhanced, X_train_enhanced_selected, y_train)

In [None]:
# Feature importances of the fitted graph-enhanced model, labelled with the selected column names.
enhanced_feature_names = X_train_enhanced[selected_columns_enhanced].columns
get_features_importance(fitted_rf_classifier_enhanced, enhanced_feature_names)

In [None]:
# Evaluation on the train set (confusion matrix of the graph-enhanced model)
X_train_enhanced_selected = X_train_enhanced[selected_columns_enhanced]
plot_confusion_matrix(fitted_rf_classifier_enhanced, X_train_enhanced_selected, y_train)

In [None]:
# Evaluation on the held-out test set (confusion matrix of the graph-enhanced model)
X_test_enhanced_selected = X_test_enhanced[selected_columns_enhanced]
plot_confusion_matrix(fitted_rf_classifier_enhanced, X_test_enhanced_selected, y_test)

### Models comparison

##### Train set comparison

In [None]:
# Side-by-side confusion matrices of the baseline and graph-enhanced models on the train set.
# Each model receives the feature columns it was fitted on.
train_comparison_inputs = {
    'X': X_train[selected_columns],
    'X_enhanced': X_train_enhanced[selected_columns_enhanced],
    'y': y_train,
}
plot_confusion_matrix_comparison(
    fitted_model_a=fitted_rf_classifier,
    fitted_model_b=fitted_rf_classifier_enhanced,
    **train_comparison_inputs,
)

##### Test set comparison

In [None]:
# Side-by-side confusion matrices of the baseline and graph-enhanced models on the test set.
# Each model receives the feature columns it was fitted on.
test_comparison_inputs = {
    'X': X_test[selected_columns],
    'X_enhanced': X_test_enhanced[selected_columns_enhanced],
    'y': y_test,
}
plot_confusion_matrix_comparison(
    fitted_model_a=fitted_rf_classifier,
    fitted_model_b=fitted_rf_classifier_enhanced,
    **test_comparison_inputs,
)

#### Cross validation comparison

In [None]:
# Compare the cross-validation ROC AUC results of the baseline and graph-enhanced models.
# NOTE(review): `model_names` receives the fitted estimator objects rather than strings; if the
# helper uses them as labels, both models will presumably be shown with the same repr - confirm.
compared_models = [fitted_rf_classifier, fitted_rf_classifier_enhanced]
compared_cv_results = [rf_classifier_roc_auc_scores, rf_classifier_enhanced_roc_auc_scores]
compare_models(model_names=compared_models, cv_results=compared_cv_results)