In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm

# Function to generate data with specified statistical measures
def generate_data(mean, std_dev, min_value, max_value, percentiles, percentile_values, num_samples=2500,
                  num_iterations=2500):
    """Draw a synthetic 1-D sample steered towards the given summary statistics.

    The sample starts as ``N(mean, std_dev)``, is repeatedly shifted towards the
    requested percentile values, clipped to ``[min_value, max_value]``,
    standardised back to ``mean`` / ``std_dev`` and finally floored at zero.

    Args:
        mean: Target mean of the returned sample.
        std_dev: Target standard deviation of the returned sample.
        min_value: Lower clipping bound applied before the final rescale.
        max_value: Upper clipping bound applied before the final rescale.
        percentiles: Percentile ranks (0-100) to steer, e.g. ``[25, 50, 75]``.
        percentile_values: Desired value for each rank in ``percentiles``.
        num_samples: Number of values to generate.
        num_iterations: Number of adjustment passes over all percentiles.

    Returns:
        A float ``numpy.ndarray`` of length ``num_samples`` with no negative values.

    Raises:
        ValueError: If ``percentiles`` and ``percentile_values`` differ in length.

    Notes:
        * Each adjustment adds one scalar to the whole sample, so the passes move
          the location of the sample but do not change its shape.
        * The rescale and zero-floor run after the min/max clip, so the result is
          not guaranteed to stay inside ``[min_value, max_value]``.
        * Randomness comes from the global ``np.random`` state; seed it with
          ``np.random.seed`` for reproducible output.
    """
    if len(percentiles) != len(percentile_values):
        raise ValueError(
            "percentiles and percentile_values must have the same length, "
            f"got {len(percentiles)} and {len(percentile_values)}"
        )

    # Generate initial data assuming a normal distribution
    data = np.random.normal(loc=mean, scale=std_dev, size=num_samples)

    def adjust_percentiles(data, percentiles, percentile_values):
        """Shift ``data`` in place towards each requested percentile in turn."""
        for pct, desired_value in zip(percentiles, percentile_values):
            current_value = np.percentile(data, pct)
            adjustment = desired_value - current_value
            # The random factor keeps the shift from landing exactly on the target
            data += adjustment * np.random.uniform(0.5, 1.5)
        return data

    # Iteratively adjust data to match the specified percentiles
    for _ in range(num_iterations):
        data = adjust_percentiles(data, percentiles, percentile_values)

    # Clip the data to ensure it falls within the specified min and max range
    data = np.clip(data, min_value, max_value)

    # Rescale data to match the specified mean and standard deviation.
    # A sample that clipping collapsed to a single value has zero spread;
    # dividing by it would turn every entry into NaN, so place it on the mean.
    if data.size and data.min() == data.max():
        data = np.full(data.shape, float(mean))
    else:
        data = (data - np.mean(data)) / np.std(data) * std_dev + mean

    # Ensure all values are non-negative
    data = np.clip(data, 0, None)

    return data

# Summary statistics for the numeric features of the mule population.
# Each entry is laid out in generate_data's positional order:
#   (mean, std_dev, min_value, max_value, percentiles, percentile_values)
variables = {
    'BAL_ACCT_MIN_REQD': (2144, 3811, 0, 30000, [25, 50, 75], [1000, 2000, 2000]),
    'N_TXN_AMT': (5362, 16947, 0, 995684, [25, 50, 75], [100, 600, 3000]),
    'BAL_LAST_STMNT': (18055.8, 58088, 0.0, 3116780, [25, 50, 75], [0, 1240, 16115]),
    'COUNT_TXN_PAST_3_DAYS_DEBIT': (73.2, 80, 0.0, 1275, [25, 50, 75], [29, 51, 89]),
    'COUNT_TXN_PAST_4_DAYS_CREDIT': (655, 1312, 0.0, 10663, [25, 50, 75], [72, 154, 628]),
    'SUM_TXN_PAST_6_DAYS_DEBIT': (1085580, 1022951, 0.0, 8943068, [25, 50, 75], [386282, 777815.8, 1451057]),
    'SUM_ATM_TXN_PAST_6_DAYS': (138943, 142317, 100.0, 1140000, [25, 50, 75], [20000.0, 90000.0, 220500.0]),
    'COUNT_ATM_TXN_PAST_7TO30DAYS': (2.947, 13.78, 0.0, 320.0, [25, 50, 75], [0.0, 0.0, 0.0]),
    'COUNT_TXN_PAST_HOURS': (39.8, 104.3, 1.0, 1666.0, [25, 50, 75], [5.0, 10.0, 25.0]),
    'SUM_TXN_PAST_HOURS': (69169, 109100, 1.0, 1194081, [25, 50, 75], [8000, 26009, 53000]),
    'COUNT_TXN_PAST_HOURS_DEBIT': (3.10, 4.21, 0.0, 64, [25, 50, 75], [0.0, 2.0, 4.0]),
    'SUM_TXN_PAST_HOURS_DEBIT': (44511.49, 62616, 1, 1194081, [25, 50, 75], [8000, 26009, 53000]),
    'COUNT_TXN_PAST_HOURS_CREDIT': (36, 104, 0.0, 1662, [25, 50, 75], [2.0, 6.0, 22.0]),
    'SUM_TXN_PAST_HOURS_CREDIT': (40416, 61516.99, 1.0, 1227024, [25, 50, 75], [7700, 21300, 48503]),
    'DIGIT_SUM': (4.73, 4.8, 0.0, 46, [25, 50, 75], [1.0, 3.0, 6.0]),
    'AVG_DIGIT_SUM': (4.6, 1.75, 1.0, 25.0, [25, 50, 75], [3.9, 4.63, 5.28]),
    'NUM_DIGIT': (3.41, 1.065, 1.0, 6, [25, 50, 75], [3.0, 3.0, 4.0]),
    'AVG_NUM_DIGIT': (3.41, 0.29, 1.0, 5.5, [25, 50, 75], [3.2, 3.4, 3.6]),
    'AGE_OF_ACCT': (5.23, 20.64, 0.0, 368, [25, 50, 75], [1.0, 1.0, 2.0]),
    'AGE_OF_CUSTOMER': (28, 11.3, 17.91, 224, [25, 50, 75], [21.58, 25.5, 32.166]),
    '7D_1D_CR_1T_PARTIES': (586, 1174, 0.0, 8541, [25, 50, 75], [53, 106, 446]),
    '14D_7D_CR_1T_PARTIES': (171, 708, 0.0, 8025, [25, 50, 75], [0.0, 5.0, 86.0]),
    '7D_1D_CR_5T_PARTIES': (19.3, 37.9, 0.0, 328, [25, 50, 75], [2.0, 7.0, 18.0]),
}

# One simulated column per feature, drawn in the dict's insertion order so the
# global random stream is consumed exactly as a plain loop would consume it.
data_dict = {
    feature: generate_data(*spec)
    for feature, spec in variables.items()
}

Mule_df_num = pd.DataFrame(data_dict)


import numpy as np
import pandas as pd

# Category levels and sampling probabilities for the mule population.
# Every probability list lines up position-by-position with the level list
# directly above it.
TYPE_OF_TXN = ['CREDIT', 'DEBIT']
percentages_VAR1 = [0.7, 0.30]

YR_OF_JOINING = ['2024', '2023', '2022', '2021', '2020']
percentages_VAR2 = [0.95, 0.02, 0.01, 0.01, 0.01]

CHEQ_ENABLED = ['N', 'Y']
percentages_VAR3 = [0.53, 0.47]

PASSBOOK = ['N', 'Y']
percentages_VAR4 = [0.14, 0.86]

BHIM_QR = ['N', 'Y']
percentages_VAR5 = [0.92, 0.08]

IB_REG = ['N', 'Y']
percentages_VAR6 = [0.48, 0.52]

DB_FLG = ['N', 'Y']
percentages_VAR7 = [0.07, 0.93]

VALID_MB = ['N', 'Y']
percentages_VAR8 = [0.06, 0.94]

AGE_OF_ACCOUNT = ['0', '1', '2', '3', '4', '5', '6', '7', 'REST']
percentages_VAR9 = [0.16, 0.42, 0.19, 0.04, 0.04, 0.04, 0.04, 0.04, 0.03]

STATES = [
    'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
    'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana',
    'Himachal Pradesh', 'Jharkhand', 'Karnataka', 'Kerala',
    'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya',
    'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
    'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana',
    'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal',
]
percentages_VAR10 = [
    0.03, 0.03, 0.05, 0.3,
    0.1, 0.01, 0.01, 0.01,
    0.01, 0.1, 0.01, 0.01,
    0.1, 0.01, 0.01, 0.01,
    0.01, 0.01, 0.01, 0.01,
    0.01, 0.01, 0.01, 0.01,
    0, 0.1, 0.01, 0.01,
]

# Number of samples to generate
num_samples = 2500

# Draw each categorical column, one after another, in the listed order.
(data_var1, data_var2, data_var3, data_var4, data_var5,
 data_var6, data_var7, data_var8, data_var9, data_var10) = [
    np.random.choice(levels, size=num_samples, p=weights)
    for levels, weights in (
        (TYPE_OF_TXN, percentages_VAR1),
        (YR_OF_JOINING, percentages_VAR2),
        (CHEQ_ENABLED, percentages_VAR3),
        (PASSBOOK, percentages_VAR4),
        (BHIM_QR, percentages_VAR5),
        (IB_REG, percentages_VAR6),
        (DB_FLG, percentages_VAR7),
        (VALID_MB, percentages_VAR8),
        (AGE_OF_ACCOUNT, percentages_VAR9),
        (STATES, percentages_VAR10),
    )
]

# Assemble the categorical frame, keeping the column order used downstream.
Mule_df_cat = pd.DataFrame(dict(zip(
    ['TYPE_OF_TXN', 'YR_OF_JOINING', 'CHEQ_ENABLED', 'PASSBOOK', 'BHIM_QR',
     'IB_REG', 'DB_FLG', 'VALID_MB', 'AGE_OF_ACCOUNT', 'STATES'],
    [data_var1, data_var2, data_var3, data_var4, data_var5,
     data_var6, data_var7, data_var8, data_var9, data_var10],
)))

# Numeric and categorical features side by side
Mule_merged_df = pd.concat([Mule_df_num, Mule_df_cat], axis=1)

# Customer ids C1, C2, C3, ... and the positive class label
Mule_merged_df['CUSTID'] = [f'C{i}' for i in range(1, len(Mule_merged_df) + 1)]
Mule_merged_df['MULE_STATUS'] = 1

# Move CUSTID and MULE_STATUS to the front.
# NOTE: only CUSTID is filtered out of the remaining columns, so MULE_STATUS
# is selected twice and the resulting frame carries it both as the second and
# as the last column.
cols = ['CUSTID', 'MULE_STATUS'] + [col for col in Mule_merged_df.columns if col != 'CUSTID']
Mule_merged_df = Mule_merged_df[cols]


import numpy as np
import pandas as pd
from scipy.stats import norm

# Function to generate data with specified statistical measures
def generate_data(mean, std_dev, min_value, max_value, percentiles, percentile_values, num_samples=2500,
                  num_iterations=2500):
    """Draw a synthetic 1-D sample steered towards the given summary statistics.

    The sample starts as ``N(mean, std_dev)``, is repeatedly shifted towards the
    requested percentile values, clipped to ``[min_value, max_value]``,
    standardised back to ``mean`` / ``std_dev`` and finally floored at zero.

    Args:
        mean: Target mean of the returned sample.
        std_dev: Target standard deviation of the returned sample.
        min_value: Lower clipping bound applied before the final rescale.
        max_value: Upper clipping bound applied before the final rescale.
        percentiles: Percentile ranks (0-100) to steer, e.g. ``[25, 50, 75]``.
        percentile_values: Desired value for each rank in ``percentiles``.
        num_samples: Number of values to generate.
        num_iterations: Number of adjustment passes over all percentiles.

    Returns:
        A float ``numpy.ndarray`` of length ``num_samples`` with no negative values.

    Raises:
        ValueError: If ``percentiles`` and ``percentile_values`` differ in length.

    Notes:
        * Each adjustment adds one scalar to the whole sample, so the passes move
          the location of the sample but do not change its shape.
        * The rescale and zero-floor run after the min/max clip, so the result is
          not guaranteed to stay inside ``[min_value, max_value]``.
        * Randomness comes from the global ``np.random`` state; seed it with
          ``np.random.seed`` for reproducible output.
    """
    if len(percentiles) != len(percentile_values):
        raise ValueError(
            "percentiles and percentile_values must have the same length, "
            f"got {len(percentiles)} and {len(percentile_values)}"
        )

    # Generate initial data assuming a normal distribution
    data = np.random.normal(loc=mean, scale=std_dev, size=num_samples)

    def adjust_percentiles(data, percentiles, percentile_values):
        """Shift ``data`` in place towards each requested percentile in turn."""
        for pct, desired_value in zip(percentiles, percentile_values):
            current_value = np.percentile(data, pct)
            adjustment = desired_value - current_value
            # The random factor keeps the shift from landing exactly on the target
            data += adjustment * np.random.uniform(0.5, 1.5)
        return data

    # Iteratively adjust data to match the specified percentiles
    for _ in range(num_iterations):
        data = adjust_percentiles(data, percentiles, percentile_values)

    # Clip the data to ensure it falls within the specified min and max range
    data = np.clip(data, min_value, max_value)

    # Rescale data to match the specified mean and standard deviation.
    # A sample that clipping collapsed to a single value has zero spread;
    # dividing by it would turn every entry into NaN, so place it on the mean.
    if data.size and data.min() == data.max():
        data = np.full(data.shape, float(mean))
    else:
        data = (data - np.mean(data)) / np.std(data) * std_dev + mean

    # Ensure all values are non-negative
    data = np.clip(data, 0, None)

    return data

# Summary statistics for the numeric features of the non-mule population.
# Each entry is laid out in generate_data's positional order:
#   (mean, std_dev, min_value, max_value, percentiles, percentile_values)
variables = {
    'BAL_ACCT_MIN_REQD': (3479, 6304, 0, 100000, [25, 50, 75], [500, 2000, 2000]),
    'N_TXN_AMT': (21287.81, 2935629.0, 0, 1310500000, [25, 50, 75], [100, 500, 2000]),
    'BAL_LAST_STMNT': (103339.15, 2751994.0, 0.0, 546822700, [25, 50, 75], [6.76, 1535.2, 12503.23]),
    'COUNT_TXN_PAST_3_DAYS_DEBIT': (54.19, 99.0, 0.0, 2232, [25, 50, 75], [13.0, 25.0, 52.0]),
    'COUNT_TXN_PAST_4_DAYS_CREDIT': (254.19, 949.0, 0.0, 16957, [25, 50, 75], [14.0, 44.0, 134.0]),
    'SUM_TXN_PAST_6_DAYS_DEBIT': (202100.0, 519759.5, 0.94, 1310500000, [25, 50, 75], [54039.0, 162045.0, 528246.8]),
    'SUM_ATM_TXN_PAST_6_DAYS': (44260.0, 49371.3, 100.0, 1120000.0, [25, 50, 75], [6000.0, 20000.0, 56000.0]),
    'COUNT_ATM_TXN_PAST_7TO30DAYS': (3.99, 12.661, 0.0, 320.0, [25, 50, 75], [0.0, 1.0, 3.0]),
    'COUNT_TXN_PAST_HOURS': (15.34, 96.64, 1.0, 3080.0, [25, 50, 75], [2.0, 4.0, 8.0]),
    'SUM_TXN_PAST_HOURS': (64141.96, 3029704.0, 0.02, 1310500000, [25, 50, 75], [900.0, 5000.0, 21000.0]),
    'COUNT_TXN_PAST_HOURS_DEBIT': (3.49, 17.54, 0.0, 792, [25, 50, 75], [0.0, 1.0, 3.0]),
    'SUM_TXN_PAST_HOURS_DEBIT': (56452.86, 3639132.0, 0.02, 1310500000, [25, 50, 75], [500.0, 3936.0, 16000]),
    'COUNT_TXN_PAST_HOURS_CREDIT': (11.49, 17.54, 0.0, 792, [25, 50, 75], [0.0, 1.0, 3.0]),
    'SUM_TXN_PAST_HOURS_CREDIT': (36197.41, 723102.1, 1.0, 1350500000, [25, 50, 75], [1000.0, 4600.0, 16030]),
    'DIGIT_SUM': (6.03, 5.11, 0.0, 57, [25, 50, 75], [2.0, 5.0, 8.0]),
    'AVG_DIGIT_SUM': (5.9, 2.0, 1.0, 31.0, [25, 50, 75], [4.8, 5.59, 6.68]),
    'NUM_DIGIT': (3.26, 1.023, 1.0, 10, [25, 50, 75], [3.0, 3.0, 4.0]),
    'AVG_NUM_DIGIT': (3.26, 0.49, 1.0, 7.05, [25, 50, 75], [2.94, 3.23, 3.6]),
    'AGE_OF_ACCT': (13.79, 39.80, 0.0, 1334, [25, 50, 75], [4.0, 5.0, 7.0]),
    'AGE_OF_CUSTOMER': (32.72, 20.94, 3.33, 224.91, [25, 50, 75], [23.33, 29.0, 35.166]),
    '7D_1D_CR_1T_PARTIES': (134.8, 342.17, 0.0, 1855, [25, 50, 75], [9.0, 28.0, 76.0]),
    '14D_7D_CR_1T_PARTIES': (181.73, 558.46, 0.0, 3564.0, [25, 50, 75], [8.0, 26.0, 83.0]),
    '7D_1D_CR_5T_PARTIES': (4.5, 9.18, 0.0, 88.0, [25, 50, 75], [0.0, 1.0, 5.0]),
}

# One simulated column per feature, drawn in the dict's insertion order so the
# global random stream is consumed exactly as a plain loop would consume it.
data_dict = {
    feature: generate_data(*spec)
    for feature, spec in variables.items()
}

non_mule_df_num = pd.DataFrame(data_dict)

import numpy as np
import pandas as pd

# Category levels and sampling probabilities for the non-mule population.
# Every probability list lines up position-by-position with the level list
# directly above it.
TYPE_OF_TXN = ['CREDIT', 'DEBIT']
percentages_VAR1 = [0.53, 0.47]

YR_OF_JOINING = ['2024', '2023', '2022', '2021', '2020']
percentages_VAR2 = [0.81, 0.1, 0.05, 0.03, 0.01]

CHEQ_ENABLED = ['N', 'Y']
percentages_VAR3 = [0.39, 0.61]

PASSBOOK = ['N', 'Y']
percentages_VAR4 = [0.2, 0.8]

BHIM_QR = ['N', 'Y']
percentages_VAR5 = [0.78, 0.22]

IB_REG = ['N', 'Y']
percentages_VAR6 = [0.73, 0.27]

DB_FLG = ['N', 'Y']
percentages_VAR7 = [0.04, 0.96]

VALID_MB = ['N', 'Y']
percentages_VAR8 = [0.02, 0.98]

AGE_OF_ACCOUNT = ['0', '1', '2', '3', '4', '5', '6', '7', 'REST']
percentages_VAR9 = [0.0, 0.03, 0.04, 0.07, 0.16, 0.22, 0.20, 0.10, 0.18]

STATES = [
    'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
    'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana',
    'Himachal Pradesh', 'Jharkhand', 'Karnataka', 'Kerala',
    'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya',
    'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
    'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana',
    'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal',
]
percentages_VAR10 = [
    0.035, 0.035, 0.035, 0.04,
    0.04, 0.035, 0.035, 0.035,
    0.035, 0.035, 0.035, 0.035,
    0.035, 0.035, 0.035, 0.035,
    0.035, 0.035, 0.035, 0.035,
    0.04, 0.035, 0.035, 0.035,
    0.035, 0.04, 0.035, 0.035,
]

# Number of samples to generate
num_samples = 2500

# Draw each categorical column, one after another, in the listed order.
(data_var1, data_var2, data_var3, data_var4, data_var5,
 data_var6, data_var7, data_var8, data_var9, data_var10) = [
    np.random.choice(levels, size=num_samples, p=weights)
    for levels, weights in (
        (TYPE_OF_TXN, percentages_VAR1),
        (YR_OF_JOINING, percentages_VAR2),
        (CHEQ_ENABLED, percentages_VAR3),
        (PASSBOOK, percentages_VAR4),
        (BHIM_QR, percentages_VAR5),
        (IB_REG, percentages_VAR6),
        (DB_FLG, percentages_VAR7),
        (VALID_MB, percentages_VAR8),
        (AGE_OF_ACCOUNT, percentages_VAR9),
        (STATES, percentages_VAR10),
    )
]

# Assemble the categorical frame, keeping the column order used downstream.
NonMule_df_cat = pd.DataFrame(dict(zip(
    ['TYPE_OF_TXN', 'YR_OF_JOINING', 'CHEQ_ENABLED', 'PASSBOOK', 'BHIM_QR',
     'IB_REG', 'DB_FLG', 'VALID_MB', 'AGE_OF_ACCOUNT', 'STATES'],
    [data_var1, data_var2, data_var3, data_var4, data_var5,
     data_var6, data_var7, data_var8, data_var9, data_var10],
)))

# Numeric and categorical features side by side
NON_Mule_merged_df = pd.concat([non_mule_df_num, NonMule_df_cat], axis=1)

# Customer ids CN1, CN2, CN3, ... and the negative class label
NON_Mule_merged_df['CUSTID'] = [f'CN{i}' for i in range(1, len(NON_Mule_merged_df) + 1)]
NON_Mule_merged_df['MULE_STATUS'] = 0

# Move CUSTID and MULE_STATUS to the front.
# NOTE: only CUSTID is filtered out of the remaining columns, so MULE_STATUS
# is selected twice and the resulting frame carries it both as the second and
# as the last column.
cols = ['CUSTID', 'MULE_STATUS'] + [col for col in NON_Mule_merged_df.columns if col != 'CUSTID']
NON_Mule_merged_df = NON_Mule_merged_df[cols]

# Stack the mule rows on top of the non-mule rows. Each frame gets a fresh
# 0..n-1 index first; the stacked frame keeps those per-frame labels until the
# final reset below.
merged_df = pd.concat(
    [frame.reset_index(drop=True) for frame in (Mule_merged_df, NON_Mule_merged_df)],
    axis=0,
    ignore_index=False,
)

# Shuffle: sampling the full fraction returns every row in random order, after
# which the index is renumbered from zero.
merged_df = merged_df.sample(frac=1)
merged_df = merged_df.reset_index(drop=True)

# Import necessary libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd

# Rebuild the combined dataset: stack the mule rows on top of the non-mule
# rows, each with a freshly reset index.
merged_df = pd.concat([Mule_merged_df.reset_index(drop=True),
                       NON_Mule_merged_df.reset_index(drop=True)],
                      axis=0, ignore_index=False)

# Shuffle the rows (frac=1 returns every row in random order) and renumber them
merged_df = merged_df.sample(frac=1).reset_index(drop=True)

# Features: everything except the identifier and the label
X = merged_df.drop(['CUSTID', 'MULE_STATUS'], axis=1)

# Label: select MULE_STATUS by name rather than by position, so the target does
# not depend on which column happens to be last. The upstream column reorder
# can leave MULE_STATUS in the frame twice; a by-name lookup then returns a
# two-column frame of identical values, so keep the first one.
y = merged_df['MULE_STATUS']
if isinstance(y, pd.DataFrame):
    y = y.iloc[:, 0]

# One-hot encode the categorical features
merged_df_encoded = pd.get_dummies(X, columns=['TYPE_OF_TXN', 'YR_OF_JOINING', 'CHEQ_ENABLED', 'PASSBOOK', 'BHIM_QR',
       'IB_REG', 'DB_FLG', 'VALID_MB', 'AGE_OF_ACCOUNT', 'STATES'])

from sklearn.model_selection import train_test_split

# Split the dataset into train and test samples (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(merged_df_encoded, y, test_size=0.2, random_state=42)


from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd

# Expects X_train, y_train, X_test and y_test to exist already.

# Candidate classifiers, all with library-default hyperparameters
models = {
    'XGBoost': XGBClassifier(),
    'SVM': SVC(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
}

# Fit every candidate on the training split and report its test-split metrics
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Binary-average scores for the positive class, plus the raw confusion matrix
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    conf_matrix = confusion_matrix(y_test, y_pred)

    report = [
        f'{name} Model:',
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'F1 Score: {f1}',
        f'Confusion Matrix:\n{conf_matrix}',
        '-----------------------------------',
    ]
    print('\n'.join(report))


# Refit every candidate and keep the one with the highest test-split F1 score.
# The running best starts at -inf so that a model is always selected, even when
# every F1 score is 0; on ties the earliest model in `models` wins.
best_model = None
best_f1_score = float('-inf')
model_scores = {}
best_model_instance = None

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='binary')
    model_scores[model_name] = f1
    if f1 > best_f1_score:
        best_f1_score = f1
        best_model = model_name
        best_model_instance = model

# Never write a placeholder to disk: without a selected model there is nothing
# meaningful to persist.
if best_model_instance is None:
    raise RuntimeError("No model was evaluated; nothing to save.")

# Print the best model and its F1 score
print(f"Best Model: {best_model} with F1 Score: {best_f1_score}")

# Summary of the run. It is kept in memory only; nothing below writes it out.
model_details = {
    'best_model': best_model,
    'best_f1_score': best_f1_score,
    'model_scores': model_scores
}

import pickle

# Save the best model as a pickle file
with open('best_model_ML_Classification.pkl', 'wb') as file:
    pickle.dump(best_model_instance, file)


XGBoost Model:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
[[486   0]
 [  0 514]]
-----------------------------------
SVM Model:
Accuracy: 0.804
Precision: 0.740909090909091
Recall: 0.9513618677042801
F1 Score: 0.8330494037478705
Confusion Matrix:
[[315 171]
 [ 25 489]]
-----------------------------------
Gradient Boosting Model:
Accuracy: 0.999
Precision: 0.9980582524271845
Recall: 1.0
F1 Score: 0.9990281827016522
Confusion Matrix:
[[485   1]
 [  0 514]]
-----------------------------------
Decision Tree Model:
Accuracy: 0.999
Precision: 0.9980582524271845
Recall: 1.0
F1 Score: 0.9990281827016522
Confusion Matrix:
[[485   1]
 [  0 514]]
-----------------------------------
Random Forest Model:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Confusion Matrix:
[[486   0]
 [  0 514]]
-----------------------------------
Best Model: XGBoost with F1 Score: 1.0


ModuleNotFoundError: No module named 'yaml'