## ```Imports```
---

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

import pickle

In [107]:
# Load the transaction data and keep only the rows whose `state` is 'NY'.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it points at a shared/relative data location.
csv_path = '/Users/crivera/Desktop/capstone-dsir82/capstone_data/final_model.csv'
fraud_df = pd.read_csv(csv_path)
is_ny = fraud_df['state'] == 'NY'
fraud_df = fraud_df[is_ny]

In [109]:
# Build the encoded frame from a copy of fraud_df: one-hot encode the
# categorical columns first, then drop the raw columns listed below.
categorical_cols = ['gender','city','category','merchant']
# Includes 'is_fraud' (presumably the target label — confirm).
dropped_cols = ['trans_date_trans_time','lat','long','job','merch_lat','merch_long','dob','is_fraud','state']

fraud = (
    pd.get_dummies(data=fraud_df.copy(), columns=categorical_cols)
    .drop(columns=dropped_cols)
)
fraud.shape

(83501, 781)

In [112]:
# Load the pickled model object from the app_model directory
# (presumably the fitted decision tree for NY — confirm).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this project.
model_path = 'app_model/dt_ny.pkl'
with open(model_path, 'rb') as pickle_in:
    dt = pickle.load(pickle_in)

In [117]:
# Number of input features the loaded model expects.
# `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `n_features_in_` and only fall back to the old attribute for models
# that do not expose the new one.
n_model_features = dt.n_features_in_ if hasattr(dt, 'n_features_in_') else dt.n_features_
n_model_features

781

## ```App Preparations```
---
1. Prepare tuples for the Streamlit selectboxes
2. Prepare the city and merchant index dicts

In [124]:
# Alphabetically sorted list of the distinct city names in fraud_df.
sorted(fraud_df['city'].unique())

['Albany',
 'Allentown',
 'Altona',
 'Armonk',
 'Barneveld',
 'Beacon',
 'Belfast',
 'Bellmore',
 'Big Indian',
 'Breesport',
 'Bronx',
 'Brooklyn',
 'Brownville',
 'Camden',
 'Chatham',
 'Cottekill',
 'Cowlesville',
 'De Lancey',
 'Downsville',
 'East Rochester',
 'Falconer',
 'Farmington',
 'Garrattsville',
 'Greenport',
 'Hannawa Falls',
 'Howes Cave',
 'Hudson',
 'Irvington',
 'Jefferson',
 'Jordanville',
 'Knowlesville',
 'Lowville',
 'Margaretville',
 'Medford',
 'Melville',
 'Montrose',
 'Moriches',
 'Mount Morris',
 'Mount Vernon',
 'Nanuet',
 'New York City',
 'Newark Valley',
 'North Tonawanda',
 'Northport',
 'Oak Hill',
 'Oakdale',
 'Orangeburg',
 'Orient',
 'Oriskany Falls',
 'Palmyra',
 'Phelps',
 'Pomona',
 'Port Ewen',
 'Port Gibson',
 'Rock Tavern',
 'Saint Bonaventure',
 'Shelter Island',
 'South Richmond Hill',
 'Springfield Gardens',
 'Stittville',
 'Tupper Lake',
 'West Chazy',
 'West Eaton',
 'West Harrison',
 'West Henrietta',
 'West Sayville',
 'Westhampton Beac

In [44]:
# Alphabetically sorted list of the distinct transaction categories.
# Read from fraud_df: `fraud` has already been one-hot encoded, so it no
# longer has a raw 'category' column and indexing it raises KeyError.
sorted(fraud_df['category'].unique())

['entertainment',
 'food_dining',
 'gas_transport',
 'grocery_net',
 'grocery_pos',
 'health_fitness',
 'home',
 'kids_pets',
 'misc_net',
 'misc_pos',
 'personal_care',
 'shopping_net',
 'shopping_pos',
 'travel']

In [74]:
# Number of distinct merchants.
# Read from fraud_df: `fraud` has already been one-hot encoded, so it no
# longer has a raw 'merchant' column and indexing it raises KeyError.
len(fraud_df['merchant'].unique())

693

In [122]:
# Map each X_train column whose name starts with 'city' to its position
# within that group of columns.
# NOTE(review): the prefix test also matches 'city_pop' (it appears at index 0
# in the output), which is presumably a numeric column rather than a city
# dummy. It is kept here because removing it would shift every index by one;
# confirm what the consumer of this dict expects.
cities = [col for col in X_train.columns if col.startswith('city')]
# Size the index from the matched columns instead of a hard-coded 67: zip()
# stops at the shorter input, so a fixed range silently drops trailing
# columns whenever more than 67 names match.
idx = list(range(len(cities)))
city_idx_dict = dict(zip(cities, idx))
city_idx_dict

{'city_pop': 0,
 'city_Albany': 1,
 'city_Allentown': 2,
 'city_Altona': 3,
 'city_Armonk': 4,
 'city_Barneveld': 5,
 'city_Beacon': 6,
 'city_Belfast': 7,
 'city_Bellmore': 8,
 'city_Big Indian': 9,
 'city_Breesport': 10,
 'city_Bronx': 11,
 'city_Brooklyn': 12,
 'city_Brownville': 13,
 'city_Camden': 14,
 'city_Chatham': 15,
 'city_Cottekill': 16,
 'city_Cowlesville': 17,
 'city_De Lancey': 18,
 'city_Downsville': 19,
 'city_East Rochester': 20,
 'city_Falconer': 21,
 'city_Farmington': 22,
 'city_Garrattsville': 23,
 'city_Greenport': 24,
 'city_Hannawa Falls': 25,
 'city_Howes Cave': 26,
 'city_Hudson': 27,
 'city_Irvington': 28,
 'city_Jefferson': 29,
 'city_Jordanville': 30,
 'city_Knowlesville': 31,
 'city_Lowville': 32,
 'city_Margaretville': 33,
 'city_Medford': 34,
 'city_Melville': 35,
 'city_Montrose': 36,
 'city_Moriches': 37,
 'city_Mount Morris': 38,
 'city_Mount Vernon': 39,
 'city_Nanuet': 40,
 'city_New York City': 41,
 'city_Newark Valley': 42,
 'city_North Tonawanda

In [56]:
# Count the distinct categories from fraud_df: `fraud` has already been
# one-hot encoded, so it no longer has a raw 'category' column.
n_categories = len(fraud_df['category'].unique())
print(n_categories)
print(X_train.shape)
# Show the last `n_categories` column names of X_train. The slice bounds are
# derived from the data rather than hard-coded as 782-14:782.
n_columns = X_train.shape[1]
X_train.columns[n_columns - n_categories:n_columns]

14
(116124, 782)


Index(['merchant_fraud_Wuckert, Wintheiser and Friesen',
       'merchant_fraud_Wuckert-Goldner', 'merchant_fraud_Wuckert-Walter',
       'merchant_fraud_Yost, Block and Koepp',
       'merchant_fraud_Yost, Schamberger and Windler',
       'merchant_fraud_Yost-Rogahn', 'merchant_fraud_Zboncak LLC',
       'merchant_fraud_Zboncak Ltd',
       'merchant_fraud_Zboncak, Rowe and Murazik',
       'merchant_fraud_Zemlak Group',
       'merchant_fraud_Zemlak, Tillman and Cremin',
       'merchant_fraud_Ziemann-Waters',
       'merchant_fraud_Zieme, Bode and Dooley', 'merchant_fraud_Zulauf LLC'],
      dtype='object')

In [120]:
# Map each X_train column whose name starts with 'merchant' to its position
# within that group of columns.
merchant = [col for col in X_train.columns if col.startswith('merchant')]
# Size the index from the matched columns instead of a hard-coded 693: zip()
# stops at the shorter input, so a fixed range would silently drop trailing
# merchants if the data ever contains more of them.
idx = list(range(len(merchant)))
merchant_idx_dict = dict(zip(merchant, idx))
merchant_idx_dict

{'merchant_fraud_Abbott-Rogahn': 0,
 'merchant_fraud_Abbott-Steuber': 1,
 'merchant_fraud_Abernathy and Sons': 2,
 'merchant_fraud_Abshire PLC': 3,
 'merchant_fraud_Adams, Kovacek and Kuhlman': 4,
 'merchant_fraud_Adams-Barrows': 5,
 'merchant_fraud_Altenwerth, Cartwright and Koss': 6,
 'merchant_fraud_Altenwerth-Kilback': 7,
 'merchant_fraud_Ankunding LLC': 8,
 'merchant_fraud_Ankunding-Carroll': 9,
 'merchant_fraud_Armstrong, Walter and Gottlieb': 10,
 'merchant_fraud_Auer LLC': 11,
 'merchant_fraud_Auer-Mosciski': 12,
 'merchant_fraud_Auer-West': 13,
 'merchant_fraud_Bahringer Group': 14,
 'merchant_fraud_Bahringer, Bergnaum and Quitzon': 15,
 'merchant_fraud_Bahringer, Osinski and Block': 16,
 'merchant_fraud_Bahringer, Schoen and Corkery': 17,
 'merchant_fraud_Bahringer-Larson': 18,
 'merchant_fraud_Bahringer-Streich': 19,
 'merchant_fraud_Bailey-Morar': 20,
 'merchant_fraud_Balistreri-Nader': 21,
 'merchant_fraud_Barrows PLC': 22,
 'merchant_fraud_Bartoletti and Sons': 23,
 'merc