# Modeling

**University of San Diego, M.S. Applied Data Science**

Lai Leng Chan, Minsu Kim, Christopher Garcia

This notebook contains predictive  models, model validation as well as model evalaution. 

In [5]:
# Import required libraries and packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

# Preprocessing Packages
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Modeling packages
from sklearn.metrics import log_loss 
from sklearn.metrics import precision_recall_curve, average_precision_score 
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report 

# Algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

# Set seed for reproducibility
import random
random.seed(101)

In [2]:
def scale_features(X, scaling_method='minmax'):
    """
    Scale the features in X using Standardization.

    Parameters:
        X (DataFrame): The features DataFrame to be scaled.
        scaling_method (str): The scaling used is Standardization

    Returns:
        DataFrame: The scaled features DataFrame.
    """

    if scaling_method not in ['standard']:
        raise ValueError("Invalid scaling_method. Options 'standard'.")
    else:
        scaler = StandardScaler()

    # Fit and transform the features using the chosen scaler
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    return X_scaled

### Read Data

In [4]:
df = pd.read_csv('/Users/christophergarcia/Documents/GitHub/MSADS-Capstone-CryptoCurrencyFraudDetection/Data/new_data_file.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/christophergarcia/Documents/GitHub/MSADS-Capstone-CryptoCurrencyFraudDetection/Data/new_data_file.csv'

## Train-Test Split

The data is split into <number> trianing set, <> for a validation set as well as <number> for a testing set. This ensures we have enough data set to test our models since we do not have a large number of observations. The data sets are then saved into separete csv files in order to prepare for modeling.

In [28]:
# Create training and testing datasets
# Save to data folder for modeling
# Set random number for reproducibiltiy
random_state = 111

# Features to be used for model
features = df[['total ether received', 'avg val received', 'Unique Received From Addresses', 'Time Diff between first and last (Mins)', 
               'Received Tnx', 'total transactions (including tnx to create contract', 'Avg min between received tnx', 'Sent tnx',
               'total ether balance', 'Avg min between sent tnx']]
target_feature = df['FLAG']

In [29]:
# Split the data into training (80%) and temporary rest (20%)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(features, target_feature, 
                                                              test_size=0.2, 
                                                              random_state=42,
                                                              stratify=target_feature)

# Split the temporary rest into validation (50%) and testing (50%)
X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, 
                                                  test_size=0.5, 
                                                  random_state=42)

In [30]:
# Check shape of data
print('Training Size:', len(X_train))
print('Validation Size:', len(X_val))
print('Testing Size', len(X_test))

Training Size: 3936
Validation Size: 3936
Testing Size 1969


In [42]:
# Save new features to 
X_scaled_train = scale_features(X_train, scaling_method='standard')
X_scaled_val = scale_features(X_val, scaling_method='standard')
X_scaled_test = scale_features(X_test, scaling_method='standard')

In [43]:
# Check shape of data
print('Training Size:', len(X_scaled_train))
print('Validation Size:', len(X_scaled_val))
print('Testing Size', len(X_scaled_test))

Training Size: 3936
Validation Size: 3936
Testing Size 1969


## Models

### Model 1: Logistic Regression

In [None]:
# Build the logistic regression
logistic_model = LogisticRegression(random_state=random_state)

# Fit the model to the training data
logistic_model.fit(X_scaled_train, y_train)

# Predictions on the validation data
y_val_pred = logistic_model.predict(X_scaled_val)

### Model 2: Random Forest

### Model 3

## Evaluation

### Confusion Matrix