# Workflow and Automation Functions

This notebook defines helper functions that will assist us with EDA, modeling, performance tuning, and interpretation of results for this project.

In [3]:
# importing relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests

## metrics
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    confusion_matrix,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
)

### Find NA values

This function will find all NA values, and capture the sum of NAs in their respective column, in descending order, **only** if there are NAs present.

**Input:** Dataframe <br />
**Output:** Series of NA counts for each column that contains NAs, or `0` if no column contains any

In [12]:
def na_only(df):
    """Count missing values per column, keeping only columns that have any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series or int
        NA counts indexed by column name, largest first, restricted to
        columns with at least one NA. The integer ``0`` is returned instead
        when no column has a missing value.
    """
    counts = df.isna().sum()
    # Sort first, then filter, so the surviving rows keep the sorted order.
    ranked = counts.sort_values(ascending=False)
    with_missing = ranked[ranked > 0]
    return 0 if with_missing.empty else with_missing

### Find Correlations

In [13]:
def find_corr(df, threshold):
    """Return the correlation matrix of ``df`` styled to highlight strong pairs.

    A cell is coloured red when the absolute correlation exceeds
    ``threshold`` and the value is not exactly 1; every other cell is
    coloured black.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose pairwise column correlations are computed with ``df.corr()``.
    threshold : float
        Cut-off applied to the absolute value of each correlation.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styled correlation matrix (rendered as a coloured table in Jupyter).
    """
    def _color(x):
        # `x != 1` keeps exact self-correlations from being flagged.
        strong = abs(x) > threshold and x != 1
        return 'color: red' if strong else 'color: black'

    styler = df.corr().style
    # pandas 2.1 renamed Styler.applymap to Styler.map and deprecated the old
    # name; prefer the new method and fall back for older pandas versions.
    apply_elementwise = getattr(styler, 'map', None) or styler.applymap
    return apply_elementwise(_color)

### Find Positive Correlations

This function will return the positively correlated variables relative to the response variable of choice, in descending order.

**Input:** Dataframe, String name of response variable <br />
**Output:** Series of positively correlated features relative to the response

In [14]:
def pos_cor(df, response):
    """Return the features positively correlated with ``response``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the response and the candidate features.
    response : str
        Column name of the response variable.

    Returns
    -------
    pandas.Series
        Correlation of each feature with ``response``, restricted to values
        greater than 0 and sorted in descending order. The response itself
        is never included. The Series is unnamed.
    """
    # Exclude the response by label rather than by position: after sorting, a
    # feature that is perfectly correlated with the response ties with it at
    # the top, so dropping "the first row" could remove that feature and
    # leave the response in the output.
    others = df.corr()[response].drop(response)
    ranked = others.sort_values(ascending=False)
    positives = ranked[ranked > 0]
    # The result has always been an unnamed Series; keep it that way.
    positives.name = None
    return positives

### Find Negative Correlations

This function will return the negatively correlated variables relative to the response variable of choice, in ascending order.

**Input:** Dataframe, String name of response variable <br />
**Output:** Series of negatively correlated features relative to the response

In [15]:
def neg_cor(df, response):
    """Return the features negatively correlated with ``response``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the response and the candidate features.
    response : str
        Column name of the response variable.

    Returns
    -------
    pandas.Series
        Correlation of each feature with ``response``, restricted to values
        below 0 and sorted in ascending order (most negative first).
    """
    ranked = df.corr()[response].sort_values(ascending=False)
    # Skip the top-ranked entry, which is expected to be the response itself.
    candidates = ranked[1:]

    negatives = {}
    for feature, coef in candidates.items():
        if coef < 0:
            negatives[feature] = coef

    return pd.Series(negatives).sort_values(ascending=True)

### Model Metrics

This function prints an aggregate of model performance metrics:
- Coefficient of Determination ($R^2$)
- Mean Absolute Error ($MAE$)
- Root Mean Squared Error ($RMSE$)
- Mean Squared Error ($MSE$)

This function can be used for both testing and training metrics. Ensure that the response and prediction data are of the same dimension.

**Input:** Response data, prediction data, boolean that indicates training or testing data <br />
**Output:** Printout of all metrics, distinguished as training or test (aka "unseen") data

In [16]:
def metrics(y, preds, is_train=False):
    """Print regression performance metrics for a set of predictions.

    Relies on ``r2_score``, ``mean_squared_error`` and ``mean_absolute_error``
    from ``sklearn.metrics`` and on ``numpy`` (``np``), all imported in the
    notebook's import cell.

    Parameters
    ----------
    y : array-like
        True response values.
    preds : array-like
        Predicted values, same dimension as ``y``.
    is_train : bool, default False
        When True the printout is headed "Training Data Performance",
        otherwise "Unseen Data Performance".

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    r2 = r2_score(y, preds)
    mse = mean_squared_error(y, preds)
    # Derive RMSE from MSE rather than passing `squared=False`, which newer
    # scikit-learn releases deprecated and later removed.
    # NOTE(review): equivalent for a single-target response; confirm if this
    # is ever used with multi-output targets.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, preds)

    header = 'Training Data Performance' if is_train else 'Unseen Data Performance'
    print(f'''
        {header}
        ------
        Coefficient of Determination: {r2}
        Mean Absolute Error: {mae}
        Root Mean Squared Error: {rmse}
        Mean Squared Error: {mse}
        ''')

### API Call

In [19]:
# Query parameters for a Pushshift submission search.
# NOTE(review): no function visible in this notebook reads `params`
# (api_call builds its own query); confirm whether it is still needed.
params = {
    'subreddit': 'lifehacks',  # key was misspelled 'subbreddit'
    'size': 500,
    'sort': 'desc',
    'sort_type': 'created_utc',
    'metadata': True,
    'is_video': False,
    'after': '',
    'before': ''
}

In [18]:
def api_call(subreddit, size=25, before='', timeout=30):
    """Query the Pushshift submission-search endpoint for a subreddit.

    Parameters
    ----------
    subreddit : str
        Value sent as the ``subreddit`` query parameter.
    size : int, default 25
        Value sent as the ``size`` query parameter.
    before : str or int, default ''
        Value sent as the ``before`` query parameter (empty by default).
    timeout : float, default 30
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on an unresponsive host.

    Returns
    -------
    object or str
        The value stored under the ``'data'`` key of the JSON response when
        the HTTP status is 200; otherwise the string
        ``"Error: API call failed."``.

    Raises
    ------
    requests.RequestException
        On connection failures or when ``timeout`` elapses.
    """
    url = "https://api.pushshift.io/reddit/search/submission/"
    # Let requests build the query string so values are URL-encoded instead
    # of being interpolated into the URL verbatim.
    query = {
        'subreddit': subreddit,
        'size': size,
        'metadata': True,
        'is_video': False,
        'before': before,
    }
    req = requests.get(url, params=query, timeout=timeout)

    if req.status_code != 200:
        return "Error: API call failed."
    return req.json()['data']

## Data Wrangling

In [21]:
def data_wrangling(dict_, keys, api_call):
    """Collect selected fields from each record into per-key lists.

    For every record in ``api_call`` and every key in ``keys``, the record's
    value for that key is appended to ``dict_[key]``. When the lookup fails,
    ``None`` is appended instead and a message is added to the error log, so
    all lists stay the same length.

    Parameters
    ----------
    dict_ : dict
        Maps each key in ``keys`` to a list. It is mutated in place.
    keys : iterable
        Keys to extract from every record.
    api_call : sequence
        Records to read from, indexed as ``api_call[i][key]``.

    Returns
    -------
    dict
        ``{'data': dict_, 'error_log': error_log}`` where ``error_log`` is a
        list with one message per failed lookup.
    """
    error_log = []  # one message per (record index, key) that could not be read
    for i, record in enumerate(api_call):
        for key in keys:
            try:
                value = record[key]
            # Catch only lookup/type failures (missing key, record that is
            # not subscriptable by `key`). A bare `except:` would also
            # swallow KeyboardInterrupt and SystemExit.
            except (LookupError, TypeError):
                error_log.append(f"Error on index: {i}\nkey \"{key}\" not found.")
                value = None  # if there is no data, set it to null
            dict_[key].append(value)
    return {'data': dict_, 'error_log': error_log}

## Model Instantiation and Performance

In [6]:
def make_model(estimator, X_train, X_test, y_train, y_test):
    """Fit an estimator and print its training and test performance.

    The estimator is fitted on the training split, predictions are made on
    the test split, and a report is printed with ``estimator.score`` on both
    splits plus balanced accuracy, recall, specificity, precision and F1 on
    the test predictions.

    Parameters
    ----------
    estimator : object
        Must provide ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.
        It is fitted in place.
    X_train, X_test : array-like
        Feature data for the training and test splits.
    y_train, y_test : array-like
        Labels for the training and test splits.
        NOTE(review): specificity is computed as recall with ``pos_label=0``,
        which presumes binary labels encoded as 0/1; confirm against callers.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    estimator.fit(X_train, y_train)
    preds = estimator.predict(X_test)

    train_score = estimator.score(X_train, y_train)
    # Score the held-out split here; this previously re-scored the training
    # data, so the "test" figure simply repeated the training figure.
    test_score = estimator.score(X_test, y_test)

    print(f'''
        Training Accuracy Score: {train_score}
        Test Accuracy Score: {test_score}
        
        --- Performance on unseen data ----
        Balance Accuracy: {balanced_accuracy_score(y_test, preds)}
        Recall (Sensitivity): {recall_score(y_test, preds)}
        Specificity: {recall_score(y_test, preds, pos_label=0)}
        Precision: {precision_score(y_test, preds)}
        F1 Score: {f1_score(y_test, preds)}
        ''')