<a id="TableOfContents"></a>
    <h1 style='text-align: center'>
        <b>
            Table of Contents:
        </b></h1>
<ul>
<li><a href="#imports">Imports</a></li>
<li><a href="#summarize">Summarize</a></li>
<li><a href="#univariate">Univariate</a></li>
<li><a href="#bivariate">Bivariate</a></li>
<li><a href="#multivariate">Multivariate</a></li>
<li><a href="#stats_tests">Hypothesis Testing</a></li>
<li><a href="#clustering">Clustering</a></li>
<li><a href="#time_series">Time Series Analysis</a></li>
<li><a href="#anomaly_detection">Anomaly Detection</a></li>
</ul>

<a id="imports"></a>
    <h1 style='text-align: center'>
        <b>
            Imports
        </b></h1>
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.cluster import KMeans

In [None]:
def get_age_visual(train):
    '''
    Actions: plots the age density of those who had a stroke against those
    who did not have a stroke, on a single set of axes.

    Parameters:
        train: DataFrame with a 'stroke' column (rows equal to 1 are treated
               as stroke, every other value as no stroke) and an 'age' column.

    Returns:
        None. The figure is displayed with plt.show().

    Modules:
        1. import seaborn as sns
        2. import matplotlib.pyplot as plt
    '''

    # getting two dataframes, one with only people who had a stroke and
    # one with only people who did not
    stroke = train[train['stroke'] == 1]
    no_stroke = train[train['stroke'] != 1]

    # plotting both distributions on the same figure; `fill` replaces the
    # deprecated `shade` keyword of sns.kdeplot
    sns.kdeplot(stroke['age'], fill=True, color="r", label='Stroke')
    sns.kdeplot(no_stroke['age'], fill=True, color="b", label='No Stroke')
    plt.xlabel('Age')
    plt.title('Stroke Risk Higher with Age')
    plt.legend(loc='upper left')
    plt.show()

<a id="summarize"></a>
    <h1 style='text-align: center'>
        <b>
            Summarize
        </b></h1>
<li><a href='#TableOfContents'>Table of Contents</a></li>

<a id="univariate"></a>
    <h1 style='text-align: center'>
        <b>
            Univariate
        </b></h1>
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [None]:
# Histogram of every column of df, laid out on a two-row grid.
# NOTE(review): df is not defined in the visible cells; it must exist before
# this cell runs.
cols = df.columns.to_list()
# half the columns per row plus one spare slot, so every subplot index fits
n_grid_cols = round(len(cols) / 2) + 1
plt.figure(figsize=(20,20))
for plot_number, col in enumerate(cols, start=1):
    # subplot positions are 1-based, hence start=1
    plt.subplot(2, n_grid_cols, plot_number)
    plt.title(f'Distribution of {col}')
    df[col].hist(bins=10)
    plt.grid(False)
plt.show()

In [None]:
# Box plot of every column listed in box_cols, laid out on a two-row grid.
# NOTE(review): df and box_cols are not defined in the visible cells; both
# must exist before this cell runs.
# The grid is sized from box_cols itself (the columns actually drawn) and
# rounded up, so there is always a subplot slot for every plot.
n_grid_cols = max(1, (len(box_cols) + 1) // 2)
plt.figure(figsize=(20,10))
for plot_number, col in enumerate(box_cols, start=1):
    # subplot positions are 1-based, hence start=1
    plt.subplot(2, n_grid_cols, plot_number)
    plt.title(f'Distribution of {col}')
    plt.grid(False)
    sns.boxplot(data=df[col])
plt.show()

<a id="bivariate"></a>
    <h1 style='text-align: center'>
        <b>
            Bivariate
        </b></h1>
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [None]:
def get_corr_heatmap(train):
    '''
    This function will display an annotated heatmap of the pairwise
    correlations between the numeric (and boolean) variables in our dataset.

    Parameters:
        train: pandas DataFrame to analyse. Columns of any other dtype
               (strings, categories, datetimes, ...) are left out.

    Returns:
        None. The figure is displayed with plt.show().
    '''
    # keep only the columns a correlation can be computed for; on
    # pandas >= 2.0 DataFrame.corr() no longer drops the others itself and
    # raises when it meets e.g. a string column
    numeric = train.select_dtypes(include=['number', 'bool'])
    # get the correlation values
    corr_matrix = numeric.corr()
    # create a plot
    plt.figure(figsize=(10,10))
    # plot a heatmap of the correlations
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    # add a title
    plt.title('Heat Map of Correlation')
    # display the plot
    plt.show()

In [None]:
def get_pairplot(df, random_state=None):
    '''
    This function will take in a dataFrame, and will display a pairplot of the variable
    relationships along with a regression line for each pair

    Parameters:
        df: pandas DataFrame to plot.
        random_state: optional seed forwarded to DataFrame.sample so the
                      500-row sample (and therefore the plot) is reproducible.
                      The default None draws a different sample on every call.

    Returns:
        None. The figure is displayed with plt.show().
    '''
    # take a sample of the dataFrame in order to cut down computing time
    if len(df) > 500:
        sample = df.sample(500, random_state=random_state)
    else:
        sample = df
    # create a pairplot; sns.pairplot is figure-level and builds its own
    # figure, so no plt.figure() call is made beforehand (it would only leave
    # an extra, empty figure behind)
    sns.pairplot(data=sample, corner=True, kind='reg', plot_kws={'color': 'blue'})

    plt.show()

<a id="multivariate"></a>
    <h1 style='text-align: center'>
        <b>
            Multivariate
        </b></h1>
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [None]:
def get_plots_multivar_scatter_by_target(train, target):
    '''
    This function will display one scatter plot for every ordered pair of
    non-target columns in train, with the points coloured by the target.

    Parameters:
        train: pandas DataFrame holding the features and the target column.
        target: label of the column used for the hue.

    Returns:
        None. Each figure is displayed with plt.show().

    Note: every pair is drawn twice, once as (x, y) and once as (y, x).
    '''
    # take a sample of at most 500 rows to cut down plotting time; the sample
    # is drawn once so every plot shows the same rows, and frames with fewer
    # than 500 rows are used whole (sample(500) would raise on them)
    if len(train) > 500:
        sample = train.sample(500)
    else:
        sample = train
    for x_col in train.drop(columns=target).columns:
        for y_col in train.drop(columns=[target, x_col]).columns:
            sns.scatterplot(data=sample, x=x_col, y=y_col,
                            hue=target, palette='magma')
            plt.show()

<a id="stats_tests"></a>
    <h1 style='text-align: center'>
        <b>
            Hypothesis Testing
        </b></h1>
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [None]:
def check_hypothesis_correlation(data, x, y, α=0.05, test=stats.pearsonr):
    '''
    Check whether the x and y variables of the provided dataset (data) are
    statistically correlated, and print the conclusion.

    The check defaults to a Pearson r test; pass any other callable that
    takes the two variables and returns a (coefficient, p-value) pair through
    the `test` keyword to change it. α is the significance level the p-value
    is compared against.
    '''
    divider = '_______________________________________________________'
    # run the requested statistical test on variables x and y from data
    coefficient, p_value = test(data[x], data[y])

    # anything other than p < alpha fails to reject the null hypothesis
    if not p_value < α:
        print(
            f"Since the p-value is greater than or equal to {α}, \n"
            "we fail to reject the null hypothesis and conclude \n"
            f"that there is insufficient evidence to suggest a correlation between {x} and {y}."
        )
        print(divider)
        return

    # p < alpha: reject the null hypothesis and report the statistics
    print(
        f"Since the p-value is less than {α}, \n"
        f"we can reject the null hypothesis and conclude that {x} and {y} are correlated."
    )
    print(
        f"The correlation coefficient between {x} and {y} "
        f"is {coefficient:.2f} with a p-value of {p_value:.4f}"
    )
    print(divider)

<a id="clustering"></a>
    <h1 style='text-align: center'>
        <b>
            Clustering
        </b></h1>
<li><a href='#TableOfContents'>Table of Contents</a></li>

In [None]:
def best_kmeans(data, k_max, random_state=None):
    '''
    EXAMPLE USAGE:
    data = scaled_train[['alcohol', 'quality']]
    best_kmeans(data,k_max=10)

    This function will produce a single elbow graph: the KMeans inertia for
    every number of clusters k from 1 through k_max (inclusive).

    Parameters:
        data: the observations to cluster, passed as-is to KMeans.fit.
        k_max: largest number of clusters to try.
        random_state: optional seed forwarded to KMeans so the curve is
                      reproducible. The default None keeps KMeans unseeded.

    Returns:
        None. The figure is displayed with plt.show().
    '''
    # create empty list variables to store results
    ks = []
    inertia = []
    # cycle through our desired amount of k's, k_max included
    for k in range(1, k_max + 1):
        # create a KMeans object with current k
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        # fit the kmeans object to our data
        kmeans.fit(data)
        # store the current k and the inertia it produced
        ks.append(k)
        inertia.append(kmeans.inertia_)
    # create one figure after the loop; creating it per iteration would leave
    # one partially drawn figure for every k
    plt.figure(figsize=(10,5))
    # plot every k against its inertia
    plt.plot(ks, inertia, 'o-')
    # add axis labels
    plt.xlabel('means')
    plt.ylabel('inertia')
    # show gridlines
    plt.grid(True)
    # display the plot
    plt.show()

Create clusters

In [None]:
def apply_kmeans(df, k, random_state=None):
    '''
    This function will cluster the rows of df with KMeans and store the
    cluster labels in a new column named k_means_<k>.

    Parameters:
        df: pandas DataFrame whose columns are the clustering features.
            It is modified in place: the label column is added to it.
        k: number of clusters.
        random_state: optional seed forwarded to KMeans so the labels are
                      reproducible. The default None keeps KMeans unseeded.

    Returns:
        The same df object, with the k_means_<k> column added (or replaced).

    Columns whose name starts with 'k_means_' are left out of the fit, so
    re-running the function on the same frame does not feed earlier cluster
    labels back in as features.
    '''
    # leave out label columns written by earlier calls
    label_cols = [col for col in df.columns if str(col).startswith('k_means_')]
    features = df.drop(columns=label_cols)
    # create a kmeans object with k clusters
    kmeans = KMeans(n_clusters=k, random_state=random_state)
    # fit the kmeans object on the feature columns
    kmeans.fit(features)
    # store the cluster labels as a new column
    df[f'k_means_{k}'] = kmeans.labels_
    # return the modified dataframe
    return df

<a id="time_series"></a>
    <h1 style='text-align: center'>
        <b>
            Time Series Analysis
        </b></h1>
<li><a href='#TableOfContents'>Table of Contents</a></li>

<a id="anomaly_detection"></a>
    <h1 style='text-align: center'>
        <b>
            Anomaly Detection
        </b></h1>
<li><a href='#TableOfContents'>Table of Contents</a></li>