In [26]:
import numpy as np
import pandas as pd
import os
import env

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

### Acquire data from the customers table in the mall_customers database.

In [3]:
def acquire_mall_customers():
    file = 'mall_customers.csv'
    if os.path.exists(file):
        # check if local csv file of the data exists
        print('opening data from local file')
        df = pd.read_csv(file, index_col=0)
    else:
        # retrieve data from sql
        print('local file not found')
        print('retrieving data via SQL connection')
        query = 'SELECT * FROM customers;'
        connection = env.get_db_url('mall_customers')
        df = pd.read_sql(query, connection)
        df.to_csv(file)
        
    return df

In [32]:
df = acquire_mall_customers()

opening data from local file


In [33]:
df.head()

Unnamed: 0,customer_id,gender,age,annual_income,spending_score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


### Summarize the data (include distributions and descriptive statistics).

In [22]:
def nulls_by_col(df):
    '''
    This function takes in a dataframe 
    and finds the number of missing values
    it returns a new dataframe with quantity and percent of missing values
    '''
    num_missing = df.isnull().sum()
    rows = df.shape[0]
    percent_missing = num_missing / rows * 100
    cols_missing = pd.DataFrame({'num_rows_missing': num_missing, 'percent_rows_missing': percent_missing})
    return cols_missing.sort_values(by='num_rows_missing', ascending=False)

def nulls_by_row(df):
    '''
    This function takes in a dataframe 
    and finds the number of missing values in a row
    it returns a new dataframe with quantity and percent of missing values
    '''
    num_missing = df.isnull().sum(axis=1)
    percent_miss = num_missing / df.shape[1] * 100
    rows_missing = pd.DataFrame({'num_cols_missing': num_missing, 'percent_cols_missing': percent_miss})
    rows_missing = df.merge(rows_missing,
                        left_index=True,
                        right_index=True)[['num_cols_missing', 'percent_cols_missing']]
    return rows_missing.sort_values(by='num_cols_missing', ascending=False)

def summarize(df):
    '''
    summarize will take in a single argument (a pandas dataframe) 
    and output to console various statistics on said dataframe, including:
    # .head()
    # .info()
    # .describe()
    # .value_counts()
    # observation of nulls in the dataframe
    '''
    print('SUMMARY REPORT')
    print('=====================================================\n\n')
    print('Dataframe head: ')
    print(df.head(3))
    print('=====================================================\n\n')
    print('Dataframe info: ')
    print(df.info())
    print('=====================================================\n\n')
    print('Dataframe Description: ')
    print(df.describe())
    num_cols = [col for col in df.columns if df[col].dtype != 'O']
    cat_cols = [col for col in df.columns if col not in num_cols]
    print('=====================================================')
    print('DataFrame value counts: ')
    for col in df.columns:
        if col in cat_cols:
            print(df[col].value_counts(), '\n')
        else:
            print(df[col].value_counts(bins=10, sort=False), '\n')
    print('=====================================================')
    print('nulls in dataframe by column: ')
    print(nulls_by_col(df))
    print('=====================================================')
    print('nulls in dataframe by row: ')
    print(nulls_by_row(df))
    print('=====================================================')

In [34]:
summarize(df)

SUMMARY REPORT


Dataframe head: 
   customer_id  gender  age  annual_income  spending_score
0            1    Male   19             15              39
1            2    Male   21             15              81
2            3  Female   20             16               6


Dataframe info: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   customer_id     200 non-null    int64 
 1   gender          200 non-null    object
 2   age             200 non-null    int64 
 3   annual_income   200 non-null    int64 
 4   spending_score  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 9.4+ KB
None


Dataframe Description: 
       customer_id         age  annual_income  spending_score
count   200.000000  200.000000     200.000000      200.000000
mean    100.500000   38.850000      60.560000       50.200000
std      57.879185   13.969007  

### Detect outliers using IQR.

In [8]:
def detect_outliers(df, cols, k=1.5):
    '''
    This function takes in a dataframe, column, and k
    to detect and handle outlier using IQR rule
    '''
    for col in df[cols]:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        upper_bound =  q3 + k * iqr
        lower_bound =  q1 - k * iqr     
        df = df[(df[col] > upper_bound) | (df[col] < lower_bound)]
    return df

In [9]:
def remove_outliers(df, cols, k=1.5):
    '''
    This function takes in a dataframe, column, and k
    to detect and handle outlier using IQR rule
    '''
    for col in df[cols]:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        upper_bound =  q3 + k * iqr
        lower_bound =  q1 - k * iqr     
        df = df[(df[col] < upper_bound) & (df[col] > lower_bound)]
    return df

In [35]:
detect_outliers(df, df.columns)

Unnamed: 0,customer_id,gender,age,annual_income,spending_score


### Encode categorical columns using a one hot encoder (pd.get_dummies).

In [37]:
def encode_dummies(df):
    df = pd.get_dummies(df, columns=['gender'])
    return df

In [40]:
df = encode_dummies(df)
df

Unnamed: 0,customer_id,age,annual_income,spending_score,gender_Female,gender_Male
0,1,19,15,39,0,1
1,2,21,15,81,0,1
2,3,20,16,6,1,0
3,4,23,16,77,1,0
4,5,31,17,40,1,0
...,...,...,...,...,...,...
195,196,35,120,79,1,0
196,197,45,126,28,1,0
197,198,32,126,74,0,1
198,199,32,137,18,0,1


### Split data into train, validate, and test.

In [41]:
def split_data(df):
    train_val, test = train_test_split(df, train_size=0.8, random_state=123)
    train, validate = train_test_split(train_val, train_size=0.7, random_state=123)
    return train, validate, test

In [42]:
train, validate, test = split_data(df)

### Handles missing values.

In [21]:
df.isna().sum()

customer_id       0
age               0
annual_income     0
spending_score    0
gender_Female     0
gender_Male       0
dtype: int64

There are no nulls

### Scaling

In [25]:
scale_cols = ['age', 'annual_income']

In [27]:
def scale_data(train, 
               validate, 
               test, 
               columns_to_scale,
               scaler=MinMaxScaler(),
               return_scaler=False):
    '''
    Scales the 3 data splits. 
    Takes in train, validate, and test data splits and returns their scaled counterparts.
    If return_scalar is True, the scaler object will be returned as well
    '''
    # make copies of our original data so we dont gronk up anything
    train_scaled = train.copy()
    validate_scaled = validate.copy()
    test_scaled = test.copy()
    
    #     fit the thing
    scaler.fit(train[columns_to_scale])
    # applying the scaler:
    train_scaled[columns_to_scale] = pd.DataFrame(
        scaler.transform(train[columns_to_scale]),
        columns=train[columns_to_scale].columns.values, 
        index = train.index)
                                                  
    validate_scaled[columns_to_scale] = pd.DataFrame(
        scaler.transform(validate[columns_to_scale]),
        columns=validate[columns_to_scale].columns.values).set_index(
        [validate.index.values])
    
    test_scaled[columns_to_scale] = pd.DataFrame(
        scaler.transform(test[columns_to_scale]),
        columns=test[columns_to_scale].columns.values).set_index(
        [test.index.values])
    
    if return_scaler:
        return scaler, train_scaled, validate_scaled, test_scaled
    else:
        return train_scaled, validate_scaled, test_scaled

In [28]:
train_scaled, val_scaled, test_scaled = scale_data(train, validate, test, scale_cols)

In [29]:
train_scaled

Unnamed: 0,customer_id,gender,age,annual_income,spending_score
123,124,Male,0.403846,0.442623,91
76,77,Female,0.519231,0.319672,53
171,172,Male,0.192308,0.590164,75
10,11,Male,0.942308,0.032787,14
35,36,Female,0.057692,0.147541,81
...,...,...,...,...,...
175,176,Female,0.230769,0.598361,86
101,102,Female,0.596154,0.385246,48
148,149,Female,0.307692,0.516393,22
99,100,Male,0.038462,0.377049,49
