In [1]:
# fetches the data
import acquire
# credentials file to access the data
import env
# Imports functions necessary to run visuals and hides unnecessary code
import wrangle_zillow

# coding 
import math
import numpy as np
import seaborn as sns
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
from pydataset import data
import scipy.stats
import scipy
import os

# needed for modeling
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.metrics import explained_variance_score
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
 

### Mall Customers
    Notebook

1) Acquire data from the customers table in the mall_customers database.

2) Summarize the data (include distributions and descriptive statistics).

3) Detect outliers using IQR.

4) Split data into train, validate, and test.

5) Encode categorical columns using a one hot encoder (pd.get_dummies).

6) Handle missing values.

7) Scaling

    Encapsulate your work in a wrangle_mall.py python module.

*** 

1) Acquire data from the customers table in the mall_customers database.

In [2]:
def acquire():
    """Read the ``customers`` table of the ``mall_customers`` database.

    Connection details (user, password, host) are taken from the local
    ``env`` module. The ``customer_id`` column is used as the index.

    Returns
    -------
    pandas.DataFrame
        Every column of the ``customers`` table, indexed by ``customer_id``.
    """
    # NOTE: this definition rebinds the name `acquire`, which the imports
    # cell also binds to a module of the same name.
    database = 'mall_customers'
    url = f'mysql+pymysql://{env.user}:{env.password}@{env.host}/{database}'
    query = 'SELECT * FROM customers'
    return pd.read_sql(query, url, index_col='customer_id')

In [3]:
# Pull the customers table into the working DataFrame used by the cells below.
df = acquire()

***

2) Summarize the data (include distributions and descriptive statistics).

In [4]:
def summarize(df):
    """Print a quick overview of ``df``.

    The report covers, in order: shape, ``DataFrame.info()``, descriptive
    statistics, null counts per column, and a tally of how many rows share
    each (number missing, share missing) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print('--- Shape: {}'.format(df.shape))

    print('--- Info:')
    # DataFrame.info() prints its own report and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" line.
    df.info()

    # Descriptive statistics.
    print('--- Descriptions:', df.describe())

    # Number of null values in each column.
    print('--- Nulls by Column:', df.isnull().sum())

    # Per-row missing count and share, tallied by how many rows share each pair.
    missing = df.isna()
    nulls_by_row = pd.concat(
        [
            missing.sum(axis=1).rename('n_missing'),
            missing.mean(axis=1).rename('percent_missing'),
        ],
        axis=1,
    ).value_counts().sort_index()
    print('nulls by row:', nulls_by_row)
# Print the summary report for the customers DataFrame.
summarize(df)

--- Shape: (200, 4)
--- Info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 1 to 200
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   gender          200 non-null    object
 1   age             200 non-null    int64 
 2   annual_income   200 non-null    int64 
 3   spending_score  200 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 7.8+ KB
None
--- Descriptions:               age  annual_income  spending_score
count  200.000000     200.000000      200.000000
mean    38.850000      60.560000       50.200000
std     13.969007      26.264721       25.823522
min     18.000000      15.000000        1.000000
25%     28.750000      41.500000       34.750000
50%     36.000000      61.500000       50.000000
75%     49.000000      78.000000       73.000000
max     70.000000     137.000000       99.000000
--- Nulls by Column: gender            0
age               0
annual_income     0
spendin

***

3) Detect outliers using IQR.

In [5]:
def nulls_by_columns(df):
    """Report how much data is missing in each column of ``df``.

    Returns a DataFrame indexed by the column names of ``df`` with two
    columns: ``count`` (number of missing values) and ``percent`` (fraction
    of rows missing, between 0 and 1).
    """
    missing_mask = df.isna()
    null_count = missing_mask.sum().rename('count')
    null_share = missing_mask.mean().rename('percent')
    return pd.concat([null_count, null_share], axis=1)
# List the columns from most to least missing (by fraction of null values).
nulls_by_columns(df).sort_values(by= 'percent', ascending=False)

Unnamed: 0,count,percent
gender,0,0.0
age,0,0.0
annual_income,0,0.0
spending_score,0,0.0


In [6]:
def outlier_function(df, cols, k):
    """Drop rows that hold an IQR outlier in any of the given columns.

    For each column in ``cols`` the bounds ``q1 - k * iqr`` and
    ``q3 + k * iqr`` are computed and only rows strictly inside them are
    kept. Columns are processed in order, so the quartiles of later columns
    are computed on the rows that survived the earlier ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : str or iterable of str
        Numeric column(s) to check.
    k : float
        IQR multiplier (1.5 is the usual choice).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` without outliers in any of ``cols``.
    """
    # A bare string would otherwise be iterated value by value / char by char.
    if isinstance(cols, str):
        cols = [cols]

    for col in cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        upper_bound = q3 + k * iqr
        lower_bound = q1 - k * iqr
        df = df[(df[col] < upper_bound) & (df[col] > lower_bound)]

    # Return only after every column has been filtered (previously the return
    # sat inside the loop, so only the first column was ever processed).
    return df

def handle_missing_value(df, prop_required_column, prop_required_row):
    """Drop rows, then columns, that do not hold enough non-null values.

    Parameters
    ----------
    df : pandas.DataFrame
    prop_required_column : float
        Fraction of the original row count a column must have as non-null
        values to be kept.
    prop_required_row : float
        Fraction of the column count a row must have as non-null values to
        be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the sparse rows and columns removed.
    """
    # Both thresholds are derived from the shape of the frame as passed in.
    min_non_null_per_column = round(df.shape[0] * prop_required_column)
    min_non_null_per_row = round(df.shape[1] * prop_required_row)

    # Rows are filtered first; columns are then judged on the remaining rows.
    return (
        df.dropna(axis=0, thresh=min_non_null_per_row)
          .dropna(axis=1, thresh=min_non_null_per_column)
    )

In [8]:
#outlier_function(df, df.columns, k = 1.5)

4) Split data into train, validate, and test.

In [10]:
def get_exploration_data():
    """Acquire the data, drop sparse rows/columns, split it, and return the train split.

    Prints the frame shape before and after the null handling (``split``
    prints the size of each split as well).

    Returns
    -------
    The ``train`` portion produced by ``split``; validate and test are
    discarded.
    """
    df = acquire()
    print('Before dropping nulls, %d rows, %d cols' % df.shape)
    # Pass the acquired frame (`df`); the previous `af` was an undefined name.
    df = handle_missing_values(df, prop_required_column=.5, prop_required_row=.5)
    print('After dropping nulls. %d rows. %d cols' % df.shape)
    train, _validate, _test = split(df)
    return train
##############################################################################################
def get_modeling_data(scale_data=False):
    """Run the full preparation pipeline and return the modeling splits.

    Steps: acquire -> drop sparse rows/columns -> remove IQR outliers from
    ``age``, ``spending_score`` and ``annual_income`` (k = 1.5) -> encode
    ``gender`` -> split. The frame shape is printed around the null and
    outlier steps.

    Parameters
    ----------
    scale_data : bool, default False
        When True, the splits are passed through ``scale`` before returning.

    Returns
    -------
    tuple
        ``(train, validate, test)`` when ``scale_data`` is False; otherwise
        whatever ``scale`` returns, i.e.
        ``(scaler, train_scaled, validate_scaled, test_scaled)``.
    """
    df = acquire()
    print('Before dropping nulls, %d rows, %d cols' % df.shape)
    df = handle_missing_values(df, prop_required_column=.5, prop_required_row=.5)
    print('After dropping nulls, %d rows, %d cols' % df.shape)

    print()

    print('Before removing outliers, %d rows, %d cols' % df.shape)
    # outlier_function returns a filtered frame; keep the result.
    df = outlier_function(df, ['age', 'spending_score', 'annual_income'], 1.5)
    print('After removing outliers, %d rows, %d cols' % df.shape)
    print()

    df = one_hot_encode(df)

    train, validate, test = split(df)
    if scale_data:
        return scale(train, validate, test)
    return train, validate, test
##############################################################################################
def split(df):
    """Split ``df`` into train, validate and test sets and print their sizes.

    15% of the rows are held out as test; 20% of the remainder becomes
    validate. Both splits use ``random_state=13``.

    Returns
    -------
    tuple
        ``(train, validate, test)``.
    """
    train_and_validate, test = train_test_split(df, random_state=13, test_size=.15)
    # Split the remaining rows (`train_and_validate`); the former expression
    # `train and validate` referenced names that did not exist yet.
    train, validate = train_test_split(train_and_validate, random_state=13, test_size=.2)
    print('Train: %d rows, %d cols' % train.shape)
    print('Validate: %d rows, %d cols' % validate.shape)
    print('Test: %d rows, %d cols' % test.shape)

    return train, validate, test
##############################################################################################
def scale(train, validate, test):
    """Min-max scale ``age``, ``spending_score`` and ``annual_income`` in each split.

    The scaler is fit on ``train`` only and then applied to all three
    splits. The inputs are left untouched; scaled copies are returned.

    Returns
    -------
    tuple
        ``(scaler, train_scaled, validate_scaled, test_scaled)``.
    """
    features = ['age', 'spending_score', 'annual_income']

    scaler = MinMaxScaler()
    scaler.fit(train[features])

    scaled_frames = []
    for frame in (train, validate, test):
        scaled = frame.copy()
        scaled[features] = scaler.transform(frame[features])
        scaled_frames.append(scaled)

    train_scaled, validate_scaled, test_scaled = scaled_frames
    return scaler, train_scaled, validate_scaled, test_scaled
##############################################################################################
def one_hot_encode(df):
    """Replace the ``gender`` column with a boolean ``is_female`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``gender`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``gender`` and with ``is_female`` set to True
        where ``gender == 'Female'``.
    """
    # assign() builds a new frame, so the caller's DataFrame does not gain
    # an `is_female` column as a side effect.
    return (
        df.assign(is_female=df['gender'] == 'Female')
          .drop(columns='gender')
    )
##############################################################################################
def handle_missing_values(df, prop_required_column, prop_required_row):
    """Remove rows and then columns that are too sparse.

    A row is kept when it has at least ``round(n_columns * prop_required_row)``
    non-null values; a column is kept when it has at least
    ``round(n_rows * prop_required_column)`` non-null values, where both
    counts come from the frame as passed in.

    Returns a new DataFrame; the input is not modified.
    """
    column_threshold = round(df.shape[0] * prop_required_column)
    row_threshold = round(df.shape[1] * prop_required_row)

    # Sparse rows go first, then columns are checked on what is left.
    rows_kept = df.dropna(axis=0, thresh=row_threshold)
    return rows_kept.dropna(axis=1, thresh=column_threshold)
##############################################################################################
def handle_outliers(df, cols=None, k=1.5):
    """Drop rows that hold an IQR outlier in any of the selected columns.

    Bounds (``q1 - k * iqr`` and ``q3 + k * iqr``) are computed for every
    selected column on the frame as passed in, before any row is removed;
    rows are then kept only when they fall strictly inside the bounds of
    every selected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : str, iterable of str, or None
        Columns to check. ``None`` selects every column of ``df``. Columns
        with ``object`` dtype are always skipped.
    k : float, default 1.5
        IQR multiplier.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` without outliers in the selected columns.
    """
    # Honor the requested columns (previously `cols` was ignored).
    if cols is None:
        candidates = list(df.columns)
    elif isinstance(cols, str):
        candidates = [cols]
    else:
        candidates = list(cols)

    # Quantiles are only computed for non-object columns.
    non_object_cols = [col for col in candidates if df[col].dtype != 'object']

    # First pass: compute every column's bounds on the unfiltered frame.
    bounds_dict = {}
    for col in non_object_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        bounds_dict[col] = {
            'upper_bound': q3 + k * iqr,
            'lower_bound': q1 - k * iqr,
        }

    # Second pass: remove rows with an outlier in any selected column.
    for col in non_object_cols:
        col_upper_bound = bounds_dict[col]['upper_bound']
        col_lower_bound = bounds_dict[col]['lower_bound']
        df = df[(df[col] < col_upper_bound) & (df[col] > col_lower_bound)]

    return df
##############################################################################################
def split(df):
    """Split ``df`` into train, validate and test sets and report their sizes.

    15% of the rows are held out as test; 20% of the remaining rows become
    validate. Both splits use ``random_state=13``.

    Returns
    -------
    tuple
        ``(train, validate, test)``.
    """
    remainder, test = train_test_split(df, test_size=.15, random_state=13)
    train, validate = train_test_split(remainder, test_size=.2, random_state=13)

    # Print the (rows, cols) of each split, in train / validate / test order.
    for message, subset in (
        ('Train: %d rows, %d cols', train),
        ('Validate: %d rows, %d cols', validate),
        ('Test: %d rows, %d cols', test),
    ):
        print(message % subset.shape)

    return train, validate, test
##############################################################################################
def scale(train, validate, test, columns_to_scale=None):
    """Min-max scale the chosen columns of each split.

    The scaler is fit on ``train`` only and then applied to all three
    splits. The inputs are left untouched; scaled copies are returned.

    Parameters
    ----------
    train, validate, test : pandas.DataFrame
        The three data splits.
    columns_to_scale : iterable of str, optional
        Columns to scale. Defaults to the mall-customer numeric columns
        ``age``, ``spending_score`` and ``annual_income``. The former
        hard-coded empty list left the scaler with no features to fit.

    Returns
    -------
    tuple
        ``(scaler, train_scaled, validate_scaled, test_scaled)``.
    """
    if columns_to_scale is None:
        columns_to_scale = ['age', 'spending_score', 'annual_income']
    else:
        columns_to_scale = list(columns_to_scale)

    train_scaled = train.copy()
    validate_scaled = validate.copy()
    test_scaled = test.copy()

    scaler = MinMaxScaler()
    scaler.fit(train[columns_to_scale])

    train_scaled[columns_to_scale] = scaler.transform(train[columns_to_scale])
    validate_scaled[columns_to_scale] = scaler.transform(validate[columns_to_scale])
    test_scaled[columns_to_scale] = scaler.transform(test[columns_to_scale])

    return scaler, train_scaled, validate_scaled, test_scaled
