In [5]:
import numpy as np
import json

In [6]:
# Load the dataset's documentation file. The context manager guarantees the
# file handle is closed as soon as the JSON has been parsed (the bare
# json.load(open(...)) form leaves it open until garbage collection).
with open("data/Mall_Customers_doc_source.json", "r", encoding="utf-8") as doc_file:
    j_file = json.load(doc_file)

In [7]:
# Display the "DOC" entry: a mapping from each column name to its description.
j_file["DOC"]

{'CustomerID': 'Unique ID assigned to the customer',
 'Gender': 'Gender of the customer',
 'Age': 'Age of the customer',
 'Annual Income (k$)': 'Annual Income of the customer',
 'Spending Score (1-100)': 'Score assigned by the mall based on customer behavior and spending nature'}

# Mall customers: simple numpy use-case

## import all columns of the "Mall_Customers.csv" dataset with numpy (loadtxt)

In [8]:
# Read the CSV as strings. unpack=True transposes the result, so each array row
# is one CSV column, and the header line is kept: index 0 of every row holds the
# column name.
# NOTE(review): usecols=(0, 1, 2, 3) loads only the first four columns; the
# 'Spending Score (1-100)' field listed in the documentation JSON (presumably
# the fifth CSV column) is not loaded — confirm this is intended.
mall_customers = np.loadtxt('data/Mall_Customers.csv', dtype='str', delimiter=',', usecols=(0, 1, 2, 3), unpack=True)
mall_customers

array([['CustomerID', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
        '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21',
        '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32',
        '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43',
        '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54',
        '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65',
        '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76',
        '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87',
        '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98',
        '99', '100', '101', '102', '103', '104', '105', '106', '107',
        '108', '109', '110', '111', '112', '113', '114', '115', '116',
        '117', '118', '119', '120', '121', '122', '123', '124', '125',
        '126', '127', '128', '129', '130', '131', '132', '133', '134',
        '135', '136', '137', '138', '139', '140', '1

### check memory footprint of values (nbytes)
- how can we reduce the memory footprint (without losing data)?

In [13]:
# check memory footprint of values
# nbytes = number of cells x bytes per cell. The array is fixed-width unicode
# ('<U18'), so every cell occupies 18 characters x 4 bytes = 72 bytes, even a
# short value such as '19'. Storing the numeric columns in an integer dtype
# would shrink this considerably without losing data.
memory = mall_customers.nbytes
memory

57888

### select the first 6 rows from each column

### select data from males with age above 30

### create a 'give_statistics' function
- returns a dictionary with:
    - mean
    - std
    - median
    - min
    - max
    - count of non-empty values
### what are the statistics for the (whole) dataset ?

### create a function that does a group by

In [14]:
# Slice positions 0-5 of every column: the header cell plus the first five records.
# NOTE(review): if six data records are wanted, the slice should be 1:7 (or 0:7
# to keep the header) — confirm the intent.
mall_customers[:, 0:6]

array([['CustomerID', '1', '2', '3', '4', '5'],
       ['Gender', 'Male', 'Male', 'Female', 'Female', 'Female'],
       ['Age', '19', '21', '20', '23', '31'],
       ['Annual Income (k$)', '15', '15', '16', '16', '17']], dtype='<U18')

In [15]:
# Select the records of male customers strictly older than 30.
gender = mall_customers[1]
age = mall_customers[2]

males = gender == 'Male'
# Compare ages numerically rather than as strings: lexicographically,
# '9' >= '30' is True, '100' >= '30' is False, and the 'Age' header itself
# passes the test. Index 0 holds the header, so it stays False and only the
# data cells are converted. "Above 30" is strict, hence > rather than >=.
above_30 = np.zeros(age.shape, dtype=bool)
above_30[1:] = age[1:].astype(np.float64) > 30
males_above_30 = mall_customers[:, males & above_30]
males_above_30.T

array([['9', 'Male', '64', '19'],
       ['11', 'Male', '67', '19'],
       ['15', 'Male', '37', '20'],
       ['19', 'Male', '52', '23'],
       ['21', 'Male', '35', '24'],
       ['24', 'Male', '31', '25'],
       ['28', 'Male', '35', '28'],
       ['31', 'Male', '60', '30'],
       ['33', 'Male', '53', '33'],
       ['43', 'Male', '48', '39'],
       ['52', 'Male', '33', '42'],
       ['54', 'Male', '59', '43'],
       ['56', 'Male', '47', '43'],
       ['58', 'Male', '69', '44'],
       ['60', 'Male', '53', '46'],
       ['61', 'Male', '70', '46'],
       ['65', 'Male', '63', '48'],
       ['71', 'Male', '70', '49'],
       ['75', 'Male', '59', '54'],
       ['78', 'Male', '40', '54'],
       ['81', 'Male', '57', '54'],
       ['82', 'Male', '38', '54'],
       ['83', 'Male', '67', '54'],
       ['86', 'Male', '48', '54'],
       ['93', 'Male', '48', '60'],
       ['99', 'Male', '48', '61'],
       ['103', 'Male', '67', '62'],
       ['105', 'Male', '49', '62'],
       ['108', 'Mal

In [17]:
def _column_statistics(cells, label):
    """Summarise one column of string cells under keys prefixed with `label`.

    Empty cells ('') are excluded from the numeric statistics and from the
    'non_empty' count; every other cell must be convertible to float.
    """
    cells = np.asarray(cells, dtype=str)
    filled = cells != ''
    values = cells[filled].astype(np.float64)
    return {
        f'{label} mean': np.mean(values),
        f'{label} std': np.std(values),
        f'{label} median': np.median(values),
        f'{label} min': np.min(values),
        f'{label} max': np.max(values),
        # Count the cells that actually hold a value. Counting non-zero
        # numbers instead would wrongly treat a legitimate 0 as "empty".
        f'{label} non_empty': int(np.count_nonzero(filled)),
    }


def give_statistics(arr):
    """Return descriptive statistics for the age and annual-income columns.

    Parameters
    ----------
    arr : numpy.ndarray of str
        Column-major table: each row is one dataset column whose first cell
        is the header. Row 2 is read as the ages and row 3 as the annual
        income.

    Returns
    -------
    dict
        For each of the prefixes 'ages' and 'income': mean, std, median, min
        and max (computed over the non-empty cells) plus 'non_empty', the
        number of non-empty cells.

    Raises
    ------
    ValueError
        If a non-empty cell is not numeric, or (from np.min / np.max) if a
        column has no non-empty cell at all.
    """
    ages = _column_statistics(arr[2, 1:], 'ages')
    income = _column_statistics(arr[3, 1:], 'income')
    return {**ages, **income}

# Statistics for the whole dataset (age and annual-income columns).
give_statistics(mall_customers)

{'ages mean': np.float64(38.85),
 'ages std': np.float64(13.934041050606963),
 'ages median': np.float64(36.0),
 'ages min': np.float64(18.0),
 'ages max': np.float64(70.0),
 'ages non_empty': 200,
 'income mean': np.float64(60.56),
 'income std': np.float64(26.19897707926781),
 'income median': np.float64(61.5),
 'income min': np.float64(15.0),
 'income max': np.float64(137.0),
 'income non_empty': 200}

In [18]:
def _row_by_header(arr, name):
    """Return the data cells (header excluded) of the row whose first cell is `name`."""
    matches = [i for i in range(arr.shape[0]) if arr[i][0] == name]
    if not matches:
        raise ValueError(f"column {name!r} not found")
    # With duplicated headers the last matching row is used.
    return arr[matches[-1], 1:]


def group_by(arr, aggregation=None, colonne=None, value=None, target=None):
    """Aggregate `target` over the records whose `colonne` equals `value`.

    Parameters
    ----------
    arr : numpy.ndarray of str
        Column-major table: each row is one dataset column whose first cell
        is the header.
    aggregation : str
        One of "mean", "std", "median", "min", "max".
    colonne : str
        Header of the column to filter on.
    value : str or number
        Value selecting the records. It is compared numerically when the
        filter column is numeric, otherwise as text (e.g. colonne="Gender",
        value="Male").
    target : str
        Header of the numeric column to aggregate.

    Returns
    -------
    dict
        Single entry "<Aggregation> <target>" mapped to the result as a float.

    Raises
    ------
    ValueError
        If `aggregation` is not supported, if `colonne` or `target` is not a
        header of `arr`, or if the target column is not numeric.

    Notes
    -----
    The selected target values are printed before the result is returned.
    """
    aggregation_functions = {
        "mean": np.mean,
        "std": np.std,
        "median": np.median,
        "min": np.min,
        "max": np.max,
    }
    if aggregation not in aggregation_functions:
        raise ValueError(f"{aggregation} is not a valid aggregation")

    source_cells = _row_by_header(arr, colonne)
    target_col = _row_by_header(arr, target).astype(np.float64)

    try:
        # Numeric comparison so that "19", 19 and 19.0 all select the same rows.
        condition_col = source_cells.astype(np.float64) == float(value)
    except ValueError:
        # Non-numeric filter column (or value): fall back to exact text match.
        condition_col = source_cells == str(value)

    selected = target_col[condition_col]
    solution = aggregation_functions[aggregation](selected)

    print(selected)
    return {f"{aggregation.capitalize()} {target}": float(solution)}

# Example: standard deviation of the annual income among customers aged 19.
group_by(mall_customers, aggregation="std", colonne="Age", value="19", target="Annual Income (k$)")

[15. 46. 48. 63. 64. 65. 74. 81.]


{'Std Annual Income (k$)': 19.300259065618782}