# Predicting T-shirt size using the ANSUR II dataset
We will here try to predict a persons t-shirt size given the weight and height of the person. We will use the ANSUR II dataset which contains a lot of information about the physical attributes of a large number of people. 

We will first try to map the persons in the dataset to a t-shirt size. It is hard to find a concise size chart for t-shirt so we will create our own, initial chart, based on these assumptions:

We will only look at two measurements, Shoulder Width and Chest Circumference.

Our first problem is that Shoulder Width is not one of the measurements taken in the dataset. But we have Biacromial Breadth which is the distance between the two acromion processes. We will assume that this is the same as Shoulder Width.

We will then have these initial rules:

| Size | Percentile |
|------|------------|
| XS   | 0-5        |
| S    | 5-25       |
| M    | 25-50      |
| L    | 50-75      |
| XL   | 75-90      |
| XXL  | 90-97      |
| XXXL | 97-100     |

## Inspect the data

In [13]:
import pandas as pd

In [14]:
female = pd.read_csv('../data/female.csv')
male = pd.read_csv('../data/male.csv')

In [15]:
print(f'For women we have (rows, columns) {female.shape}')
print(f'For men we have (rows, columns) {male.shape}')


For women we have (rows, columns) (1986, 108)
For men we have (rows, columns) (4082, 108)


## Checking the percentiles

Let us determine the percentiles of the data.

In [16]:
def compute_percentile_ranges(column):
    # Define percentile ranges
    ranges = [(0, 5), (5, 25), (25, 50), (50, 75), (75, 90), (90, 97), (97, 100)]

    percentiles = {(low, high): (column.quantile(low/100), column.quantile(high/100)) for low, high in ranges}

    counts = {}

    for r, (low, high) in percentiles.items():
        counts[r] = int(((column >= low) & (column < high)).sum())
    
    return counts

print(compute_percentile_ranges(female['chestcircumference']))
print(compute_percentile_ranges(female['biacromialbreadth']))
print()
print(compute_percentile_ranges(male['chestcircumference']))
print(compute_percentile_ranges(male['biacromialbreadth']))

{(0, 5): 100, (5, 25): 396, (25, 50): 492, (50, 75): 499, (75, 90): 299, (90, 97): 140, (97, 100): 59}
{(0, 5): 93, (5, 25): 377, (25, 50): 477, (50, 75): 541, (75, 90): 297, (90, 97): 139, (97, 100): 61}

{(0, 5): 199, (5, 25): 810, (25, 50): 1025, (50, 75): 1012, (75, 90): 616, (90, 97): 295, (97, 100): 124}
{(0, 5): 191, (5, 25): 787, (25, 50): 989, (50, 75): 1079, (75, 90): 610, (90, 97): 303, (97, 100): 122}


## Generate the t-shirt size chart

In [17]:
def compute_size_percentile_mesurments(data, chest_column, shoulder_column):
    sizes = ['XS', 'S', 'M', 'L', 'XL', '2XL', '3XL']
    ranges = [0, 5, 25, 50, 75, 90, 97]

    # Compute the values for each percentile for chest and shoulder
    chest_percentiles = {p: data[chest_column].quantile(p/100) for p in ranges} # p is a percentile in ranges, and the value is the threshold below which p% of the data falls
    shoulder_percentiles = {p: data[shoulder_column].quantile(p/100) for p in ranges}

    # Map the t-shirt sizes to the corresponding chest and shoulder measurments
    size_mappings = {}
    for i, size in enumerate(sizes): # i for value in ranges, size for key in sizes
        # Create a dictionary where the key is 'size' and with the chest and shoulder measurments for each size 
        size_mappings[size] = {
            'Chest': int(chest_percentiles[ranges[i]]),
            'Shoulder': int(shoulder_percentiles[ranges[i]])
        }
    
    return size_mappings
    


print(compute_size_percentile_mesurments(female, 'chestcircumference', 'biacromialbreadth'))
print(compute_size_percentile_mesurments(male, 'chestcircumference', 'biacromialbreadth'))


{'XS': {'Chest': 695, 'Shoulder': 283}, 'S': {'Chest': 824, 'Shoulder': 335}, 'M': {'Chest': 889, 'Shoulder': 353}, 'L': {'Chest': 940, 'Shoulder': 365}, 'XL': {'Chest': 999, 'Shoulder': 378}, '2XL': {'Chest': 1057, 'Shoulder': 389}, '3XL': {'Chest': 1117, 'Shoulder': 400}}
{'XS': {'Chest': 774, 'Shoulder': 337}, 'S': {'Chest': 922, 'Shoulder': 384}, 'M': {'Chest': 996, 'Shoulder': 403}, 'L': {'Chest': 1056, 'Shoulder': 415}, 'XL': {'Chest': 1117, 'Shoulder': 428}, '2XL': {'Chest': 1172, 'Shoulder': 441}, '3XL': {'Chest': 1233, 'Shoulder': 452}}


In [18]:
# {'XS': {'Chest': 695, 'Shoulder': 283}, 'S': {'Chest': 824, 'Shoulder': 335}, 'M': {'Chest': 889, 'Shoulder': 353}, 'L': {'Chest': 940, 'Shoulder': 365}, 'XL': {'Chest': 999, 'Shoulder': 378}, '2XL': {'Chest': 1057, 'Shoulder': 389}, '3XL': {'Chest': 1117, 'Shoulder': 400}}

female_sizes = {
    'XS': {'Chest': 695, 'Shoulder': 283}, 
    'S': {'Chest': 824, 'Shoulder': 335}, 
    'M': {'Chest': 889, 'Shoulder': 353}, 
    'L': {'Chest': 940, 'Shoulder': 365}, 
    'XL': {'Chest': 999, 'Shoulder': 378}, 
    '2XL': {'Chest': 1057, 'Shoulder': 389}, 
    '3XL': {'Chest': 1117, 'Shoulder': 400}
    }

male_sizes = {
    'XS': {'Chest': 774, 'Shoulder': 337}, 
    'S': {'Chest': 922, 'Shoulder': 384}, 
    'M': {'Chest': 996, 'Shoulder': 403}, 
    'L': {'Chest': 1056, 'Shoulder': 415}, 
    'XL': {'Chest': 1117, 'Shoulder': 428}, 
    '2XL': {'Chest': 1172, 'Shoulder': 441}, 
    '3XL': {'Chest': 1233, 'Shoulder': 452}
    }

In [19]:
def get_size(data, size_chart):
    matches = {size: 0 for size in size_chart.keys()}   
    ties = 0

    for _, row in data.iterrows():
        possible_sizes = []

        for size, measurments in size_chart.items():
            if row['chestcircumference'] <= measurments['Chest'] and row['biacromialbreadth'] >= measurments['Shoulder']:
                possible_sizes.append(size)

        if len(possible_sizes) == 1:
            matches[possible_sizes[0]] += 1
        elif len(possible_sizes) > 1:
            ties += 1

    return matches, ties

female_matches, female_ties = get_size(female, female_sizes)
male_matches, male_ties = get_size(male, male_sizes)

print(f'Female matches: \n {female_matches} \nTies:  {female_ties}\n')
print(f'Male matches: \n {male_matches} \nTies:  {male_ties}')

Female matches: 
 {'XS': 0, 'S': 35, 'M': 115, 'L': 142, 'XL': 77, '2XL': 34, '3XL': 10} 
Ties:  390

Male matches: 
 {'XS': 1, 'S': 70, 'M': 228, 'L': 286, 'XL': 164, '2XL': 69, '3XL': 38} 
Ties:  625


This is not good. Let us have overlapping measurments.

In [20]:
def create_overlapping_size_chart(original_chart):
    overlapping_chart = {} # Create an empty dictionary to store the overlapping sizes

    sizes = list(original_chart.keys()) # Get the sizes from the original size chart

    for i, size in enumerate(sizes):
        overlapping_chart[size] = {} 
        if i == 0: 
           # first size
            overlapping_chart[size]['Chest'] = [original_chart[size]['Chest'], original_chart[sizes[i+1]]['Chest']+5]
            overlapping_chart[size]['Shoulder'] = [original_chart[size]['Shoulder'], original_chart[sizes[i+1]]['Shoulder']+5]

        # last size
        elif i == len(sizes)-1:
            overlapping_chart[size]['Chest'] = [original_chart[size]['Chest']-5, original_chart[size]['Chest']+1000]
            overlapping_chart[size]['Shoulder'] = [original_chart[size]['Shoulder']-5, original_chart[size]['Shoulder']+1000]
        
        # all other sizes
        else:
            overlapping_chart[size]['Chest'] = [original_chart[size]['Chest']-5, original_chart[sizes[i+1]]['Chest']+5]
            overlapping_chart[size]['Shoulder'] = [original_chart[size]['Shoulder']-5, original_chart[sizes[i+1]]['Shoulder']+5]

    return overlapping_chart

new_female_sizes = create_overlapping_size_chart(female_sizes)
new_male_sizes = create_overlapping_size_chart(male_sizes)

for k,v in new_female_sizes.items():
    print(f'{k}: {v}')

print()

for k,v in new_male_sizes.items():
    print(f'{k}: {v}')

XS: {'Chest': [695, 829], 'Shoulder': [283, 340]}
S: {'Chest': [819, 894], 'Shoulder': [330, 358]}
M: {'Chest': [884, 945], 'Shoulder': [348, 370]}
L: {'Chest': [935, 1004], 'Shoulder': [360, 383]}
XL: {'Chest': [994, 1062], 'Shoulder': [373, 394]}
2XL: {'Chest': [1052, 1122], 'Shoulder': [384, 405]}
3XL: {'Chest': [1112, 2117], 'Shoulder': [395, 1400]}

XS: {'Chest': [774, 927], 'Shoulder': [337, 389]}
S: {'Chest': [917, 1001], 'Shoulder': [379, 408]}
M: {'Chest': [991, 1061], 'Shoulder': [398, 420]}
L: {'Chest': [1051, 1122], 'Shoulder': [410, 433]}
XL: {'Chest': [1112, 1177], 'Shoulder': [423, 446]}
2XL: {'Chest': [1167, 1238], 'Shoulder': [436, 457]}
3XL: {'Chest': [1228, 2233], 'Shoulder': [447, 1452]}


In [21]:
female_sizes = {
    'XS': {'Chest': [695, 829], 'Shoulder': [283, 340]},
    'S': {'Chest': [819, 894], 'Shoulder': [330, 358]},
    'M': {'Chest': [884, 945], 'Shoulder': [348, 370]},
    'L': {'Chest': [935, 1004], 'Shoulder': [360, 383]},
    'XL': {'Chest': [994, 1062], 'Shoulder': [373, 394]},
    '2XL': {'Chest': [1052, 1122], 'Shoulder': [384, 405]},
    '3XL': {'Chest': [1112, 2117], 'Shoulder': [395, 1400]},
}

male_sizes = {
    'XS': {'Chest': [774, 927], 'Shoulder': [337, 389]},
    'S': {'Chest': [917, 1001], 'Shoulder': [379, 408]},
    'M': {'Chest': [991, 1061], 'Shoulder': [398, 420]},
    'L': {'Chest': [1051, 1122], 'Shoulder': [410, 433]},
    'XL': {'Chest': [1112, 1177], 'Shoulder': [423, 446]},
    '2XL': {'Chest': [1167, 1238], 'Shoulder': [436, 457]},
    '3XL': {'Chest': [1228, 2233], 'Shoulder': [447, 1452]},
}

# Todays Lab
 
Last time, we created a function get_size to get a clearer view of how many matches and ties we had. Now, I want you to do the same thing for the new size charts we created, but this time taking into consideration that we have two measurements instead of one. The goal remains the same: find out how many matches and how many ties we have.
 
### Task
Analyze the data: Use the new size charts to determine the number of matches and ties based on two measurements.
Count matches and ties: Write a function that iterates through each person's measurements, compares them with the new size charts, and counts the number of matches and ties.
### Bonus
Modify the function to handle ties. If there is a tie and the sizes are adjacent, choose the larger size to increase the number of matches.

In [22]:
# Task
# Analyze the data: Use the new size charts to determine the number of matches and ties based on two measurements.
# Count matches and ties: Write a function that iterates through each person's measurements, compares them with the new size charts, and counts the number of matches and ties.

def count_matches_and_ties(data, size_chart):
    matches = {size: 0 for size in size_chart.keys()}
    ties = 0
    total = 0
    else_count = 0

    for _, row in data.iterrows():
        posible_sizes = []

        for size, measurements in size_chart.items():
            if row['chestcircumference'] >= measurements['Chest'][0] and row['chestcircumference'] < measurements['Chest'][1] and row['biacromialbreadth'] >= measurements['Shoulder'][0] and row['biacromialbreadth'] < measurements['Shoulder'][1]:
                posible_sizes.append(size)

        if len(posible_sizes) == 1:
            matches[posible_sizes[0]] += 1
        elif len(posible_sizes) > 1:
            # If there is a tie and the sizes are adjacent, choose the larger size to increase the number of matches.
            if posible_sizes[0] == 'XS' and posible_sizes[1] == 'S':
                matches['S'] += 1
            elif posible_sizes[0] == 'S' and posible_sizes[1] == 'M':
                matches['M'] += 1
            elif posible_sizes[0] == 'M' and posible_sizes[1] == 'L':
                matches['L'] += 1
            elif posible_sizes[0] == 'L' and posible_sizes[1] == 'XL':
                matches['XL'] += 1
            elif posible_sizes[0] == 'XL' and posible_sizes[1] == '2XL':
                matches['2XL'] += 1
            elif posible_sizes[0] == '2XL' and posible_sizes[1] == '3XL':
                matches['3XL'] += 1
                
            ties += 1
        else:
            if len(posible_sizes) == 0:
                else_count += 1
                # print(f"Unmatched: Chest={row['chestcircumference']}, Shoulder={row['biacromialbreadth']}")
# pepo        total += 1

    print(f'Matches: {matches}')
    print(f'Ties: {ties}')
    print(f'Else: {else_count}')
    print(f'Total: {total}')


    return matches, ties
                
m, t = count_matches_and_ties(female, new_female_sizes)
mm, tt = count_matches_and_ties(male, new_male_sizes)

Matches: {'XS': 20, 'S': 178, 'M': 228, 'L': 262, 'XL': 111, '2XL': 35, '3XL': 13}
Ties: 56
Else: 1139
Total: 0
Matches: {'XS': 60, 'S': 407, 'M': 540, 'L': 569, 'XL': 323, '2XL': 98, '3XL': 50}
Ties: 144
Else: 2035
Total: 0


# Lecture anser

In [25]:
def get_size(data, size_chart):
    matches ={size: 0 for size in size_chart.keys()}
    ties = 0
 
    size_ordered = list(size_chart.keys())
 
    for _, row in data.iterrows():
        possible_sizes = []
 
        for size, measurments in size_chart.items():
            if( row['biacromialbreadth'] >= measurments['Shoulder'][0] and
                row['biacromialbreadth'] <= measurments['Shoulder'][1] and
                row['chestcircumference'] >= measurments['Chest'][0] and
                row['chestcircumference'] <= measurments['Chest'][1]):
                possible_sizes.append(size)
       
        if len(possible_sizes) == 1:
            matches[possible_sizes[0]] += 1
        elif len(possible_sizes) > 1:
            # Check if sizes are adjacent
            are_adjacent = all([abs(size_ordered.index(possible_sizes[i]) - size_ordered.index(possible_sizes[i+1])) == 1
                                for i in range(len(possible_sizes)-1)])
           
            if are_adjacent:
                # Assign the larger size
                larger_size = max(possible_sizes, key=lambda s: size_ordered.index(s))
                matches[larger_size] += 1
            else:
                ties += 1
       
    return matches, ties

female_matches, female_ties = get_size(female, female_sizes)
male_matches, male_ties = get_size(male, male_sizes)

print(f'Female matches: \n {female_matches} \nTies:  {female_ties}\n')
print(f'Male matches: \n {male_matches} \nTies:  {male_ties}')

Female matches: 
 {'XS': 23, 'S': 185, 'M': 247, 'L': 276, 'XL': 118, '2XL': 35, '3XL': 13} 
Ties:  0

Male matches: 
 {'XS': 63, 'S': 428, 'M': 578, 'L': 593, 'XL': 331, '2XL': 101, '3XL': 50} 
Ties:  0
