In [2]:
import pandas as pd
import glob
import os
import pathlib
import re
import unicodedata

In [37]:
#FUNCTIONS

#=====parsing strings for height calculations=====
def height_parser(str_in):
    height = re.findall(r'(\d{1,3}(?:\.\d+)?)', str_in)
    h_num = [float(i) for i in height]
    conversion_unit = 12 #inches
    inches = (h_num[0]*conversion_unit) + h_num[1]
    return inches

#=====standardize names=====
def clean_names(str_in):
    str_out = (unicodedata.normalize('NFKD', str_in)
    .encode('ascii','ignore')
    .decode('utf-8') 
    .replace("'","") #e.g., o'donnel
    .replace("  "," ") #just in case
    .strip().lower())
    return str_out

#test 0027 for clean_names
#asdf = pd.read_csv('00-raw/Wingspan_Raw/wingspan_crafted_2026.csv')
#asdf['the-name'] = asdf['the-name'].apply(clean_names)
#print(asdf['the-name'].iloc[188:192])

#=====create new columns=====
def proc_aio(df_in):
    columns_old = ['height','wingspan','relative']
    columns_new = ['height-inches', 'wingspan_inches', 'relative']
    for old, new in zip(columns_old[:2], columns_new[:2]):
        if new not in df_in.columns:
            df_in[new] = df_in[old].apply(height_parser)
            print(f'did {new}')
    if columns_new[2] not in df_in.columns: #the column_new/old names need to be replaced
        df_in[columns[2]] = df_in[columns[1]] - df_in[columns[0]]
        print(f'did {columns[1]}/{df_in.columns[1]} - {columns[0]}/{df_in.columns[0]} to calculate \
        {columns[2]}/{df_in.columns[2]}')
    df_in = df_in[['name','height-inches', 'wingspan_inches', 'relative']]
    return df_in
    
    #leftovers
    #df['height_inches'] = df['height'].apply(height_parser)
    #df['wingspan_inches'] = df['wingspan'].apply(height_parser)
    #df['relative'] = df['wingspan_inches'] - df['height_inches']

In [38]:
#testing proc_aio
file_name = 'wingspan_crafted_2026'
df = pd.read_csv(f'00-raw/Wingspan_Raw/{file_name}.csv')
names = {
    'the-name':'name',
    'tablescraper-selected-row':'height',
    'tablescraper-selected-row 2':'wingspan',
    'tablescraper-selected-row 3':'relative'
}

df = df.rename(columns=names)
df = df[['name','height','wingspan','relative']]
df = df.dropna(axis='index', how='any')

df = proc_aio(df)

df.head()

did height-inches
did wingspan_inches


Unnamed: 0,name,height-inches,wingspan_inches,relative
0,Mo Bamba,83.25,94.0,10.75
1,Jalen Williams,76.5,86.25,9.75
2,Isaiah Stewart,79.25,88.75,9.5
3,Robert Williams III,80.0,89.5,9.5
4,Cedric Coward,77.25,86.25,9.0


In [39]:
subdirectory = '00-raw/Wingspan_Raw/wingspan_nbaorg'
target_directory = pathlib.Path.cwd() / subdirectory
file_list = list(target_directory.glob('*.csv'))

print(f'datasets merged: {len(file_list)}')

df = pd.DataFrame()

for file in file_list:
    add_df = pd.read_csv(file)
    df = pd.concat([df, add_df], ignore_index = True)

print(df.shape)

datasets merged: 20
(1313, 10)


In [40]:
#rename columns: need name, height, wingspan - and for later, way to 
df.columns = df.columns.str.strip()

names = {
    'Crom_text__NpR1_':'name',
    'tablescraper-selected-row 4':'height',
    'tablescraper-selected-row 8':'wingspan'
}

df = df.rename(columns=names)
df = df[['name','height','wingspan']]
df = df.dropna(axis='index', how='any')

df.head(1)

Unnamed: 0,name,height,wingspan
0,Quincy Acy,6' 6.5'',7' 2.75''


In [41]:
df['name'] = df['name'].apply(clean_names) 
df['height_inches'] = df['height'].apply(height_parser)
df['wingspan_inches'] = df['wingspan'].apply(height_parser)
df['relative'] = df['wingspan_inches'] - df['height_inches']

df=df[['name','height_inches','wingspan_inches','relative']]

nbaorg_combined = df
#df.to_csv('01-interim/wingspan_cleaned/nbaorg_combined_cleaned.csv')

In [42]:
#========= CLEANING 2025 WINGSPAN =========

In [47]:
file_name = 'wingspan_crafted_2025'
df = pd.read_csv(f'00-raw/Wingspan_Raw/{file_name}.csv')

df['name'] = df['name'].apply(clean_names) 
df['relative'] = df['wingspan_inches'] - df['height_inches']
df=df[['name','height_inches','wingspan_inches','relative']]

df = df.dropna(axis='index', how='any')



wingspan_crafted_2025 = df
wingspan_crafted_2025.head(1)

df.to_csv(f'01-interim/wingspan_cleaned/{file_name}_cleaned.csv')

In [44]:
#========= CLEANING 2026 WINGSPAN =========
file_name = 'wingspan_crafted_2026'
df = pd.read_csv(f'00-raw/Wingspan_Raw/{file_name}.csv')
names = {
    'the-name':'name',
    'tablescraper-selected-row':'height',
    'tablescraper-selected-row 2':'wingspan',
    'tablescraper-selected-row 3':'relative'
}

df = df.rename(columns=names)
df = df[['name','height','wingspan','relative']]
df = df.dropna(axis='index', how='any')

#display(raw_df)
#display(df)

In [45]:
df['height_inches'] = df['height'].apply(height_parser)
df['wingspan_inches'] = df['wingspan'].apply(height_parser)
df=df[['name','height_inches','wingspan_inches','relative']]

wingspan_crafted_2026 = df
wingspan_crafted_2026.head(1)


#df.to_csv(f'01-interim/wingspan_cleaned/{file_name}_cleaned.csv')

Unnamed: 0,name,height_inches,wingspan_inches,relative
0,Mo Bamba,83.25,94.0,10.75


In [46]:
#WINGSPAN_KAGGLE
file_name = 'wingspan_Kaggle'
df = pd.read_csv(f'00-raw/Wingspan_Raw/{file_name}.csv')
names = {
    'PLAYER':'name',
    'HGT':'height_inches',
    'WNGSPN':'wingspan_inches'
}

df = df.rename(columns=names)
df = df[['name','height_inches','wingspan_inches']]
df['relative'] = df['wingspan_inches'] - df['height_inches']
df = df.dropna(axis='index', how='any')

wingspan_Kaggle = df
wingspan_Kaggle.head(1)
#df.to_csv(f'01-interim/wingspan_cleaned/{file_name}_cleaned.csv')

Unnamed: 0,name,height_inches,wingspan_inches,relative
0,"Almansa, Izan",81.25,85.75,4.5


In [11]:
#=========== FINISHED INDIVIDUAL PROC, NOW MERGE ==============

In [12]:
list_source = [nbaorg_combined,
               wingspan_crafted_2025,
               wingspan_crafted_2026,
               wingspan_Kaggle]


df_merge = pd.DataFrame()

for f in list_source:
    df_merge = df_merge.combine_first(f)

df_merge=df_merge[['name','height_inches','wingspan_inches','relative']]
print(df_merge)

df_merge.to_csv(f'01-interim/wingspan_cleaned/combined_cleaned.csv')



'''print(f'datasets merged: {len(file_list)}')

df = pd.DataFrame()

for file in file_list:
    add_df = pd.read_csv(file)
    df = pd.concat([df, add_df], ignore_index = True)

print(df.shape)'''

                   name  height_inches  wingspan_inches  relative
0            Quincy Acy          78.50            86.75      8.25
1       Harrison Barnes          79.00            83.25      4.25
2           Will Barton          77.00            81.75      4.75
3          Bradley Beal          75.25            80.00      4.75
4         J'Covan Brown          73.00            77.25      4.25
...                 ...            ...              ...       ...
1673        Smith, Mike          78.50            80.50      2.00
1674  Stephens, Jarrett          77.25            81.25      4.00
1675   Thornton, Bootsy          75.50            78.50      3.00
1676      Walls, Jaquay          73.25            74.50      1.25
1677    Watkins, Jameel          81.50            90.00      8.50

[1676 rows x 4 columns]


"print(f'datasets merged: {len(file_list)}')\n\ndf = pd.DataFrame()\n\nfor file in file_list:\n    add_df = pd.read_csv(file)\n    df = pd.concat([df, add_df], ignore_index = True)\n\nprint(df.shape)"