### Similarity function

In [1]:
import pandas as pd
from scipy import spatial

In [2]:
# Load the three input tables; the first CSV column becomes each frame's index.
cbb21 = pd.read_csv('CSV_Files/cbb_21.csv', index_col=0)
s16_team_avg = pd.read_csv('CSV_Files/avg_by_round.csv', index_col=0)
s16_teams = pd.read_csv('CSV_Files/cbb_s16.csv', index_col=0)

In [11]:
# For each team, find the postseason profile (row of df2) it is most similar to

def full_similarity(df1, df2):
    """Find the most similar postseason profile for every team in ``df1``.

    Similarity is the inverse Euclidean distance between a team's feature
    vector and a profile's feature vector. Team features are every column
    of ``df1`` except ``TEAM``, ``SEED`` and ``BARTHAG``; profile features
    are every column of ``df2`` except ``POSTSEASON`` and ``BARTHAG``.

    NOTE(review): features are compared positionally, so both frames are
    assumed to hold the same feature columns in the same order -- confirm
    against the input CSVs.

    Args:
        df1: DataFrame with ``TEAM``, ``SEED``, ``BARTHAG`` and numeric
            feature columns, one row per team.
        df2: DataFrame with ``POSTSEASON``, ``BARTHAG`` and numeric feature
            columns, one row per postseason profile.

    Returns:
        A list with one ``[team, seed, similarity, postseason]`` entry per
        row of ``df1``. ``similarity`` is ``inf`` for an exact match. When
        no profile yields a positive similarity (for example ``df2`` is
        empty) the entry is ``[team, seed, 0, None]``.
    """
    # Build the feature matrices once; each matrix row is a 1-D vector, which
    # is what scipy's distance functions require.
    team_features = df1.drop(columns=['TEAM', 'SEED', 'BARTHAG']).to_numpy(dtype=float)
    profile_features = df2.drop(columns=['POSTSEASON', 'BARTHAG']).to_numpy(dtype=float)
    postseasons = df2['POSTSEASON'].tolist()

    final_results = []
    for team, seed, team_vector in zip(df1['TEAM'], df1['SEED'], team_features):
        # Reset the best match for every team so a result can never leak in
        # from the previous team.
        max_similarity = 0
        best_postseason = None

        for postseason, profile_vector in zip(postseasons, profile_features):
            distance = spatial.distance.euclidean(team_vector, profile_vector)
            # A zero distance is a perfect match; avoid dividing by zero.
            similarity = 1 / distance if distance else float('inf')
            if similarity > max_similarity:
                max_similarity = similarity
                best_postseason = postseason

        final_results.append([team, int(seed), max_similarity, best_postseason])

    return final_results
        


In [12]:
# One row per team: its seed, best similarity score and matching postseason value.
Teams_by_avg_postseason = pd.DataFrame(
    full_similarity(cbb21, s16_team_avg),
    columns=['Team', 'Team_Seed', 'Similarity_Score', 'Postseason_Finish'],
)


In [34]:
# List each team's similarity to every postseason profile (row of df2)

def full_similarity2(df1, df2):
    """List the similarity of every team in ``df1`` to every row of ``df2``.

    Similarity is the inverse Euclidean distance between a team's feature
    vector (all ``df1`` columns except ``TEAM``, ``SEED``, ``BARTHAG``) and a
    profile's feature vector (all ``df2`` columns except ``POSTSEASON``,
    ``BARTHAG``).

    NOTE(review): features are compared positionally, so both frames are
    assumed to hold the same feature columns in the same order -- confirm
    against the input CSVs.

    Args:
        df1: DataFrame with ``TEAM``, ``SEED``, ``BARTHAG`` and numeric
            feature columns, one row per team.
        df2: DataFrame with ``POSTSEASON``, ``BARTHAG`` and numeric feature
            columns, one row per postseason profile.

    Returns:
        A flat list of ``"<team>, <similarity>, <postseason>"`` strings, one
        per (team, profile) pair, ordered by team and then by profile.
    """
    # Build the feature matrices once; each matrix row is a 1-D vector, which
    # is what scipy's distance functions require.
    team_features = df1.drop(columns=['TEAM', 'SEED', 'BARTHAG']).to_numpy(dtype=float)
    profile_features = df2.drop(columns=['POSTSEASON', 'BARTHAG']).to_numpy(dtype=float)
    postseasons = df2['POSTSEASON'].tolist()

    final_results = []
    for team, team_vector in zip(df1['TEAM'], team_features):
        for postseason, profile_vector in zip(postseasons, profile_features):
            distance = spatial.distance.euclidean(team_vector, profile_vector)
            # A zero distance is a perfect match; avoid dividing by zero.
            similarity = 1 / distance if distance else float('inf')
            final_results.append(f'{team}, {similarity}, {postseason}')

    return final_results

In [35]:
# Display the "<team>, <similarity>, <postseason>" string for every team/profile pair.
full_similarity2(cbb21, s16_team_avg)

['Michigan, 0.043987484580258196, 1',
 'Michigan, 0.05588801284388184, 2',
 'Michigan, 0.0650472042346112, 3',
 'Michigan, 0.0668874333163471, 4',
 'Michigan, 0.0676875393641978, 5',
 'Michigan, 0.08283378607558742, 6',
 'Michigan, 0.07478729356911597, 7',
 'Baylor, 0.04038615747763096, 1',
 'Baylor, 0.050156915661401784, 2',
 'Baylor, 0.056605274629342596, 3',
 'Baylor, 0.05921019611120142, 4',
 'Baylor, 0.058840887942623946, 5',
 'Baylor, 0.058008958580333725, 6',
 'Baylor, 0.06785475116317397, 7',
 'Illinois, 0.0493788669658046, 1',
 'Illinois, 0.06642409318252683, 2',
 'Illinois, 0.08033919559193115, 3',
 'Illinois, 0.09451904535316781, 4',
 'Illinois, 0.07801930406142199, 5',
 'Illinois, 0.09633694903854474, 6',
 'Illinois, 0.08483159867654012, 7',
 'Gonzaga, 0.03292199572600831, 1',
 'Gonzaga, 0.03905154987971082, 2',
 'Gonzaga, 0.043806926001190065, 3',
 'Gonzaga, 0.04927416996966458, 4',
 'Gonzaga, 0.04387341645254749, 5',
 'Gonzaga, 0.05394916269067185, 6',
 'Gonzaga, 0.055102

### Expanded for comparison to all the s16 teams

In [3]:
# reset_index() gives the frame a fresh default index and keeps the old index
# as a column. full_similarity_all drops a column named 'index', so the old
# index is presumably unnamed -- verify against the CSV.
s16_teams_indexed = s16_teams.reset_index()

In [4]:
# For each team, find the most similar historical team (row of df2)

def full_similarity_all(df1, df2):
    """Find the most similar ``df2`` team for every team in ``df1``.

    Similarity is the inverse cosine distance between feature vectors. Team
    features are every column of ``df1`` except ``TEAM``, ``SEED`` and
    ``BARTHAG``; candidate features are every column of ``df2`` except
    ``index``, ``TEAM``, ``SEED``, ``POSTSEASON``, ``BARTHAG`` and ``YEAR``.

    NOTE(review): features are compared positionally, so both frames are
    assumed to hold the same feature columns in the same order -- confirm
    against the input CSVs.

    Args:
        df1: DataFrame with ``TEAM``, ``SEED``, ``BARTHAG`` and numeric
            feature columns, one row per team.
        df2: DataFrame with ``index``, ``TEAM``, ``SEED``, ``POSTSEASON``,
            ``BARTHAG``, ``YEAR`` and numeric feature columns, one row per
            candidate team.

    Returns:
        A list with one ``[team, seed, similarity, best_candidate_team,
        best_candidate_postseason]`` entry per row of ``df1``.
        ``similarity`` is ``inf`` when the vectors point in exactly the same
        direction. When no candidate yields a positive similarity (for
        example ``df2`` is empty) the entry is ``[team, seed, 0, None, None]``.
    """
    # Build the feature matrices once; each matrix row is a 1-D vector, which
    # is what scipy's distance functions require.
    team_features = df1.drop(columns=['TEAM', 'SEED', 'BARTHAG']).to_numpy(dtype=float)
    candidate_features = df2.drop(
        columns=['index', 'TEAM', 'SEED', 'POSTSEASON', 'BARTHAG', 'YEAR']
    ).to_numpy(dtype=float)
    candidate_teams = df2['TEAM'].tolist()
    candidate_postseasons = df2['POSTSEASON'].tolist()

    final_results = []
    for team, seed, team_vector in zip(df1['TEAM'], df1['SEED'], team_features):
        # Reset the best match for every team so a result can never leak in
        # from the previous team.
        max_similarity = 0
        best_team = None
        best_postseason = None

        candidates = zip(candidate_teams, candidate_postseasons, candidate_features)
        for candidate_team, candidate_postseason, candidate_vector in candidates:
            distance = spatial.distance.cosine(team_vector, candidate_vector)
            if distance <= 0:
                # Parallel vectors: rounding can give 0 or a tiny negative
                # distance; both mean a perfect match.
                similarity = float('inf')
            else:
                # A NaN distance stays NaN here and never beats the maximum.
                similarity = 1 / distance

            if similarity > max_similarity:
                max_similarity = similarity
                best_team = candidate_team
                best_postseason = candidate_postseason

        final_results.append(
            [team, int(seed), max_similarity, best_team, best_postseason]
        )

    return final_results
        

In [5]:
# One row per team: its seed, best similarity score, and the matched team with its postseason value.
Team_comparison = pd.DataFrame(
    full_similarity_all(cbb21, s16_teams_indexed),
    columns=[
        'Active_Team',
        'Team_Seed',
        'Similarity_Score',
        'Best_Comparison_Team',
        'Postseason_Finish',
    ],
)


In [17]:
# Persist both result tables; to_csv writes the DataFrame index as the first
# column by default.
Teams_by_avg_postseason.to_csv('CSV_Files/Teams_by_avg_postseason.csv')
Team_comparison.to_csv('CSV_Files/Team_comparison.csv')