In [5]:
import pandas as pd
from scipy.spatial.distance import euclidean, pdist, squareform

In [6]:
# Load the three input tables (paths are relative to the working directory).
# For the two frames read with index_col = 0, the first CSV column becomes the index.
# Going by the frames displayed further down: s16_team_avg has one row of averaged
# stats per POSTSEASON value, s16_teams has one row per team and YEAR.
cbb21 = pd.read_csv('cbb21.csv')
s16_team_avg = pd.read_csv('avg_by_round.csv', index_col = 0)
s16_teams = pd.read_csv('cbb_s16.csv', index_col = 0)

## Working to get all 3 DataFrames to use the same column names; they will differ slightly in which columns they contain, but that's fine

#### First, I am cutting out the unnecessary columns in some of the DataFrames

In [7]:
# Limit cbb21 to the teams that made the tournament: keep rows whose SEED is positive.
# Rows with a missing SEED are dropped as well, because NaN > 0 evaluates to False.

cbb21 = cbb21[cbb21.SEED > 0]

In [8]:
# Remove the columns that are not needed for the comparison.
# Note the naming: cbb_21 (with underscore) is the trimmed copy of cbb21, and it
# is the frame the later cells shown here work with.

cbb_21 = cbb21.drop(columns = ['CONF', 'G', 'W'])

# s16_teams is overwritten with its trimmed version, so re-running this cell
# raises a KeyError (the columns are already gone); reload the CSV first.
s16_teams = s16_teams.drop(columns = ['CONF', 'G', 'W'])

#### Now I am changing the column names back to the originals, for clarity and so that the 3 DataFrames match

In [9]:
# renaming the columns to match for all teams

def renaming_columns(df):
    """Replace the column labels of ``df`` with short upper-case stat names.

    Every label except ``'POSTSEASON'`` is expected to carry a 4-character
    prefix and a 1-character suffix around the stat name (presumably labels
    shaped like ``'avg(efg_o)'`` -- confirm against the CSV). The mapping is:

    * ``'POSTSEASON'`` is kept unchanged.
    * stat starting with ``'two'`` -> ``'2P_'`` + second-to-last character.
    * stat starting with ``'thr'`` -> ``'3P_'`` + second-to-last character.
    * label with ``'_'`` third from the end -> first three stat characters,
      ``'_'``, then the character at position 8.
    * anything else -> the label without prefix and suffix.

    The frame is modified in place and also returned. Labels shorter than
    three characters raise ``IndexError``.
    """

    def _short_name(label):
        # The label that is already in its final form.
        if label == 'POSTSEASON':
            return label

        stat = label[4:7]
        if stat == 'two':
            return '2P' + '_' + label[-2:-1].upper()
        if stat == 'thr':
            return '3P' + '_' + label[-2:-1].upper()
        if label[-3] == '_':
            return stat.upper() + '_' + label[8].upper()
        return label[4:-1].upper()

    # Build the full list first so a failure leaves df.columns untouched.
    df.columns = [_short_name(label) for label in df.columns]
    return df

In [10]:
# renaming_columns edits s16_team_avg in place; the returned frame is only shown here.
# Run this cell once per load: applied to already-renamed labels, the slicing
# runs again and produces wrong (partly empty) names.
renaming_columns(s16_team_avg)

Unnamed: 0,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,ORB,DRB,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON
0,108.716964,98.535714,0.724843,51.653571,48.164732,17.849107,18.860268,31.010714,28.989286,37.592857,34.090625,50.590179,47.035714,35.729911,33.515625,67.713393,-0.821429,1
1,112.825893,95.05625,0.862393,52.138393,47.260714,17.228571,19.289286,32.180357,28.955357,36.533929,33.565179,51.258929,46.113393,35.81875,32.932143,67.333929,3.038393,2
2,115.514286,93.75,0.908811,52.621429,47.2,17.232143,18.996429,33.017857,28.721429,36.342857,31.828571,51.617857,45.908929,36.344643,33.139286,67.560714,4.589286,3
3,117.742857,92.810714,0.93,53.685714,46.710714,16.814286,19.128571,33.15,28.814286,37.325,31.810714,53.085714,45.435714,36.560714,32.725,67.414286,6.575,4
4,116.371429,91.957143,0.929971,52.935714,45.964286,17.514286,19.771429,32.95,29.885714,35.835714,31.678571,51.192857,45.171429,37.114286,31.728571,66.664286,6.1,5
5,119.785714,91.471429,0.952457,53.7,45.942857,15.485714,18.314286,32.885714,27.514286,35.9,29.4,53.7,44.357143,35.742857,32.914286,66.657143,7.371429,6
6,121.3,90.4,0.963671,54.457143,46.271429,16.3,20.014286,33.414286,29.028571,35.0,29.871429,53.528571,45.285714,37.428571,32.085714,66.871429,9.057143,7


In [11]:
# Assign the 21 final labels by position. s16_teams must have exactly 21 columns
# in this order; pandas raises a ValueError if the count does not match.
s16_teams.columns = ['TEAM', 'ADJOE', 'ADJDE', 'BARTHAG', 'EFG_O', 'EFG_D',
                   'TOR', 'TORD', 'ORB', 'DRB', 'FTR', 'FTRD', '2P_O', '2P_D',
                   '3P_O', '3P_D', 'ADJ_T', 'WAB', 'POSTSEASON', 'SEED', 'YEAR']

### Checking to make sure everything is equivalent

In [12]:
# Inspect s16_teams after the positional rename (TEAM, the stat columns, POSTSEASON, SEED, YEAR).
s16_teams

Unnamed: 0,TEAM,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,ORB,DRB,...,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON,SEED,YEAR
0,North Carolina,123.3,94.9,0.9531,52.6,48.1,15.4,18.2,40.7,30.0,...,30.4,53.9,44.6,32.7,36.2,71.7,8.6,6,1.0,2016
1,Wisconsin,129.1,93.6,0.9758,54.8,47.7,12.4,15.8,32.1,23.7,...,22.4,54.8,44.7,36.5,37.5,59.3,11.3,6,1.0,2015
2,Michigan,114.4,90.4,0.9375,53.9,47.7,14.0,19.5,25.5,24.9,...,30.0,54.7,46.8,35.2,33.2,65.9,6.9,6,3.0,2018
3,Texas Tech,115.2,85.2,0.9696,53.5,43.0,17.7,22.8,27.4,28.7,...,36.6,52.8,41.9,36.5,29.7,67.5,7.0,6,3.0,2019
4,Gonzaga,117.8,86.3,0.9728,56.6,41.1,16.2,17.1,30.0,26.2,...,26.9,56.3,40.0,38.2,29.0,71.5,7.7,6,1.0,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,Michigan St.,111.4,87.8,0.9392,50.6,44.5,20.8,19.2,36.1,27.6,...,32.4,50.4,44.3,34.1,30.1,64.4,6.7,3,3.0,2013
108,Arizona,114.4,92.2,0.9229,52.5,46.6,19.5,19.8,35.0,26.7,...,32.9,50.6,43.4,37.1,35.8,66.8,4.6,3,6.0,2013
109,Oregon,104.8,88.6,0.8728,49.3,46.4,21.4,22.0,35.8,27.2,...,33.3,49.1,44.9,33.3,33.4,69.2,2.9,3,12.0,2013
110,La Salle,112.0,96.2,0.8516,51.9,49.3,17.1,21.3,29.0,34.2,...,28.5,49.3,50.6,37.7,30.2,66.0,0.3,3,13.0,2013


In [13]:
# Inspect cbb_21: same stat columns plus TEAM and SEED, but no POSTSEASON or YEAR.
cbb_21

Unnamed: 0,TEAM,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,ORB,DRB,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,SEED
0,Michigan,118.1,91.1,0.9521,54.9,44.9,16.3,15.1,29.4,24.8,28.9,24.5,53.3,42.3,38.7,33.5,66.9,7.2,1.0
1,Baylor,123.2,94.5,0.9548,57.5,49.1,17.6,24.6,37.5,30.9,27.0,31.7,54.1,48.1,41.8,34.0,68.8,6.6,1.0
2,Illinois,117.7,90.4,0.9539,55.6,46.6,18.2,16.1,33.0,22.2,39.2,30.5,55.3,45.4,37.6,32.7,70.7,8.9,1.0
3,Gonzaga,125.4,89.8,0.9791,61.0,47.5,16.1,20.3,30.4,23.4,36.7,25.9,64.0,46.8,36.5,32.5,74.6,8.5,1.0
4,Iowa,123.5,95.7,0.9491,54.6,48.3,13.3,16.3,30.7,28.6,32.0,22.6,52.4,45.8,38.6,34.8,70.0,5.6,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,Mount St. Mary's,95.7,100.6,0.3593,48.7,45.5,20.1,16.3,31.7,24.6,29.7,24.6,47.7,45.6,33.7,30.1,62.2,-7.2,16.0
64,Hartford,97.3,99.6,0.4345,50.4,47.6,19.0,20.4,24.8,28.6,28.6,25.0,51.3,50.7,32.7,28.7,66.9,-4.9,16.0
65,Norfolk St.,99.4,104.1,0.3698,49.7,47.4,17.8,20.7,28.8,28.7,38.6,42.0,46.1,47.2,37.4,31.7,67.9,-5.0,16.0
66,Texas Southern,97.3,103.5,0.3307,48.3,46.5,21.4,19.2,32.9,25.7,38.9,33.8,51.0,45.2,27.7,32.4,71.8,-4.5,16.0


In [14]:
# Inspect the renamed averages: one row per POSTSEASON value, no TEAM or SEED column.
s16_team_avg

Unnamed: 0,ADJOE,ADJDE,BARTHAG,EFG_O,EFG_D,TOR,TORD,ORB,DRB,FTR,FTRD,2P_O,2P_D,3P_O,3P_D,ADJ_T,WAB,POSTSEASON
0,108.716964,98.535714,0.724843,51.653571,48.164732,17.849107,18.860268,31.010714,28.989286,37.592857,34.090625,50.590179,47.035714,35.729911,33.515625,67.713393,-0.821429,1
1,112.825893,95.05625,0.862393,52.138393,47.260714,17.228571,19.289286,32.180357,28.955357,36.533929,33.565179,51.258929,46.113393,35.81875,32.932143,67.333929,3.038393,2
2,115.514286,93.75,0.908811,52.621429,47.2,17.232143,18.996429,33.017857,28.721429,36.342857,31.828571,51.617857,45.908929,36.344643,33.139286,67.560714,4.589286,3
3,117.742857,92.810714,0.93,53.685714,46.710714,16.814286,19.128571,33.15,28.814286,37.325,31.810714,53.085714,45.435714,36.560714,32.725,67.414286,6.575,4
4,116.371429,91.957143,0.929971,52.935714,45.964286,17.514286,19.771429,32.95,29.885714,35.835714,31.678571,51.192857,45.171429,37.114286,31.728571,66.664286,6.1,5
5,119.785714,91.471429,0.952457,53.7,45.942857,15.485714,18.314286,32.885714,27.514286,35.9,29.4,53.7,44.357143,35.742857,32.914286,66.657143,7.371429,6
6,121.3,90.4,0.963671,54.457143,46.271429,16.3,20.014286,33.414286,29.028571,35.0,29.871429,53.528571,45.285714,37.428571,32.085714,66.871429,9.057143,7


In [20]:
# Try the similarity on two rows of the per-round averages.
# Whole rows are passed, so BARTHAG and the POSTSEASON label itself are part of
# the distance here (full_similarity drops both before comparing).
# similarity_func is defined in a later cell (In [15]); on a fresh kernel that
# cell has to run before this one.
similarity_func(s16_team_avg.iloc[2], s16_team_avg.iloc[1])

0.19512974462303373

### Trying to get a similarity function

In [15]:
def similarity_func(u, v):
    """Turn the Euclidean distance between ``u`` and ``v`` into a similarity.

    The score is ``1 / (1 + d)`` where ``d`` is the Euclidean distance:
    identical vectors score 1 and the score shrinks toward 0 as ``d`` grows.
    """
    distance = euclidean(u, v)
    return 1 / (1 + distance)

In [27]:
def full_similarity(df1, df2):
    """Find, for every team, the postseason profile it resembles most.

    Parameters
    ----------
    df1 : pandas.DataFrame
        One row per team. Must contain ``TEAM``, ``SEED`` and ``BARTHAG``;
        every other column is used as a numeric feature.
    df2 : pandas.DataFrame
        One row per reference profile. Must contain ``POSTSEASON`` and
        ``BARTHAG``; every other column is used as a numeric feature.

    Returns
    -------
    list of str
        One ``'<team>, <similarity>, <postseason>'`` string per row of
        ``df1``, holding the highest ``similarity_func`` score and the
        ``POSTSEASON`` value of the profile that produced it. If no profile
        scores above 0 (``df2`` is empty, or the scores are NaN) the entry is
        ``'<team>, 0, None'``.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing.

    Notes
    -----
    Features are compared by position, so after the drops both frames must
    hold the same feature columns in the same order.
    """
    # Extract the feature matrices once instead of re-filtering both frames for
    # every (team, profile) pair. Iterating a 2-D array yields 1-D rows, which is
    # what scipy's vector distances expect; newer SciPy versions raise a
    # ValueError for the 2-D one-row DataFrame slices used previously.
    team_vectors = df1.drop(columns=['TEAM', 'SEED', 'BARTHAG']).to_numpy(dtype=float)
    profile_vectors = df2.drop(columns=['POSTSEASON', 'BARTHAG']).to_numpy(dtype=float)
    postseason_values = df2['POSTSEASON'].tolist()

    final_results = []

    # Walking rows (rather than looking teams up by name) also keeps duplicate
    # TEAM or POSTSEASON values from producing multi-row selections.
    for team, team_vector in zip(df1['TEAM'], team_vectors):
        # Reset the winner for every team so a team without any valid
        # comparison cannot inherit the previous team's result.
        max_similarity = 0
        best_postseason = None

        for postseason, profile_vector in zip(postseason_values, profile_vectors):
            similarity = similarity_func(team_vector, profile_vector)
            if similarity > max_similarity:
                max_similarity = similarity
                best_postseason = postseason

        final_results.append(team + ', ' + str(max_similarity) + ', ' + str(best_postseason))

    return final_results
        


In [29]:
# For each team in cbb_21, report the closest row of s16_team_avg as
# 'team, similarity, POSTSEASON'.
full_similarity(cbb_21, s16_team_avg)

['Michigan, 0.08283378607558742, 6',
 'Baylor, 0.06785475116317397, 7',
 'Illinois, 0.09633694903854474, 6',
 'Gonzaga, 0.055102131623696364, 7',
 'Iowa, 0.07690155097191298, 6',
 'Ohio St., 0.094993664438396, 4',
 'Houston, 0.05960009389302947, 5',
 'Alabama, 0.0707540179364538, 5',
 'West Virginia, 0.08129172152162888, 3',
 'Texas, 0.1325309859071759, 2',
 'Kansas, 0.07482092307404015, 2',
 'Arkansas, 0.09026553399604463, 3',
 'Florida St., 0.09718434887716636, 3',
 'Virginia, 0.044455464208523296, 6',
 'Purdue, 0.09589968257833985, 3',
 'Oklahoma St., 0.10188672180569473, 2',
 'Villanova, 0.06464090420486833, 3',
 'Tennessee, 0.09576003478869904, 2',
 'Creighton, 0.061218839112477136, 6',
 'Colorado, 0.09890522366265998, 3',
 'Texas Tech, 0.10258632432644652, 2',
 'BYU, 0.07212910650829751, 3',
 'USC, 0.09058488032777308, 3',
 'San Diego St., 0.10122998704738025, 2',
 'Florida, 0.11934752489302801, 2',
 'Connecticut, 0.08551388458602512, 2',
 'Clemson, 0.05373690422074132, 2',
 'Ore

In [30]:
def full_similarity2(df1, df2):
    """Score every team against every postseason profile.

    Parameters
    ----------
    df1 : pandas.DataFrame
        One row per team. Must contain ``TEAM``, ``SEED`` and ``BARTHAG``;
        every other column is used as a numeric feature.
    df2 : pandas.DataFrame
        One row per reference profile. Must contain ``POSTSEASON`` and
        ``BARTHAG``; every other column is used as a numeric feature.

    Returns
    -------
    list of str
        One ``'<team>, <similarity>, <postseason>'`` string for every
        (team, profile) pair, ordered by team and then by profile row, i.e.
        ``len(df1) * len(df2)`` entries.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing.

    Notes
    -----
    Features are compared by position, so after the drops both frames must
    hold the same feature columns in the same order.
    """
    # Extract the feature matrices once. Iterating a 2-D array yields 1-D rows,
    # which is what scipy's vector distances expect; newer SciPy versions raise a
    # ValueError for the 2-D one-row DataFrame slices used previously.
    team_vectors = df1.drop(columns=['TEAM', 'SEED', 'BARTHAG']).to_numpy(dtype=float)
    profile_vectors = df2.drop(columns=['POSTSEASON', 'BARTHAG']).to_numpy(dtype=float)
    postseason_values = df2['POSTSEASON'].tolist()

    final_results = []

    for team, team_vector in zip(df1['TEAM'], team_vectors):
        for postseason, profile_vector in zip(postseason_values, profile_vectors):
            similarity = similarity_func(team_vector, profile_vector)
            # Every pair is reported, so no running maximum is tracked here.
            final_results.append(team + ', ' + str(similarity) + ', ' + str(postseason))

    return final_results

In [31]:
# List the similarity of every cbb_21 team to every row of s16_team_avg
# (one 'team, similarity, POSTSEASON' string per pair).
full_similarity2(cbb_21, s16_team_avg)

['Michigan, 0.043987484580258196, 1',
 'Michigan, 0.05588801284388184, 2',
 'Michigan, 0.0650472042346112, 3',
 'Michigan, 0.0668874333163471, 4',
 'Michigan, 0.0676875393641978, 5',
 'Michigan, 0.08283378607558742, 6',
 'Michigan, 0.07478729356911595, 7',
 'Baylor, 0.04038615747763096, 1',
 'Baylor, 0.050156915661401784, 2',
 'Baylor, 0.056605274629342596, 3',
 'Baylor, 0.05921019611120142, 4',
 'Baylor, 0.05884088794262393, 5',
 'Baylor, 0.058008958580333725, 6',
 'Baylor, 0.06785475116317397, 7',
 'Illinois, 0.0493788669658046, 1',
 'Illinois, 0.06642409318252684, 2',
 'Illinois, 0.08033919559193115, 3',
 'Illinois, 0.09451904535316781, 4',
 'Illinois, 0.07801930406142199, 5',
 'Illinois, 0.09633694903854474, 6',
 'Illinois, 0.08483159867654012, 7',
 'Gonzaga, 0.03292199572600831, 1',
 'Gonzaga, 0.03905154987971083, 2',
 'Gonzaga, 0.043806926001190065, 3',
 'Gonzaga, 0.04927416996966458, 4',
 'Gonzaga, 0.04387341645254749, 5',
 'Gonzaga, 0.05394916269067185, 6',
 'Gonzaga, 0.0551021