In [1]:
import pandas as pd
import numpy as np

In [46]:
# Importation des 2 DataFrame
df = pd.read_csv('NBA Shot Locations 1997 - 2020.csv')
rank = pd.read_csv('ranking.csv.zip')

In [47]:
def preparation_shot_location(df,annee):
    # Récupération des 10 meilleurs joueurs
    espn = pd.read_csv('top_espn_actif.csv')
    # On restreint la DataFrame aux 10 meilleurs joueurs
    new_df = df.loc[df['Player Name'].isin(espn['player'])]
    # On met la colonne Game Date en type datetime
    new_df['Game Date']=pd.to_datetime(new_df['Game Date'],format = '%Y%m%d')
    # On restreint à partir de l'année 2010
    new_df= new_df.loc[new_df['Game Date'].dt.year >=annee]
    # On crée la variable Seconds
    new_df['Seconds'] = new_df['Minutes Remaining']*60 + new_df['Seconds Remaining']
    return new_df

In [48]:
def preparation_ranking(ranking,annee):
    # On modifie le type de la date pour correspondre avec l'autre DataFrame
    ranking['STANDINGSDATE']=pd.to_datetime(ranking['STANDINGSDATE'],format = '%Y-%m-%d')
    # On se restreint à au delà de 2010
    ranking = ranking.loc[rank['STANDINGSDATE'].dt.year >= annee]
    return ranking

In [49]:
# On crée une fonction qui prend en entrée un type d'action 'a' et retourne le taux de réussite associé.
def find_val(df_tx_reussite,a):
    return df_tx_reussite.loc[df_tx_reussite['Action Type'] == a]['Shot Made Flag'].values[0]

# On crée une fonction qui à une valeur de taux de réussite donnée associe sa note
def find_n(val,liste):
    n = 0
    for i in liste:
        if val >= i:
            n+=1
    return n

# Cette fonction crée la colonne difficulté. Pour cela, elle prend en entrée un entier n, elle parcourt toutes les lignes de 
# notre DataFrame. Pour chaque ligne elle récupère le taux de réussite de l'Action Type et lui associe sa note.
def categ(df_tx_reussite,df,n):
    quantile = list(df_tx_reussite['Shot Made Flag'].quantile(np.linspace(1/n,1,n)))
    cat = []
    for a in df['Action Type']:
        val = find_n(find_val(df_tx_reussite,a),quantile)
        cat.append(val)
    return cat

# Création de la variable shot_difficulty
def shot_difficulty(df,n):
    # On crée la DataFrame qui à chaque type de shoot associe le taux de réussite. (70 types de shoot)
    df_tx_reussite = pd.DataFrame(df.groupby('Action Type')['Shot Made Flag'].mean().reset_index())
    # On trouve les quantiles d'ordre n pour ces 70 valeurs.
    quantile = list(df_tx_reussite['Shot Made Flag'].quantile(np.linspace(1/n,1,n)))
    difficulty = categ(df_tx_reussite,df,n)
    df['Shot Difficulty'] = difficulty
    return df

In [50]:
# Cette fonction associe à chaque shot l'équipe adverse du shooter.
def adversaire(df):
    adv = []
    new_df = df[['Team Name','Home Team','Away Team']]
    for a in new_df.values:
        if a[0] == a[1]:
            adv.append(a[2])
        else:
            adv.append(a[1])
    return adv

def creation_pourcentage_adversaire(df,rank):
    # On remplace le nom des équipes de la colonne Team Name par leur code en 3 lettres
    new_df = df.replace(to_replace = ['New Orleans Hornets', 'Oklahoma City Thunder',
       'Golden State Warriors', 'Cleveland Cavaliers', 'Miami Heat',
       'Los Angeles Clippers', 'San Antonio Spurs', 'Houston Rockets',
       'Portland Trail Blazers', 'New Orleans Pelicans',
       'Milwaukee Bucks', 'LA Clippers', 'Toronto Raptors','New Orleans/Oklahoma City Hornets',
       'Los Angeles Lakers','Seattle SuperSonics'],
           value = ['NOP', 'OKC','GSW','CLE','MIA','LAC','SAS','HOU','POR','NOP','MIL','LAC','TOR','NOK','LAL','SEA'])
    # On remplace NOH par NOP car l'équipe de la Nouvelle Orléans a changé de nom. Mais dans la dataframe ranking, elle est
    # toujours appelé New Orleans tout court..
    new_df = new_df.replace(to_replace = ['NOH'],value=['NOP'])
    # On applique la fonction adversaire qui crée une colonne avec l'équipe adverse pour chaque shoot.
    new_df['Adversaire'] = adversaire(new_df)
    # On remplace les noms de la colonne TEAM de ranking pour que cela corresponde avec les valeurs de la colonne 'Adversaire'.
    new_rank=rank.replace(to_replace = ['Denver', 'Memphis', 'New Orleans', 'Phoenix', 'LA Clippers',
       'Sacramento', 'Utah', 'Portland', 'Dallas', 'Minnesota',
       'Golden State', 'Oklahoma City', 'L.A. Lakers', 'San Antonio',
       'Houston', 'Milwaukee', 'Boston', 'Cleveland', 'Brooklyn',
       'Philadelphia', 'New York', 'Atlanta', 'Indiana', 'Miami',
       'Toronto', 'Chicago', 'Orlando', 'Washington', 'Charlotte',
       'Detroit', 'L.A. Clippers','New Jersey','Seattle', 'New Orleans/Oklahoma City',], 
                          value = ['DEN','MEM','NOP','PHX','LAC','SAC','UTA','POR',
                                                          'DAL','MIN','GSW','OKC','LAL','SAS','HOU','MIL',
                                                          'BOS','CLE','BKN','PHI','NYK','ATL','IND','MIA',
                                                          'TOR','CHI','ORL','WAS','CHA','DET','LAC','NJN','SEA','NOK'])
    # On joint les 2 dataframe sur la date et l'équipe adverse par une jointure gauche pour conserver tous les shoots.
    final_df = new_df.merge(new_rank,how = 'left',right_on = ['STANDINGSDATE','TEAM'],left_on = ['Game Date','Adversaire'])
    # On applique une correction à la colonne pourcentage de victoire pour gérer les valeurs extrêmes du début de saison
    final_df['W_PCT_2'] = [a if a>0.2 else final_df['W_PCT'].mean() for a in final_df['W_PCT']]
    # On enlève la colonne avec beaucoup de NaN et l'ancienne colonne 'W_PCT'
    final_df = final_df.drop(['RETURNTOPLAY','W_PCT'],axis = 1)
    return final_df

In [51]:
def get_dummies(df,colonne):
    dummies = pd.get_dummies(df[colonne])
    new_df = pd.concat([df,dummies],axis=1)
    new_df=new_df.drop([colonne],axis=1)
    return new_df

In [52]:
def final_df(df,rank,n,annee):
    shot_loc = preparation_shot_location(df,annee)
    ranking = preparation_ranking(rank,annee)
    shot_loc_with_shot_difficulty = shot_difficulty(shot_loc,n)
    new_df = creation_pourcentage_adversaire(shot_loc_with_shot_difficulty,ranking)
    final_df = get_dummies(new_df,'Player Name')
    last_minute = [1 if a<60 else 0 for a in final_df.Seconds]
    final_df['Last Minute'] = last_minute
    dff = final_df.loc[:,['Last Minute','Shot Distance','Shot Difficulty','W_PCT_2','Shot Made Flag','Damian Lillard','LeBron James',
                         'Kevin Durant','Chris Paul','Russell Westbrook','James Harden','Anthony Davis','Giannis Antetokounmpo',
                         'Kawhi Leonard','Stephen Curry']]
    return dff

In [53]:
def selection_colonne(df,columns):
    return df.loc[:,columns]

In [54]:
all_columns = ['Last Minute','Shot Distance','Shot Difficulty','W_PCT_2','Shot Made Flag','Damian Lillard','LeBron James',
                         'Kevin Durant','Chris Paul','Russell Westbrook','James Harden','Anthony Davis','Giannis Antetokounmpo',
                         'Kawhi Leonard','Stephen Curry']
columns_w_pct = ['Last Minute','Shot Distance','Shot Difficulty','Shot Made Flag','Damian Lillard','LeBron James',
                         'Kevin Durant','Chris Paul','Russell Westbrook','James Harden','Anthony Davis','Giannis Antetokounmpo',
                         'Kawhi Leonard','Stephen Curry']
columns_shot_diff =['Last Minute','Shot Distance','W_PCT_2','Shot Made Flag','Damian Lillard','LeBron James',
                         'Kevin Durant','Chris Paul','Russell Westbrook','James Harden','Anthony Davis','Giannis Antetokounmpo',
                         'Kawhi Leonard','Stephen Curry']
columns_distance = ['Last Minute','Shot Difficulty','W_PCT_2','Shot Made Flag','Damian Lillard','LeBron James',
                         'Kevin Durant','Chris Paul','Russell Westbrook','James Harden','Anthony Davis','Giannis Antetokounmpo',
                         'Kawhi Leonard','Stephen Curry']
columns_last_minute = ['Shot Distance','Shot Difficulty','W_PCT_2','Shot Made Flag','Damian Lillard','LeBron James',
                         'Kevin Durant','Chris Paul','Russell Westbrook','James Harden','Anthony Davis','Giannis Antetokounmpo',
                         'Kawhi Leonard','Stephen Curry']
only_dist = ['Shot Distance','Shot Made Flag','Damian Lillard','LeBron James',
                         'Kevin Durant','Chris Paul','Russell Westbrook','James Harden','Anthony Davis','Giannis Antetokounmpo',
                         'Kawhi Leonard','Stephen Curry']
only_dist_diff = ['Shot Distance','Shot Difficulty','Shot Made Flag','Damian Lillard','LeBron James',
                         'Kevin Durant','Chris Paul','Russell Westbrook','James Harden','Anthony Davis','Giannis Antetokounmpo',
                         'Kawhi Leonard','Stephen Curry']
only_diff = ['Shot Difficulty','Shot Made Flag','Damian Lillard','LeBron James',
                         'Kevin Durant','Chris Paul','Russell Westbrook','James Harden','Anthony Davis','Giannis Antetokounmpo',
                         'Kawhi Leonard','Stephen Curry']
without_players = ['Last Minute','Shot Distance','Shot Difficulty','W_PCT_2','Shot Made Flag']

In [55]:
test=final_df(df,rank,20,2010)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Game Date']=pd.to_datetime(new_df['Game Date'],format = '%Y%m%d')


In [56]:
test.head()

Unnamed: 0,Last Minute,Shot Distance,Shot Difficulty,W_PCT_2,Shot Made Flag,Damian Lillard,LeBron James,Kevin Durant,Chris Paul,Russell Westbrook,James Harden,Anthony Davis,Giannis Antetokounmpo,Kawhi Leonard,Stephen Curry
0,0,0,1,0.588,1,0,0,0,1,0,0,0,0,0,0
1,0,10,9,0.588,1,0,0,0,1,0,0,0,0,0,0
2,0,7,6,0.588,0,0,0,0,1,0,0,0,0,0,0
3,0,12,2,0.588,1,0,0,0,1,0,0,0,0,0,0
4,0,7,0,0.588,0,0,0,0,1,0,0,0,0,0,0
