## Read Dataset

In [82]:
import pandas as pd

df = pd.read_csv('basketball_data.csv')

### a) Of the players who never had a year with zero rebounds, which one scored the smallest number of total points across the five years.

In [86]:
### get players with no rebouds
players_no_rebound = df[df['Rebounds'] > 0]
#### group the new dataset by player
grouped_df = players_no_rebound.groupby(['Player']).agg({'Points Scored':'sum','Rebounds':'sum'})
grouped_df = grouped_df.sort_values(by=['Points Scored'])
print(grouped_df)
print('\nPlayer with lowest point: {}'.format(grouped_df.iloc[0]))

                    Points Scored  Rebounds
Player                                     
John Stockton                   3         1
Clyde Drexler                  15         7
Larry Bird                     39         8
Magic Johnson                  42         7
Scottie Pippen                 96        19
Chris Mullin                  104        34
Patrick Ewing                 109        46
Michael Jordan                133        53
Charles Barkley               165        36
David Robinson                171        60
Christian Laettner            223        47
Karl Malone                   303       184

Player with lowest point: Points Scored    3
Rebounds         1
Name: John Stockton, dtype: int64


### b)   Are there any players who scored more points in each successive season? I.e., scored more points in 1993 than they did in 1992 and more points in 1994 than they did in 1993, etc. If not, are there any players who kept up such a streak for any group of four consecutive seasons?

In [135]:
### get a list of players
players = list((df.groupby(['Player']).agg({'Points Scored':'sum','Rebounds':'sum'})).index)

# get streaks
def get_streak(x):
    data = df[df['Player'] == x]
    data = data.sort_values(by=['Year'])

    count = -1
    prev = 0
    streak = 0
    for index, row in data.iterrows():   
        points = row['Points Scored']
        if points > prev:
            count += 1
        else:
            if count > streak:
                streak = count
            count = 0
        prev = points
    if count > streak:
        streak = count
    return streak

df['Streak'] = df.Player.apply(get_streak)
data = df.groupby(['Player', 'Streak']).agg({'Points Scored':'sum','Rebounds':'sum'})
data = data.reset_index()
data = data.sort_values(by=['Streak'], ascending=False)
print(data)  

                Player  Streak  Points Scored  Rebounds
11      Scottie Pippen       3             98        19
2   Christian Laettner       2            227        47
4       David Robinson       2            187        60
6          Karl Malone       2            303       184
10       Patrick Ewing       2            144        46
0      Charles Barkley       1            165        36
1         Chris Mullin       1            104        34
3        Clyde Drexler       1             44         7
5        John Stockton       1             10         1
7           Larry Bird       1             76         8
8        Magic Johnson       1             53         7
9       Michael Jordan       1            190        53


### c)   If we consider each player's lowest number of rebounds, which player had the largest such minimum?

In [62]:
data = df.groupby(['Player']).agg({'Rebounds':'sum'})
data = data.sort_values(by=['Rebounds'])
print(data)

print('\nPlayer with lowest Rebounds Count: {}'.format(data.iloc[0]))

                    Rebounds
Player                      
John Stockton              1
Clyde Drexler              7
Magic Johnson              7
Larry Bird                 8
Scottie Pippen            19
Chris Mullin              34
Charles Barkley           36
Patrick Ewing             46
Christian Laettner        47
Michael Jordan            53
David Robinson            60
Karl Malone              184

Player with lowest Rebounds Count: Rebounds    1
Name: John Stockton, dtype: int64


### d) Of the players who have a double-letter in their name (two consecutive letters that are the same), which one scored the least total points. (Must use a regular expression to locate players with two consecutive letters in any part of their name.)

In [72]:
import re
def has_double(x):
    r'((\w)\2)+'
    m = [m.group() for m in re.finditer(r'((\w)\2)+', x)]
    return len(m) > 0

df['has double'] = df.Player.apply(has_double)
data = df[df['has double'] == True]
data = data.groupby(['Player']).agg({'Points Scored':sum})
data = data.sort_values(by=['Points Scored'])
print(data)
print('\n Player with least points: {}'.format(data.iloc[0]))

                    Points Scored
Player                           
Larry Bird                     76
Scottie Pippen                 98
Chris Mullin                  104
Christian Laettner            227

 Player with least points: Points Scored    76
Name: Larry Bird, dtype: int64


### e)   Calculate each player's combined FT% across the five years. Note that the FT% from individual years cannot be combined directly, since they are based on different amounts of attempts.

In [76]:
data = df.groupby(['Player']).agg({'FTM':'sum', 'FTA':'sum'})
percentages = []
for index, row in data.iterrows():
    fta = row['FTA']
    ftm = row['FTM']
    percentages.append(ftm/fta)
data['FT%'] = percentages
print(data)

Unnamed: 0_level_0,FTM,FTA,FT%
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Charles Barkley,73,108,0.675926
Chris Mullin,49,65,0.753846
Christian Laettner,81,97,0.835052
Clyde Drexler,16,46,0.347826
David Robinson,100,139,0.719424
John Stockton,6,15,0.4
Karl Malone,117,160,0.73125
Larry Bird,34,45,0.755556
Magic Johnson,28,38,0.736842
Michael Jordan,62,94,0.659574


### f)    Create new column called PMR, which stands for "Points minus rebounds." Sum the PMR for each player across all five years and sort the dataframe. Display at least the top five players according to this measure.

In [81]:
data = df.groupby(['Player']).agg({'Points Scored':'sum', 'Rebounds':'sum'})
pmr = []
for index, row in data.iterrows():
    points = row['Points Scored']
    rebounds = row['Rebounds']
    pmr.append(points-rebounds)

data['PMR'] = pmr
data = data.sort_values(by=['PMR'], ascending=False)
print(data.head())

                    Points Scored  Rebounds  PMR
Player                                          
Christian Laettner            227        47  180
Michael Jordan                190        53  137
Charles Barkley               165        36  129
David Robinson                187        60  127
Karl Malone                   303       184  119
