In [1]:
import requests
import pandas as pd
import os
import json
from bs4 import BeautifulSoup
import re
#from lxml.html as lh


In [2]:
def getPlayers():
    players = pd.read_csv("../DATA/nba-players-stats/players.csv")
    df_players = pd.DataFrame(players)
    df_players.drop(columns=["Unnamed: 0","birth_city","birth_state",],inplace=True)
    df_players = df_players.rename(columns={"height": "Height (m)","weight": "Weight (kg)",
                                            "collage": "College", "born" :"Born"})
    return df_players

In [3]:
df_players = getPlayers()

In [4]:
df_players.head()

Unnamed: 0,Player,Height (m),Weight (kg),College,Born
0,Curly Armstrong,180.0,77.0,Indiana University,1918.0
1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0
2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0
3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0
4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0


In [5]:
def get_player_data():
    df_player_data = pd.read_csv("../DATA/nba-players-stats/players.csv")
    
    df_player_data.drop(columns=["Unnamed: 0","height","weight"],inplace=True)
    df_player_data = df_player_data.rename(columns={"name": "Player","year_start": "NBA_Rookie",
                                                    "year_end": "NBA_Retired","birth_date": "Birth Date",
                                                    "position": "Position", "college":"College"})
    return df_player_data

In [6]:
df_player_data = get_player_data()

In [7]:
df_player_data.head()

Unnamed: 0,Player,collage,born,birth_city,birth_state
0,Curly Armstrong,Indiana University,1918.0,,
1,Cliff Barker,University of Kentucky,1921.0,Yorktown,Indiana
2,Leo Barnhorst,University of Notre Dame,1924.0,,
3,Ed Bartels,North Carolina State University,1925.0,,
4,Ralph Beard,University of Kentucky,1927.0,Hardinsburg,Kentucky


In [8]:
def get_season_stats():
    df_season_stats = pd.read_csv("../DATA/nba-players-stats/Seasons_Stats.csv")

    df_season_stats.drop(columns=["Unnamed: 0","blanl","blank2","Age","Year",],inplace=True)
    df_season_stats = df_season_stats[df_season_stats.Player.notnull()]
    df_season_stats = df_season_stats.dropna(thresh=50)
    return df_season_stats


In [9]:
df_season_stats = get_season_stats()

In [10]:
df_season_stats.head()

Unnamed: 0,Player,Pos,Tm,G,GS,MP,PER,TS%,3PAr,FTr,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS


In [11]:
def merging_dfs(df_players,df_player_data,df_season_stats):
    df_players_full = pd.merge(df_players,df_player_data ,how='outer', on='Player')
    complete_df = pd.merge(df_players_full,df_season_stats ,how='outer', on='Player')

    complete_df.drop_duplicates(subset ="Player", keep = "first", inplace = True)
    complete_df.drop(columns=["G","TS%","FTr","OWS","DWS","WS","FG","FGA","FG%","2P",
                              "2PA","2P%","eFG%","FT","FTA","AST","PF","PTS",
                              "FT%","GS","3PAr","TOV%","USG%","3P","3P%","3PA",
                              "TOV","BLK","STL","TRB","DRB","ORB","VORP","BPM","DBPM","OBPM",
                              "WS/48","BLK%","STL%","AST%","TRB%","DRB%","ORB%","MP","PER"],inplace=True)
    complete_df = complete_df.rename(columns={"Tm": "Team"})
    return complete_df

In [12]:
merged_df = merging_dfs(df_players, df_player_data, df_season_stats)

In [13]:
merged_df.head()

Unnamed: 0,Player,Height (m),Weight (kg),College,Born,collage,born,birth_city,birth_state,Pos,Team
0,Curly Armstrong,180.0,77.0,Indiana University,1918.0,Indiana University,1918.0,,,,
1,Cliff Barker,188.0,83.0,University of Kentucky,1921.0,University of Kentucky,1921.0,Yorktown,Indiana,,
2,Leo Barnhorst,193.0,86.0,University of Notre Dame,1924.0,University of Notre Dame,1924.0,,,,
3,Ed Bartels,196.0,88.0,North Carolina State University,1925.0,North Carolina State University,1925.0,,,,
4,Ralph Beard,178.0,79.0,University of Kentucky,1927.0,University of Kentucky,1927.0,Hardinsburg,Kentucky,,


In [14]:
URL = "https://ceoworld.biz/2019/03/05/these-are-50-highest-points-scorer-in-nba-history-1946-to-2019//"

#class Best_50: 

"""def __init__(self, url = URL): 
    self.url_nba_players = url"""

def scan(url): 
    results = requests.get(URL).text
    soup = BeautifulSoup(results, "html.parser")
    col_name = [element.text for element in soup.find_all("th")]
    player_info = [element.text for element in soup.find_all("td")]

    player_info_lst = []
    for ind in range(0, len(player_info), 11): 
        player_info_lst.append(player_info[ind: (ind+11)])


    player_df = pd.DataFrame(player_info_lst)
    col_df = pd.DataFrame(col_name)
    return (player_df, col_df)


In [15]:
players, columns = scan(URL)

In [16]:
print(players.head())
print(columns.head())

   0                    1      2   3     4      5      6     7     8     9  \
0  1  Kareem Abdul-Jabbar  38387  20  1560  15837  28307     1    18  6712   
1  2          Karl Malone  36928  19  1476  13528  26210    85   310  9787   
2  3          Kobe Bryant  33643  20  1346  11719  26200  1827  5546  8378   
3  4       Michael Jordan  32292  15  1072  12192  24537   581  1778  7327   
4  5         LeBron James  32280  16  1189  11735  23275  1712  4975  7098   

      10  
0   9304  
1  13188  
2  10011  
3   8772  
4   9636  
         0
0     Rank
1   Player
2   Points
3  Seasons
4    Games


In [17]:
def wrangle_webscraping(df1,df2):

    df2 = df2.transpose()
    df2 = df2.rename(columns=df2.iloc[0])
    df1 = df1.rename(columns={0: "Rank", 1: "Player",2: "Points",
                              3: "Seasons",4: "Games",5: "FGM",6: "FGA",
                              7: "3PM",8: "3PA",9: "FTM",10: "FTA",})
    return (df1, df2)


In [18]:
players, columns = wrangle_webscraping(players, columns)

In [19]:
print(players.head())
print(columns.head())

  Rank               Player Points Seasons Games    FGM    FGA   3PM   3PA  \
0    1  Kareem Abdul-Jabbar  38387      20  1560  15837  28307     1    18   
1    2          Karl Malone  36928      19  1476  13528  26210    85   310   
2    3          Kobe Bryant  33643      20  1346  11719  26200  1827  5546   
3    4       Michael Jordan  32292      15  1072  12192  24537   581  1778   
4    5         LeBron James  32280      16  1189  11735  23275  1712  4975   

    FTM    FTA  
0  6712   9304  
1  9787  13188  
2  8378  10011  
3  7327   8772  
4  7098   9636  
   Rank  Player  Points  Seasons  Games  FGM  FGA  3PM  3PA  FTM  FTA
0  Rank  Player  Points  Seasons  Games  FGM  FGA  3PM  3PA  FTM  FTA


In [20]:
def merge_webscraping(df_1, df_2):
    frames = [df_2, df_1]
    player_finalstats_df = pd.concat(frames)                        
    return (player_finalstats_df)


In [21]:
player_finalstats = merge_webscraping(players, columns)

In [38]:
len(player_finalstats)

51

In [23]:
def wrangle_Best_50(player_finalstats_df):

    player_finalstats_df = player_finalstats_df[player_finalstats_df.Rank != "Rank"]
    top_player_lst = player_finalstats_df["Player"].tolist()
    top_player_df = merged_df[merged_df['Player'].isin(top_player_lst)]
    return top_player_df

In [57]:
top_player = wrangle_Best_50(player_finalstats)

In [59]:
top_player.head()

Unnamed: 0,Player,Height (m),Weight (kg),College,Born,collage,born,birth_city,birth_state,Pos,Team
1245,Walter Davis,203.0,92.0,Texas A&M University,1931.0,Texas A&M University,1931.0,Beaumont,Texas,,
1481,Tom Chambers,208.0,99.0,University of Utah,1959.0,University of Utah,1959.0,Ogden,Utah,,
1538,Terry Cummings,206.0,99.0,DePaul University,1961.0,DePaul University,1961.0,Chicago,Illinois,,
2030,Clifford Robinson,206.0,99.0,University of Southern California,1960.0,University of Southern California,1960.0,Oakland,California,,
2401,Kevin Garnett,211.0,108.0,,1976.0,,1976.0,Mauldin,South Carolina,,


In [60]:
def final_NBA50_df(player_finalstats_df, top_player_df):
    master_player_df = pd.merge(top_player_df, player_finalstats_df ,how='left', on='Player')
    master_player_df = master_player_df[['Rank','Player','Points','FGM','FGA','3PM','3PA','FTM',
                                         'FTA','Games','Seasons','Team',
                                         "Height (m)",'Weight (kg)',]]
    master_player_df.set_index('Player')

    master_player_df = master_player_df.replace(to_replace = "-", value = 0)
    master_player_df[["Rank", "Points","FGM","FGA","FTM","FTA","3PM","3PA","Games","Seasons"]] = master_player_df[["Rank", "Points","FGM","FGA","FTM","FTA","3PM","3PA","Games","Seasons"]].apply(pd.to_numeric)
    return master_player_df

In [61]:
master_df = final_NBA50_df(player_finalstats, top_player)

In [62]:
master_df.head()

Unnamed: 0,Rank,Player,Points,FGM,FGA,3PM,3PA,FTM,FTA,Games,Seasons,Team,Height (m),Weight (kg)
0,48,Walter Davis,19521,8118,15871,157,577,3128,3676,1033,15,,203.0,92.0
1,43,Tom Chambers,20049,7378,15749,227,740,5066,6274,1107,16,,208.0,99.0
2,49,Terry Cummings,19460,8045,16628,44,149,3326,4711,1183,18,,206.0,99.0
3,47,Clifford Robinson,19591,7389,16875,1253,3515,3560,5165,1380,18,,206.0,99.0
4,17,Kevin Garnett,26071,10505,21142,174,632,4887,6190,1462,21,,211.0,108.0


In [31]:
def get_player_info():
    print(master_df["Player"])
    print("Enter Player Name:")
    x = input()
    return master_df[master_df["Player"] == x]

In [35]:
print(len(master_df["Player"]))

21


In [36]:
get_player_info()

0          Walter Davis
1          Tom Chambers
2        Terry Cummings
3     Clifford Robinson
4         Kevin Garnett
5             Ray Allen
6           Kobe Bryant
7            Tim Duncan
8          Vince Carter
9        Antawn Jamison
10        Dirk Nowitzki
11          Paul Pierce
12            Pau Gasol
13          Joe Johnson
14          Tony Parker
15      Carmelo Anthony
16         LeBron James
17          Dwyane Wade
18         Kevin Durant
19        Patrick Ewing
20          Gary Payton
Name: Player, dtype: object
Enter Player Name:
Tom Chambers


Unnamed: 0,Rank,Player,Points,FGM,FGA,3PM,3PA,FTM,FTA,Games,Seasons,Team,Height (m),Weight (kg)
1,43,Tom Chambers,20049,7378,15749,227,740,5066,6274,1107,16,,208.0,99.0
