In [17]:
import pandas as pd
import psycopg2
import numpy as np # linear algebra
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.express as px
import plotly.graph_objects as go
from plotly import subplots
import datetime as dt

In [18]:
# Prompt for the database password at run time (instead of hard-coding it in
# the notebook) and keep it in dbPassword for the connection parameters.
from getpass import getpass
dbPassword = getpass('Enter database password')

In [19]:
# Keyword arguments handed to psycopg2.connect; the password comes from dbPassword.
param_dic = dict(
    database='big-data-bowl',
    user='postgres',
    password=dbPassword,
    host='34.72.136.99',
    port=5432,
)
def connect(params_dic):
    """Open a connection to the PostgreSQL database server.

    Parameters
    ----------
    params_dic : dict
        Keyword arguments passed unchanged to ``psycopg2.connect``
        (database, user, password, host, port, ...).

    Returns
    -------
    The connection object returned by ``psycopg2.connect``.

    Raises
    ------
    SystemExit
        With exit code 1 when the connection attempt fails; the underlying
        error is printed first.
    """
    print('Connecting to the PostgreSQL database...')
    try:
        conn = psycopg2.connect(**params_dic)
    except Exception as error:  # psycopg2.DatabaseError is already an Exception subclass
        print(error)
        # Equivalent to sys.exit(1) but needs no import: the previous
        # sys.exit call failed with NameError because `sys` is not imported
        # in the notebook's import cell.
        raise SystemExit(1)
    print("Connection successful")
    return conn

In [20]:
def postgresql_to_dataframe(conn, select_query, column_names=None):
    """
    Run a SELECT query and return its result set as a pandas DataFrame.

    Parameters
    ----------
    conn
        Open DB-API connection providing ``cursor()`` and ``rollback()``.
    select_query : str
        SQL text to execute.
    column_names : list of str, optional
        Labels for the result columns, in result order. When omitted, the
        names reported by ``cursor.description`` are used.

    Returns
    -------
    pandas.DataFrame
        One row per fetched tuple. If executing the query fails, the error is
        printed and the integer ``1`` is returned instead (sentinel kept for
        backward compatibility).
    """
    cursor = conn.cursor()
    try:
        try:
            cursor.execute(select_query)
        except Exception as error:  # covers psycopg2.DatabaseError as well
            print("Error: %s" % error)
            # A failed statement leaves the transaction aborted; roll back so
            # the connection can be used for further queries.
            conn.rollback()
            return 1

        # The driver hands the result set back as a list of tuples
        rows = cursor.fetchall()
        if column_names is None:
            # First item of each description entry is the column name
            column_names = [column[0] for column in cursor.description]
    finally:
        # Always release the cursor, including when fetchall() raises
        cursor.close()

    return pd.DataFrame(rows, columns=column_names)

In [21]:
# Open the database connection
conn = connect(param_dic)

# Labels applied, in order, to the columns returned by the query
column_names = [
    "nflId",
    "height",
    "weight",
    "birthDate",
    "collegeName",
    "officialPosition",
    "displayName",
]

# Load the whole players table and preview the first rows
players = postgresql_to_dataframe(conn, "select * from players", column_names)
players.head()

Connecting to the PostgreSQL database...
Connection successful


Unnamed: 0,nflId,height,weight,birthDate,collegeName,officialPosition,displayName
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady
1,28963,6-5,240,1982-03-02,"Miami, O.",QB,Ben Roethlisberger
2,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters
3,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers
4,30078,6-2,228,1982-11-24,Harvard,QB,Ryan Fitzpatrick


Create new player variables

In [22]:
# Parse birth dates. The 'NA' placeholder is blanked first so pd.to_datetime
# yields NaT for it. The result is assigned back explicitly: calling
# .replace(..., inplace=True) on players['birthDate'] works on an intermediate
# Series and is not guaranteed to update `players` (pandas copy-on-write).
players['birthDate'] = pd.to_datetime(players['birthDate'].replace('NA', ''))

birth = players['birthDate']
today = dt.date.today()

# Age in completed years as of the run date: a plain year difference
# overstates the age of anyone whose birthday is still to come this year.
# Rows with a missing birth date stay NaN.
birthday_pending = (birth.dt.month > today.month) | (
    (birth.dt.month == today.month) & (birth.dt.day > today.day)
)
players['age'] = today.year - birth.dt.year - birthday_pending.astype(int)
players['birthMonth'] = birth.dt.month
players['birthYear'] = birth.dt.year

# Height is a 'feet-inches' string such as '6-4'; convert it to centimetres
height_parts = players['height'].str.split('-', expand=True).astype(float)
players['heightCm'] = height_parts[0] * 30.48 + height_parts[1] * 2.54

print('dataframe size: {}'.format(players.shape))
players.head()

dataframe size: (1679, 11)


Unnamed: 0,nflId,height,weight,birthDate,collegeName,officialPosition,displayName,age,birthMonth,birthYear,heightCm
0,25511,6-4,225,1977-08-03,Michigan,QB,Tom Brady,45.0,8.0,1977.0,193.04
1,28963,6-5,240,1982-03-02,"Miami, O.",QB,Ben Roethlisberger,40.0,3.0,1982.0,195.58
2,29550,6-4,328,1982-01-22,Arkansas,T,Jason Peters,40.0,1.0,1982.0,193.04
3,29851,6-2,225,1983-12-02,California,QB,Aaron Rodgers,39.0,12.0,1983.0,187.96
4,30078,6-2,228,1982-11-24,Harvard,QB,Ryan Fitzpatrick,40.0,11.0,1982.0,187.96


PART 1 - Simple Analysis

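About 75% of all players are under 32 years old (ages are relative to the date the notebook was run; the bins below sum to only about 86%, presumably because players without a recorded birth date have no age)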

The youngest players are 23 years old, and there are 32 of them

The oldest player is 45 years old

In [23]:
# Share of players (in percent, two decimals) falling in each of five equal-width age bins
(players['age'].value_counts(normalize=True, bins=5) * 100).round(2)

(22.977, 27.4]    44.97
(27.4, 31.8]      30.79
(31.8, 36.2]       9.59
(36.2, 40.6]       0.71
(40.6, 45.0]       0.12
Name: age, dtype: float64

In [24]:
# 2-D density of player height (cm) against weight
fig = px.density_heatmap(
    players,
    x='heightCm',
    y='weight',
    title='Player height and weight distribution',
)
fig.show()

In [25]:
# Height (cm) against weight for every player, with an OLS trendline overlaid
fig = px.scatter(
    players,
    x='heightCm',
    y='weight',
    trendline="ols",
)
fig.show()

In [26]:
# Players per age: after count() each column holds the number of non-null
# entries for that age, and the nflId column is relabelled "count".
players_by_age = (
    players
    .groupby('age')
    .count()
    .reset_index()
    .rename(columns={"nflId": "count"})
)
fig = px.bar(
    players_by_age,
    x='age',
    y='count',
    title="Count of Players by Ages",
)
fig.show()

In [27]:
# Age, height and weight in one 3-D scatter, coloured by official position
fig = px.scatter_3d(
    players,
    x='age',
    y='heightCm',
    z='weight',
    color='officialPosition',
    opacity=0.7,
)
fig.update_layout(
    title='Player stats of different positions',
    height=800,
)
fig.show()