In [140]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

sns.set()

print('All dependencies installed')

All dependencies installed


In [163]:
game_data = pd.read_csv('../input/game_data.csv', parse_dates=True)
print(game_data.shape)
game_data.head()

(666, 18)


Unnamed: 0,GameKey,Season_Year,Season_Type,Week,Game_Date,Game_Day,Game_Site,Start_Time,Home_Team,HomeTeamCode,Visit_Team,VisitTeamCode,Stadium,StadiumType,Turf,GameWeather,Temperature,OutdoorWeather
0,1,2016,Pre,1,2016-08-07 00:00:00.000,Sunday,Indianapolis,20:00,Indianapolis Colts,IND,Green Bay Packers,GB,Tom Benson Hall of Fame Stadium,Outdoor,Turf,,,
1,2,2016,Pre,2,2016-08-13 00:00:00.000,Saturday,Los Angeles,17:00,Los Angeles Rams,LA,Dallas Cowboys,DAL,Los Angeles Memorial Coliseum,Outdoor,Grass,Sunny,79.0,Sunny
2,3,2016,Pre,2,2016-08-11 00:00:00.000,Thursday,Baltimore,19:30,Baltimore Ravens,BLT,Carolina Panthers,CAR,M&T Bank Stadium,Outdoor,Natural Grass,Party Cloudy,94.0,Partly Cloudy
3,4,2016,Pre,2,2016-08-12 00:00:00.000,Friday,Green Bay,19:00,Green Bay Packers,GB,Cleveland Browns,CLV,Lambeau Field,Outdoor,DD GrassMaster,,73.0,
4,5,2016,Pre,2,2016-08-11 00:00:00.000,Thursday,Chicago,19:00,Chicago Bears,CHI,Denver Broncos,DEN,Soldier Field,Outdoor,Grass,"Partly Cloudy, Chance of Rain 80%",88.0,


# Checking column types:

In [164]:
game_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 18 columns):
GameKey           666 non-null int64
Season_Year       666 non-null int64
Season_Type       666 non-null object
Week              666 non-null int64
Game_Date         666 non-null object
Game_Day          666 non-null object
Game_Site         666 non-null object
Start_Time        666 non-null object
Home_Team         666 non-null object
HomeTeamCode      666 non-null object
Visit_Team        666 non-null object
VisitTeamCode     666 non-null object
Stadium           666 non-null object
StadiumType       628 non-null object
Turf              665 non-null object
GameWeather       568 non-null object
Temperature       600 non-null float64
OutdoorWeather    412 non-null object
dtypes: float64(1), int64(3), object(14)
memory usage: 93.7+ KB


## Changing column types for less memory usage:

In [180]:
category_columns = ['Season_Type', 'StadiumType', 'Turf']
float_columns = ['Temperature']

game_data[category_columns] = game_data[category_columns].astype('category')
game_data[float_columns] = game_data[float_columns].astype(float)
date = pd.to_datetime(game_data['Game_Date'].str.split(expand=True)[0], format='%Y-%m-%d')

In [181]:
game_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666 entries, 0 to 665
Data columns (total 18 columns):
GameKey           666 non-null int64
Season_Year       666 non-null int64
Season_Type       666 non-null category
Week              666 non-null int64
Game_Date         666 non-null object
Game_Day          666 non-null object
Game_Site         666 non-null object
Start_Time        666 non-null object
Home_Team         666 non-null object
HomeTeamCode      666 non-null object
Visit_Team        666 non-null object
VisitTeamCode     666 non-null object
Stadium           666 non-null object
StadiumType       628 non-null category
Turf              665 non-null category
GameWeather       568 non-null object
Temperature       600 non-null float64
OutdoorWeather    412 non-null object
dtypes: category(3), float64(1), int64(3), object(11)
memory usage: 82.5+ KB


# Checking for Null values:

In [80]:
np.sum(game_data.isnull())

GameKey             0
Season_Year         0
Season_Type         0
Week                0
Game_Date           0
Game_Day            0
Game_Site           0
Start_Time          0
Home_Team           0
HomeTeamCode        0
Visit_Team          0
VisitTeamCode       0
Stadium             0
StadiumType        38
Turf                1
GameWeather        98
Temperature        66
OutdoorWeather    254
dtype: int64

## Checking whether or not we need the columns with missing values:

In [81]:
stadium_type = game_data['StadiumType'].value_counts()
turf = game_data['Turf'].value_counts()
game_weather = game_data['GameWeather'].value_counts()
temperature = game_data['Temperature'].value_counts()
outdoor_weather = game_data['OutdoorWeather'].value_counts

print(stadium_type, '\n', '-'*50, '\n', turf, '\n', '-'*50, '\n', game_weather, '\n', '-'*50, '\n', temperature,  '\n', '-'*50, '\n', outdoor_weather)

Outdoor                         348
Outdoors                         81
Indoors                          46
Indoor                           42
Retractable Roof                 21
Dome                             21
Open                             20
Retr. Roof - Closed               9
Retr. Roof-Closed                 4
outdoor                           4
Retr. Roof - Open                 3
Domed, closed                     2
Dome, closed                      2
Indoor, non-retractable roof      2
Retr. Roof-Open                   2
Outdoors                          2
Indoors (Domed)                   2
Closed Dome                       2
Retr. roof - closed               1
Indoor, Roof Closed               1
Retr. Roof Closed                 1
Oudoor                            1
Outdor                            1
Ourdoor                           1
Indoor, fixed roof                1
Indoor, Non-Retractable Dome      1
Indoor, Open Roof                 1
Indoor, Fixed Roof          

stadium_type = most of the values are repeated or labelled differently but mean the same. Most staudiums also are outdoors with a few with retractable roofs. The few that are missing can be searched easily. <br>
turf = same with stadium_type, most values are the same but labelled differently and can be searched easily. <br>
game_weather = around 15% of the values are missing in this column. This can be searched but will be more time consuming. <br>
temperature = around 10% of the values are missing in this column, which seems a little weird since game_weather has more missing values. <br>
outdoor_weather = around 38% of the data is mising and may potentially not have a very big impact in the analysis of data.