# INFO 3402 – Class 27 – Database setup

[Brian C. Keegan, Ph.D.](http://brianckeegan.com/)  
[Assistant Professor, Department of Information Science](https://www.colorado.edu/cmci/people/information-science/brian-c-keegan)  
University of Colorado Boulder  

Copyright and distributed under an [MIT License](https://opensource.org/licenses/MIT)  

In [8]:
import os

import numpy as np
import pandas as pd

Create database checklist

* Create free-tier RDS database: https://us-west-2.console.aws.amazon.com/rds/home?region=us-west-2  
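* Change the AWS security group's inbound rules to allow all traffic (all protocols, all ports) from anywhere. This exposes the database to the whole internet, so only do it for a disposable class database.
* Install MySQL Connector for Python: `conda install mysql-connector-python`
* Change the "student" user's privileges to include `SELECT` and `SHOW VIEW`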

In [37]:
import os
from getpass import getpass
from urllib.parse import quote_plus

from sqlalchemy import create_engine

host = 'info3402-f19-baseball.cjuyvrfem14z.us-west-2.rds.amazonaws.com'
user = 'admin'
port = 3306
# Keep the secret out of the notebook: read it from the INFO3402_DB_PASSWORD
# environment variable and fall back to an interactive prompt when it is unset.
password = os.environ.get('INFO3402_DB_PASSWORD') or getpass('Password for {0}@{1}: '.format(user, host))
database = 'innodb'

# dialect[+driver]://user:password@host/dbname[?key=value..]
# The password is percent-encoded so characters such as '@', ':' or '/' cannot
# break the URL; purely alphanumeric passwords pass through unchanged.
# NOTE(review): connect_timeout is presumably interpreted in seconds by the
# MySQL connector, which would make 10000 roughly 2.8 hours - confirm intent.
engine = create_engine(
    'mysql+mysqlconnector://{0}:{1}@{2}:{3}/{4}'.format(user, quote_plus(password), host, port, database),
    pool_size=10,
    max_overflow=20,
    connect_args={'connect_timeout': 10000})

# Connection reused by the upload cells further down the notebook.
conn = engine.connect()

## Prepare gamelogs

In [9]:
# Collect the game-log text files. os.listdir returns entries in arbitrary
# order, so the list is sorted once here and every later cell sees the files
# in the same name order on every machine. endswith() keeps only real .TXT
# files, not names that merely contain '.TXT' (e.g. 'GL1871.TXT.bak').
gamelog_files = sorted(f for f in os.listdir('./Gamelogs/') if f.endswith('.TXT'))

# Preview the first few file names.
gamelog_files[:5]

['GL1871.TXT', 'GL1872.TXT', 'GL1873.TXT', 'GL1874.TXT', 'GL1875.TXT']

In [15]:
# The column-name CSV lives outside the notebook directory. The original
# location stays the default, but it can be overridden through the
# INFO3402_GAMELOG_COLUMNS_CSV environment variable so the notebook also runs
# on machines that do not have this drive/folder layout.
gamelog_columns_path = os.environ.get(
    'INFO3402_GAMELOG_COLUMNS_CSV',
    'E:/Dropbox/Courses/2017 Spring - INFO 2201/Code/GL2016_columns.csv')

_df = pd.read_csv(gamelog_columns_path)

# Use the header names as the game-log column names, skipping the first one
# (presumably a saved index column - TODO confirm against the CSV).
gamelog_columns = _df.columns[1:].tolist()

In [16]:
# Maps a season (year, int) to the DataFrame holding that season's game log.
master_gamelog_concat_dict = dict()

# The season is read from the file name ('GL1871.TXT' -> 1871) rather than
# derived from the file's position in the list: the list order comes from
# os.listdir and is not guaranteed, and a missing year would shift every later
# season by one. Iterating in sorted order keeps the seasons chronological.
for filename in sorted(gamelog_files):
    season_text = filename[2:6]
    if not season_text.isdigit():
        # Fail loudly instead of silently attaching a wrong season to the games.
        raise ValueError("Cannot read a season from gamelog file name {0!r}".format(filename))

    # The raw files carry no header row; column names are supplied separately.
    _df = pd.read_csv('./Gamelogs/'+filename,header=None)
    _df.columns = gamelog_columns
    master_gamelog_concat_dict[int(season_text)] = _df

In [17]:
# Stack the per-season frames into one table, labelling each frame's rows with
# its dictionary key as an outer index level named 'season'.
_seasons = list(master_gamelog_concat_dict.keys())
_season_frames = [master_gamelog_concat_dict[_season] for _season in _seasons]
_stacked_gamelogs = pd.concat(_season_frames, keys=_seasons, names=['season', 'index'])

# Turn the 'season' level into an ordinary column, then discard the remaining
# per-file row index in favour of a fresh 0..n-1 index.
complete_gamelog_df = _stacked_gamelogs.reset_index(level=0).reset_index(drop=True)

print("There are {0:,} games in the gamelog".format(complete_gamelog_df.shape[0]))
complete_gamelog_df.head()

There are 218,168 games in the gamelog


Unnamed: 0,season,date,game_number,day,visitor,visitor_lg,visitor_game_number,home,home_lg,home_game_number,...,home_batter_7_name,home_batter_7_pos,home_batter_8_id,home_batter_8_name,home_batter_8_pos,home_batter_9_id,home_batter_9_name,home_batter_9_pos,additional_info,acquisition
0,1871,18710504,0,Thu,CL1,na,1,FW1,na,1,...,Ed Mincher,7.0,mcdej101,James McDermott,8.0,kellb105,Bill Kelly,9.0,,Y
1,1871,18710505,0,Fri,BS1,na,1,WS3,na,1,...,Asa Brainard,1.0,burrh101,Henry Burroughs,9.0,berth101,Henry Berthrong,8.0,HTBF,Y
2,1871,18710506,0,Sat,CL1,na,2,RC1,na,1,...,Pony Sager,6.0,birdg101,George Bird,7.0,stirg101,Gat Stires,9.0,,Y
3,1871,18710508,0,Mon,CL1,na,3,CH1,na,1,...,Ed Duffy,6.0,pinke101,Ed Pinkham,5.0,zettg101,George Zettlein,1.0,,Y
4,1871,18710509,0,Tue,BS1,na,2,TRO,na,1,...,Steve Bellan,5.0,pikel101,Lip Pike,3.0,cravb101,Bill Craver,6.0,HTBF,Y


In [38]:
# Upload the combined game log to the 'Gamelogs' table over the open
# connection: an existing table is replaced, the DataFrame index is not
# written, and rows are sent in batches of 1000.
complete_gamelog_df.to_sql(
    name='Gamelogs',
    con=conn,
    if_exists='replace',
    index=False,
    chunksize=1000,
)

## Prepare Lahman database

In [64]:
# Names of the tables that already exist in the connected database.
# SHOW TABLES returns a single column whose name embeds the database name
# ('Tables_in_<database>'), so it is read by position; this keeps working if
# the `database` setting is ever changed from 'innodb'.
tables = pd.read_sql_query('show tables',conn).iloc[:, 0].tolist()

In [56]:
# CSV files to upload, one table per file, in a deterministic (sorted) order.
# endswith() avoids picking up names that merely contain '.csv'.
lahman_files = sorted(f for f in os.listdir('./Lahman/') if f.endswith('.csv'))

for f in lahman_files:
    # The table is named after the file without its '.csv' extension.
    table_name = os.path.splitext(f)[0]

    # Skip tables that are already present in the database.
    if table_name in tables:
        continue

    _df = pd.read_csv('./Lahman/'+f)
    try:
        _df.to_sql(name=table_name,con=conn,if_exists='replace',index=False,chunksize=1000)
    except Exception as error:
        # Best effort: report the failing table and why it failed, then carry
        # on with the remaining files. Catching Exception (not a bare except)
        # lets KeyboardInterrupt still stop a long-running upload.
        print('{0}: {1}: {2}'.format(table_name, type(error).__name__, str(error)[:300]))

## Appendix: Google BigQuery

In [3]:
from google.oauth2 import service_account

# Build service-account credentials from the JSON key file that sits next to
# the notebook.
credentials = service_account.Credentials.from_service_account_file('bigquery_credentials.json')

# Derive a second credentials object that carries the cloud-platform OAuth scope.
scoped_credentials = credentials.with_scopes(['https://www.googleapis.com/auth/cloud-platform'])

In [14]:
from google.cloud import bigquery

# Authenticate with the scoped credentials created in the previous cell; they
# were built there but the client was previously handed the unscoped object.
client = bigquery.Client(credentials=scoped_credentials,project='dulcet-cat-257216')

# Sample query against a public dataset: 100 rows of Texas baby-name counts.
QUERY = (
    'SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013` '
    'WHERE state = "TX" '
    'LIMIT 100')
query_job = client.query(QUERY)  # API request
df = query_job.to_dataframe()  # Waits for query to finish

Unnamed: 0,state,gender,year,name,number
0,TX,F,1910,Frances,197
1,TX,F,1910,Alice,149
2,TX,F,1910,Beatrice,123
3,TX,F,1910,Ella,102
4,TX,F,1910,Gertrude,97
...,...,...,...,...,...
95,TX,F,1911,Rosa,76
96,TX,F,1911,Lucy,61
97,TX,F,1911,Nora,61
98,TX,F,1911,Nettie,45
