In [None]:
import datetime
import os
import warnings

import pandas as pd
import psycopg2
import pyodbc

In [None]:
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

In [None]:
# Credentials
# Read from environment variables of the same name so secrets never have to be
# typed into (and saved with) the notebook. Each value falls back to '' when the
# variable is not set, which matches the previous placeholder behaviour.
ONEVIEW_PASSWORD = os.environ.get('ONEVIEW_PASSWORD', '')
EDW_USERNAME = os.environ.get('EDW_USERNAME', '')
EDW_PASSWORD = os.environ.get('EDW_PASSWORD', '')
AIMS_USERNAME = os.environ.get('AIMS_USERNAME', '')
AIMS_PASSWORD = os.environ.get('AIMS_PASSWORD', '')

In [None]:
# Open the OneView (PostgreSQL) connection used for the physician universe.
conn = psycopg2.connect(
    user='oneviewadmin',
    password=ONEVIEW_PASSWORD,
    host='oneview-prd-content-aurora-cluster.cluster-cxgp9osuwqi3.us-east-1.rds.amazonaws.com',
    database='oneview_content',
)

In [None]:
# Open the EDW connection through the 'prddw' ODBC data source.
w = f"DSN=prddw; UID={EDW_USERNAME}; PWD={EDW_PASSWORD}"
AMAEDW = pyodbc.connect(w)

In [None]:
# Define the universe: every physician row in OneView.
# `type` is selected alongside the ME number because the later merge indexes
# ov_me[['medical_education_number', 'type']]; without it that lookup raises KeyError.
# NOTE(review): assumes ONEVIEW.physician exposes a `type` column — confirm against the schema.
ov_me_sql = '''
    SELECT medical_education_number, type FROM ONEVIEW.physician
    '''
ov_me = pd.read_sql_query(ov_me_sql, conn)

In [None]:
# Load the PARTY_ID <-> ME mapping from EDW (active keys of type 18 only).
# NOTE(review): KEY_TYPE_ID 18 is presumably the ME-number key type — confirm.
ME_QUERY = """
    SELECT
    P.PARTY_ID,
    P.KEY_VAL AS ME
    FROM
    AMAEDW.PARTY_KEY P
    WHERE
    P.KEY_TYPE_ID = 18
    AND
    P.ACTIVE_IND = 'Y'
    """
party_ids = pd.read_sql(ME_QUERY, AMAEDW)

In [None]:
# Load every school key (KEY_TYPE_ID 23) from EDW, with or without an address.
MORE_SCHOOL_QUERY = """
    SELECT
    PARTY_ID AS PARTY_ID_SCHOOL,
    KEY_VAL AS SCHOOL_ID
    FROM
    AMAEDW.PARTY_KEY
    WHERE
    KEY_TYPE_ID = 23
    """
more_school_ids = pd.read_sql(MORE_SCHOOL_QUERY, AMAEDW)

In [None]:
# Load school keys together with the state/country of each school address.
# The join drops schools that have no PARTY_ADDR / POST_CD row.
SCHOOL_QUERY = """
    SELECT
    P.PARTY_ID AS PARTY_ID_SCHOOL,
    P.KEY_VAL AS SCHOOL_ID,
    C.STATE_ID,
    C.COUNTRY_ID
    FROM
    AMAEDW.PARTY_KEY P, AMAEDW.POST_CD C, AMAEDW.PARTY_ADDR A
    WHERE
    P.KEY_TYPE_ID = 23
    AND
    P.PARTY_ID = A.PARTY_ID
    AND 
    A.POST_CD_ID = C.POST_CD_ID
    """
school_ids = pd.read_sql(SCHOOL_QUERY, AMAEDW)

In [None]:
# Load medical-school attendance rows whose THRU_DT is null.
# STU_PARTY_ID is renamed to PARTY_ID so it lines up with the other tables.
MED_SCHOOL_QUERY = """
    SELECT
    SCH_PARTY_ID,
    STU_PARTY_ID AS PARTY_ID,
    GRAD_DT,
    STS_TYPE_ID,
    DEGREE_CD,
    GRAD_CONF_IND
    FROM
    AMAEDW.SCHOOL_ATT
    WHERE
    THRU_DT is null
    """
med_school = pd.read_sql(MED_SCHOOL_QUERY, AMAEDW)

In [None]:
# Load GME rows whose THRU_DT is null, with the state/country of the hospital address.
# PERSON_PARTY_ID is renamed to PARTY_ID so it lines up with the other tables.
GME_QUERY = """
    SELECT
    G.PARTY_HOSPITAL_ID,
    G.PERSON_PARTY_ID AS PARTY_ID,
    G.BEGIN_DT,
    G.END_DT,
    G.PRIM_SPEC_ID,
    G.SEC_SPEC_ID,
    G.TRAIN_TYPE,
    G.CONF_STS_ID,
    G.GME_STS_TYPE_CD,
    C.STATE_ID,
    C.COUNTRY_ID
    FROM
    AMAEDW.EDW_GME G, AMAEDW.POST_CD C, AMAEDW.PARTY_ADDR A
    WHERE
    G.THRU_DT is null
    AND
    G.PARTY_HOSPITAL_ID = A.PARTY_ID
    AND 
    A.POST_CD_ID = C.POST_CD_ID
    """
gme = pd.read_sql(GME_QUERY, AMAEDW)

In [None]:
# Load program year / program graduation year per person from MED_PROF
# (rows whose THRU_DT is null only).
YEAR_QUERY = """
    SELECT
    PARTY_ID, 
    PROG_YEAR,
    PROG_GRAD_YR
    FROM
    AMAEDW.MED_PROF
    WHERE
    THRU_DT is null
    """
gme_year = pd.read_sql(YEAR_QUERY, AMAEDW)

In [None]:
# Load every organisation-name row, including historical ones (no THRU_DT filter).
# PARTY_ID is exposed as PARTY_HOSPITAL_ID to match the GME table's key name.
ORG_QUERY = """
    SELECT
    PARTY_ID AS PARTY_HOSPITAL_ID,
    ORG_NM, 
    THRU_DT
    FROM
    AMAEDW.ORG_NM
    """
org_names = pd.read_sql(ORG_QUERY, AMAEDW)

In [None]:
# Keep a single name per organisation: the row that sorts last by THRU_DT.
# sort_values puts missing dates last by default, so a row with no THRU_DT
# wins over any dated row for the same party.
org_names['THRU_DT'] = pd.to_datetime(org_names['THRU_DT'])
org_names = org_names.sort_values(by='THRU_DT')
org_names = org_names.drop_duplicates(subset='PARTY_HOSPITAL_ID', keep='last')

In [None]:
# Drop fully identical school rows, keeping the first occurrence of each.
school_ids = school_ids[~school_ids.duplicated()]

In [None]:
# Reduce GME to one row per person by keeping the first row seen for each PARTY_ID.
# NOTE(review): people with several GME rows are NOT excluded; the row kept
# depends on the order the query returned them in — confirm this is intended.
gme_singular = gme[~gme.duplicated('PARTY_ID')]

In [None]:
# Append schools that have a key but were missing from the address-joined list.
# These extra rows carry no STATE_ID / COUNTRY_ID, so those columns are NaN for them.
extra = more_school_ids.loc[~more_school_ids['SCHOOL_ID'].isin(school_ids['SCHOOL_ID'])]
school_ids = pd.concat([school_ids, extra])

In [None]:
# Attach lookup information to the physician, medical-school and GME tables.

# Physicians present in both EDW and the OneView universe (inner join on ME number).
test = party_ids.merge(
    ov_me[['medical_education_number', 'type']],
    left_on='ME',
    right_on='medical_education_number',
)

# Schools with their organisation name (org_names is keyed by PARTY_HOSPITAL_ID).
school_info = school_ids.merge(
    org_names,
    left_on='PARTY_ID_SCHOOL',
    right_on='PARTY_HOSPITAL_ID',
    how='left',
)
school_info = school_info.drop_duplicates()

# Medical-school attendance rows enriched with school details.
med_with_info = med_school.merge(
    school_info,
    left_on='SCH_PARTY_ID',
    right_on='PARTY_ID_SCHOOL',
    how='left',
)
med_with_info = med_with_info.drop_duplicates()

# One GME row per person enriched with the hospital name.
gme_with_info = gme_singular.merge(org_names, on='PARTY_HOSPITAL_ID', how='left')
gme_with_info = gme_with_info.drop_duplicates()

In [None]:
# Parse GRAD_DT into datetimes in a new GRAD_DATE column, then sort by it.
def _parse_grad_date(value):
    """Return `value` parsed with the '%Y-%m-%d' format, or None if it does not match."""
    try:
        return datetime.datetime.strptime(str(value), '%Y-%m-%d')
    except ValueError:
        # Only the parse failure is swallowed (text such as 'None' or a value
        # with a time component); a bare `except` would also hide
        # KeyboardInterrupt and genuine programming errors.
        return None


date_list = [_parse_grad_date(value) for value in med_with_info['GRAD_DT']]
med_with_info['GRAD_DATE'] = date_list
med_with_info = med_with_info.sort_values('GRAD_DATE')

In [None]:
# Derive the ranking codes used to order each person's medical-school rows:
#   COUNTRY = 1 when COUNTRY_ID == 6705, otherwise 0
#   STATUS  = 2 when STS_TYPE_ID == 9, 1 when STS_TYPE_ID == 54, otherwise 0
# NOTE(review): the meaning of ids 6705 / 9 / 54 is not shown in this notebook —
# confirm against the EDW reference tables.
# Computed with column-wise comparisons instead of a Python loop over rows;
# missing ids compare unequal and therefore get code 0, as before.
countries = (med_with_info['COUNTRY_ID'] == 6705).astype('int64').tolist()
statuses = (
    (med_with_info['STS_TYPE_ID'] == 54).astype('int64')
    + 2 * (med_with_info['STS_TYPE_ID'] == 9).astype('int64')
).tolist()
med_with_info['COUNTRY'] = countries
med_with_info['STATUS'] = statuses
# Ascending sort: the "best" row per person (highest STATUS, then COUNTRY,
# then latest GRAD_DATE) ends up last.
med_with_info = med_with_info.sort_values(['STATUS','COUNTRY','GRAD_DATE'])

In [None]:
# Reduce medical-school data to one row per person.
# People with a single row keep it; people with several rows keep the one that
# comes last in the current ordering (sorted by STATUS, COUNTRY, GRAD_DATE),
# i.e. not strictly the most recent graduation date.
singular_medical_schools = med_with_info[~med_with_info.duplicated('PARTY_ID', keep=False)]
multiple_medical_schools = med_with_info[med_with_info['PARTY_ID'].duplicated(keep=False)]
latest_medical_school = multiple_medical_schools[
    ~multiple_medical_schools.duplicated('PARTY_ID', keep='last')
]
med_with_info_2 = pd.concat([singular_medical_schools, latest_medical_school])

In [None]:
# Build the combined table: start from the physician universe and left-join
# GME years, medical-school info and GME info on PARTY_ID.
# Columns shared by the school and GME tables get the _SCH / _GME suffixes.
ALL = (
    test
    .merge(gme_year, on='PARTY_ID', how='left')
    .merge(med_with_info_2, on='PARTY_ID', how='left')
    .merge(gme_with_info, on='PARTY_ID', how='left', suffixes=['_SCH', '_GME'])
)