# Exercise: Visual - Assignment 4

## Question:
"What is the correlation between precipitation and Zika virus cases over the period 2015-2017 for several US states (Kansas, California, Montana, Michigan, Georgia)?"

In [None]:
# Show the accompanying notes; the context manager closes the file once read.
with open('Details.txt', 'r') as d:
    print(d.read())

In [None]:
%matplotlib notebook

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def read_weather():
    """Load the per-state weather CSVs and summarise precipitation per year.

    Reads 'kansas.csv', 'california.csv', 'montana.csv', 'michigan.csv' and
    'georgia.csv' from the current working directory. Each file must provide
    the columns 'StateCode', 'YearMonth' and 'PCP'.

    Returns:
        DataFrame indexed by ('States', 'Year') with the columns
        'Precipitation_max' and 'Precipitation_min', i.e. the largest and
        smallest 'PCP' value recorded for each state in each year.
    """
    file_list = ['kansas.csv', 'california.csv', 'montana.csv', 'michigan.csv', 'georgia.csv']
    state_names = {4: 'California', 9: 'Georgia', 20: 'Michigan', 14: 'Kansas', 24: 'Montana'}

    frames = []
    for path in file_list:
        frame = pd.read_csv(path, delimiter=',')
        # Replace the numeric state code with the state name and use it as index.
        frame['StateCode'] = frame['StateCode'].map(state_names)
        frames.append(frame.set_index('StateCode').rename_axis('States'))

    weather = pd.concat(frames)
    # Keep only the period and precipitation columns.
    weather = weather[['YearMonth', 'PCP']].rename(
        columns={'PCP': 'Precipitation', 'YearMonth': 'Year'})
    # The year is taken as the first four characters of the period value
    # (assumes 'YearMonth' is formatted like YYYYMM -- TODO confirm).
    weather['Year'] = weather['Year'].astype(str).str[:4]

    # Highest and lowest precipitation value per state and year.
    weather = weather.groupby([weather.index, 'Year'])['Precipitation'].agg(['max', 'min'])
    weather = weather.rename(
        columns={'min': 'Precipitation_min', 'max': 'Precipitation_max'})
    # The original ended in a bare `return`, handing None to the caller.
    return weather

def read_virus():
    """Load the yearly Zika workbooks and total the cases per state and year.

    Reads 'Zika 2015.xlsx', 'Zika 2016.xlsx' and 'Zika 2017.xlsx' from the
    current working directory, using the fourth spreadsheet row as header.
    Each sheet must provide the columns 'States', 'No.' and 'No..1'
    (presumably two separate case counts sharing the header 'No.' -- verify
    against the workbooks). Only Georgia, California, Montana, Michigan and
    Kansas are kept.

    Returns:
        DataFrame indexed by ('States', 'Year') with a single 'Cases' column
        holding the sum of 'No.' and 'No..1'.
    """
    file_list = ['Zika 2015.xlsx', 'Zika 2016.xlsx', 'Zika 2017.xlsx']
    list_states = ['Georgia', 'California', 'Montana', 'Michigan', 'Kansas']

    frames = []
    for path in file_list:
        sheet = pd.read_excel(path, header=3)  # Read virus data
        # Keep the states of interest; copy so the assignments below work on
        # an independent frame rather than on a slice of the sheet.
        wanted = sheet['States'].isin(list_states)
        sheet = sheet.loc[wanted, ['States', 'No.', 'No..1']].copy()
        sheet['Year'] = path[5:9]  # the year is embedded in the file name
        sheet['Cases'] = sheet['No.'] + sheet['No..1']
        frames.append(sheet.drop(['No.', 'No..1'], axis=1).set_index('States'))

    cases = pd.concat(frames)
    # Aggregating turns the groupby object back into a DataFrame.
    cases = cases.groupby([cases.index, 'Year'])[['Cases']].sum()
    # The original ended in a bare `return`, handing None to the caller.
    return cases

def visual():
    """Draw maximum precipitation per year for each state, coloured by case band.

    Operates on the module-level ``df`` and changes it in place: a
    'Cases_category' column is added and the index is reset into ordinary
    columns. The resulting figure is saved as 'fg1.png'.
    """
    # Group the case counts into bands of 100 so the legend stays readable.
    case_bins = [0, 100, 200, 300, 400, 500]
    df['Cases_category'] = pd.cut(df['Cases'], bins=case_bins, include_lowest=True)
    # seaborn looks columns up by name, so turn the index back into columns.
    df.reset_index(inplace=True)

    sns.set_style('white')
    grid = sns.factorplot(
        'Year',
        y='Precipitation_max',
        hue='Cases_category',
        col='States',
        col_wrap=3,
        data=df,
        kind='strip',
        size=3,
        aspect=1,
        palette='Reds',
        legend=False,
        sharex=False,
        sharey=False,
    )
    grid.set_axis_labels('Year', 'Maximum Precipitation')
    grid.set(ylim=(0, 10))
    grid.set_titles('{col_name}')

    # One shared legend, anchored outside the axes it is attached to.
    plt.legend(
        title='Zika Cases \n(Categorized by total number)',
        bbox_to_anchor=(1.1, 1),
        loc=2,
    )
    plt.savefig('fg1.png')

# Load the weather table and the Zika table.
data_w = read_weather()
data_v = read_virus()

# Join the two tables on their index (pd.merge defaults to an inner join).
# The former `df = pd.DataFrame()` placeholder was overwritten immediately
# and has been dropped.
df = pd.merge(data_w, data_v, left_index=True, right_index=True)

# visual() reads the module-level `df` built above.
visual()