# Project 4 - Pandas for DS

# Processing details for Hypothesis Testing #

## Definitions:##
* A _quarter_ is a specific three month period, Q1 is January through March, Q2 is April through June, Q3 is July through September, Q4 is October through December.
* A _recession_ is defined as starting with two consecutive quarters of GDP decline, and ending with two consecutive quarters of GDP growth.
* A _recession bottom_ is the quarter within a recession which had the lowest GDP.
* A _university town_ is a city which has a high percentage of university students compared to the total population of the city.

## The data files information:##
* United States housing data from the Zillow research data site for [all homes at a city level](http://files.zillowstatic.com/research/public/City/City_Zhvi_AllHomes.csv), 
  ```City_Zhvi_AllHomes.csv```, has median home sale prices at a fine grained level.
* From the Wikipedia page on college towns is a list of [university towns in the United States](https://en.wikipedia.org/wiki/List_of_college_towns#College_towns_in_the_United_States) which has been copy and pasted into the file ```university_towns.txt```.
* From Bureau of Economic Analysis, US Department of Commerce, the [GDP over time](http://www.bea.gov/national/index.htm#gdp) of the United States in current dollars (use the chained value in 2009 dollars), in quarterly intervals, in the file ```gdplev.xls```. For this assignment, only look at GDP data from the first quarter of 2000 onward.

                                                                                   
## **Hypothesis**:##
>University towns have their mean housing prices less affected by recessions.
                                                                                   
###### Run a t-test to compare the ratio of the mean price of houses in university towns the quarter before the recession starts compared to the recession bottom. 

>(```price_ratio = quarter_before_recession/recession_bottom```)
                         

In [1]:
import pandas as pd
import numpy as np

##  Helper functions ................................................................
##
def get_rec_qtr(gdp, what='start', start_qtr_flag=True):
    qtr = None
    which = { 'start':True, 'end':False}
    
    lix = len(gdp.index)
    
    if which[what]: 
        for i in range(2, lix):
            if ( (gdp.BN_dollar_2009[i-2] > gdp.BN_dollar_2009[i-1]) & 
                 (gdp.BN_dollar_2009[i-1] > gdp.BN_dollar_2009[i]) ):
            
                if start_qtr_flag:
                    qtr = gdp.index[i-1]
                else:
                    qtr = gdp.index[i-2]
                break
    else:
        for i in range(2, lix):
            if ( (gdp.BN_dollar_2009[i-2] < gdp.BN_dollar_2009[i-1]) &
                 (gdp.BN_dollar_2009[i-1] < gdp.BN_dollar_2009[i]) ):
                qtr = gdp.index[i]
                break
            else:
                qtr = None
            
    return qtr


def get_GDP():
    """
    File gdplev.xls has the GDP of the United States in quarterly intervals
    in current dollars (use the chained value in 2009 dollars).
    Only look at GDP data from the first quarter of 2000 onward.
    """
    df = pd.read_excel( './data/gdplev.xls')
    df = df[[ df.columns[-4], df.columns[-2] ]]
    df.rename_axis( {df.columns[0]:'Q', df.columns[1]:'BN_dollar_2009'}, axis=1, inplace=True)
    
    # Get yr 2000 onward:
    B2K = df.loc[df.Q.str[:4]=='2000'].index[0]
    df.drop(df.index[0:B2K], axis=0, inplace=True)
    
    df['BN_dollar_2009'] = df['BN_dollar_2009'].astype('float64')
    df['Q'] = pd.PeriodIndex( df['Q'], freq='Q-DEC')
    df = df.set_index('Q')
    # return Period to string
    df.rename_axis(lambda x: str(x).lower(), axis=0, inplace=True)

    return df

#  Answer functions  ..................................................................................
#
def get_list_of_university_towns():
    utowns = pd.read_table( './data/university_towns.txt', sep='\n', header=None, names = ['RegionName'])
    
    # cleanup here
    states_idx = utowns[utowns.RegionName.str.contains('edit')].index
    utowns['State'] = utowns.RegionName.str.extract('([\w\s]+)\[edit\]', expand=False).fillna(method='ffill')
    
    # drop & re-index after row-drop
    utowns.drop(states_idx, axis=0, inplace=True)
    utowns.reset_index(inplace=True)
    
    utowns['RegionName'] = utowns.RegionName.str.extract('([\w\s,]+)\s\(', expand=False)
    utowns['RegionName'] = utowns.RegionName.str.strip()
    
    # reorder columns
    utowns = utowns[['State', 'RegionName']]

    return utowns


def get_recession_start():
    """
    A recession:: start = two consecutive quarters of GDP decline
                  end   = two consecutive quarters of GDP growth.
    Returns the recession start as a string value in this format 2005q3
    start_qtr=False is used in the ttest function to get preceding qtr.
    """
    GDP = get_GDP()
    q1 = get_rec_qtr(GDP, what='start')
    return q1


def get_recession_end():
    """
    A recession:: start = two consecutive quarters of GDP decline
                  end   = two consecutive quarters of GDP growth.
    Returns the year and quarter of the recession end time as a 
    string value in this format 2005q3
    """
    rec_start = get_recession_start()
    GDP = get_GDP()
    GDP = GDP[rec_start:]
    
    q2 = get_rec_qtr(GDP, what='end')
    return q2


def get_recession_bottom():
    """
    A recession bottom is the quarter within a recession which had the lowest GDP.
    Returns the year and quarter of the recession bottom time as a 
    string value in this format 2005q3
    """
    rec_start = get_recession_start()
    rec_end = get_recession_end()

    GDP = get_GDP()
    GDP = GDP[rec_start:rec_end]
    
    min_val = GDP.BN_dollar_2009.min()
    min_idx = GDP[GDP.BN_dollar_2009 == min_val].index

    # below: to use if GDP index is kept as quarterly Period:
    #rec_bottom = '{}q{}'.format(min_idx[0].year, min_idx[0].quarter )
    rec_bottom = min_idx[0]

    return rec_bottom


def convert_housing_data_to_quarters():
    states = {'OH': 'Ohio', 'KY': 'Kentucky', 'AS': 'American Samoa', 'NV': 'Nevada', 
          'WY': 'Wyoming', 'NA': 'National', 'AL': 'Alabama', 'MD': 'Maryland', 
          'AK': 'Alaska', 'UT': 'Utah', 'OR': 'Oregon', 'MT': 'Montana', 'IL': 'Illinois', 
          'TN': 'Tennessee', 'DC': 'District of Columbia', 'VT': 'Vermont', 'ID': 'Idaho',
          'AR': 'Arkansas', 'ME': 'Maine', 'WA': 'Washington', 'HI': 'Hawaii', 
          'WI': 'Wisconsin', 'MI': 'Michigan', 'IN': 'Indiana', 'NJ': 'New Jersey', 
          'AZ': 'Arizona', 'GU': 'Guam', 'MS': 'Mississippi', 'PR': 'Puerto Rico', 
          'NC': 'North Carolina', 'TX': 'Texas', 'SD': 'South Dakota', 
          'MP': 'Northern Mariana Islands', 'IA': 'Iowa', 'MO': 'Missouri', 'CT': 'Connecticut',
          'WV': 'West Virginia', 'SC': 'South Carolina', 'LA': 'Louisiana', 'KS': 'Kansas', 
          'NY': 'New York', 'NE': 'Nebraska', 'OK': 'Oklahoma', 'FL': 'Florida', 'CA': 'California',
          'CO': 'Colorado', 'PA': 'Pennsylvania', 'DE': 'Delaware', 'NM': 'New Mexico', 
          'RI': 'Rhode Island', 'MN': 'Minnesota', 'VI': 'Virgin Islands', 'NH': 'New Hampshire', 
          'MA': 'Massachusetts', 'GA': 'Georgia', 'ND': 'North Dakota', 'VA': 'Virginia'}
    
    """
    City_Zhvi_AllHomes.csv, has median home sale prices at a fine grained level.
    Converts the housing data to quarters and returns it as mean values in a dataframe. 
    This dataframe should be a dataframe with columns for 2000q1 through 2016q3, and 
    should have a multi-index in the shape of ["State","RegionName"].
    The resulting dataframe should have 67 columns, and 10,730 rows.
    """
    housing = pd.read_csv( './data/City_Zhvi_AllHomes.csv', header=0)
    
    housing.drop(housing.columns[3:51], axis=1, inplace=True)
    housing.drop(housing.columns[0], axis=1, inplace=True)
    
    # change the state abbreviation to full name to match university towns df:
    housing["State"].replace(states, inplace = True)
    
    # set MultiIndex
    housing = housing.set_index(['State','RegionName'])
    
    # group into quarters Period
    qtr_cols = pd.PeriodIndex(housing.columns, freq='M').asfreq('Q-DEC', 's')
    
    # grouping per quarterly Period:
    grouped = housing.groupby(qtr_cols, axis=1).agg(np.mean)
    
    # back conversion to string to get the required lowercase format '2000q1':
    grouped = grouped.rename(columns=lambda x: str(x).lower())
    
    return grouped


def run_ttest():
    from scipy.stats import ttest_ind
    import pandas as pd
    """
    Runs a ttest comparing the university town values to the non-university towns values, 
    return whether the alternative hypothesis (that the two groups are the same)
    is true or not as well as the p-value of the confidence. 
    Return the tuple (different, p, better) where 
    different=True if the t-test is True at a p<0.01 (we reject the null hypothesis), or 
    different=False if otherwise (we cannot reject the null hypothesis). 
    The variable p should be equal to the exact p value returned from scipy.stats.ttest_ind(). 
    The value for better should be either "university town" or "non-university town"
    depending on which has a lower mean price ratio (which is equivalent to a reduced market loss).
    """
    
    """create new data showing the decline or growth of housing prices
    between the recession start and the recession bottom."""
    
    utowns = get_list_of_university_towns()
    hous_df1 = convert_housing_data_to_quarters()
    
    hous = pd.merge( hous_df1.reset_index(), utowns, 
                     on=utowns.columns.tolist(), 
                     indicator='_id', how='outer' )

    # get the 2 periods of interest
    GDP = get_GDP()
    q_bef_start = get_rec_qtr(GDP, what='start', start_qtr_flag=False)
    rec_bottom = get_recession_bottom()
    
    hous['Ratio'] = hous[q_bef_start].div(hous[rec_bottom])
    
    # u-towns data
    hous_utowns = hous[hous['_id'] == 'both']
    Ratio_u = hous_utowns.Ratio.dropna()
    
    # others
    hous_not_utowns = hous[hous['_id'] != 'both']
    Ratio_not_u = hous_not_utowns.Ratio.dropna()
    
    """
    The value for better should be either "university town" or "non-university town"
    depending on which has a lower mean price ratio (which is equivalent to a reduced market loss).
    """
    better = 'university town' if (Ratio_u.mean() < Ratio_not_u.mean()) else 'non-university town'
    
    """
    Run a ttest comparing the university town values to the non-university towns values, 
    return whether the alternative hypothesis **(that the two groups are the same)**
    is true or not as well as the p-value of the confidence. 
    #
    Is Docstring right?? 
    If alt hypo == "the two groups are the same", then the null hypo,
            H_0 == "two population w/unequal variances => Welch's test
            ==> equal_var=False needed in the function call.
    #
    ttest_ind <equal_var> Parameter:
        True (def):: perform a standard independent 2 sample test where H_0 = equal variances
        False     :: perform Welch's t-test where H_0 = unequal population variances
        
    However, the answer is not accepted when the above logic is applied.
    #
    Return the tuple (different, p, better) where 
    different=True if the t-test is True at a p<0.01 (we reject the null hypothesis), or 
    different=False if otherwise (we cannot reject the null hypothesis).
    """
    stats, pval = ttest_ind( Ratio_u, Ratio_not_u, equal_var=False )
    different = (pval < 0.01)
  
    return (different, pval, better)

In [None]:
get_list_of_university_towns()

In [None]:
get_recession_start()

In [None]:
get_recession_end()

In [None]:
get_recession_bottom()

In [None]:
convert_housing_data_to_quarters()

In [2]:
run_ttest()

(True, 0.00044454912338074125, 'university town')