## Autocorrelation and Cross-Correlation

Autocorrelation and cross-correlation can produce plots that are sometimes hard to interpret. By working with an understandable data set we can acquire intuition about these important methods for viewing time series.

In [None]:
# Data and plotting
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline



In [None]:
# Read the data into a dictionary with one dataframe per year.
#
# The files are expected to be named yobYYYY.txt (one per birth year) and to
# hold comma-separated rows of first name, gender, and number of births.
# Set the BABY_NAMES_DIR environment variable to point at your own copy of
# the data; otherwise the original hard-coded location is used.
path = os.environ.get(
    'BABY_NAMES_DIR',
    '/Users/donaldbrown/Dropbox/department/Classes/Data/BabyNames/')

# Keep only files that match yobYYYY.txt exactly, so that stray files in the
# folder (notes, backups, ...) cannot break the integer conversion below.
files = [file for file in os.listdir(path)
         if len(file) == 11
         and file.startswith('yob')
         and file[3:7].isdecimal()
         and file.endswith('.txt')]
years = np.array(sorted(int(file[3:7]) for file in files))

# index_col=0 makes the first name the row index, which is what allows the
# look-ups by name in get_value().
baby_names = {
    year: pd.read_csv(os.path.join(path, 'yob{y:d}.txt'.format(y=year)),
                      index_col=0, header=None,
                      names=['First name', 'Gender', 'Number'])
    for year in years
}
baby_names[2012].head()

In [None]:
# Most popular names in a year (here: girls born in 1995).
# The names are ordered by frequency in the files, so the first rows
# are the most common ones.

baby_names[1995].loc[baby_names[1995]['Gender'] == 'F', 'Number'].head()

In [None]:
# Functions to get baby names as functions of gender and birth year

def get_value(name, gender, year):
    """Look up how many babies received a name in one year.

    name   -- first name, matched against the row index of the year's table
    gender -- value compared with the table's 'Gender' column (e.g. 'F')
    year   -- key into the module-level ``baby_names`` dictionary

    Returns the entry of the 'Number' column for that name and gender.
    Any KeyError raised during the look-up (unknown year, or a name that
    does not occur for that gender) results in 0 instead.
    """
    try:
        year_table = baby_names[year]
        gender_matches = year_table['Gender'] == gender
        return year_table.loc[gender_matches, 'Number'][name]
    except KeyError:
        return 0

In [None]:
# Number of children with the selected name in the selected year

Year = 2012
Name = 'Catherine'

# Use Year for the look-up as well, so the count always belongs to the
# year reported in the message below.
name_number = get_value(Name, 'F', Year)

print('{} babies named {} were born in {}'.format(name_number, Name, Year))



In [None]:
# Function that gets the baby names over the years

def get_series(name, gender):
    """Collect the yearly counts of a baby name into one array.

    Calls get_value() once for every entry of the module-level ``years``
    array, in that order, and returns the results as a numpy array.
    """
    counts = []
    for year in years:
        counts.append(get_value(name, gender, year))
    return np.array(counts)

In [None]:
# Autocorrelation function

def autocorr(x):
    """Return the correlation of a time series with itself
    at different lags.

    The series is correlated with itself as given (no mean is subtracted)
    and only the lags 0, 1, ..., len(x) - 1 are kept. The result is scaled
    so that its largest value is 1.

    Parameters
    ----------
    x : one-dimensional array-like of numbers

    Returns
    -------
    numpy.ndarray
        One float per lag, starting at lag 0. A series that is zero
        everywhere yields all zeros instead of NaN.
    """
    correlation = np.correlate(x, x, mode='full')
    # The 'full' output has 2 * len(x) - 1 entries with lag 0 in the middle;
    # keep the non-negative lags only.
    positive_lags = correlation[correlation.size // 2:]
    peak = positive_lags.max()
    if peak == 0:
        # No signal at all (e.g. a name that never occurs in the data):
        # dividing would only produce NaNs and a runtime warning.
        return np.zeros(positive_lags.shape, dtype=float)
    return positive_lags / peak

In [None]:
# Plot of  the time series and autocorrelation
# of baby names

def autocorr_names(names):
    """Plot the yearly counts and the autocorrelation of several baby names.

    Parameters
    ----------
    names : dict
        Maps each first name to the gender value passed on to get_series()
        (for example ``{'Catherine': 'F'}``).

    Draws one figure with two side-by-side panels: the number of babies per
    year on the left and the output of autocorr() against the lag on the
    right, with one line per name in each panel. Returns nothing.
    """
    fig, (ts_plot, ac_plot) = plt.subplots(1, 2, figsize=(14, 10))

    for name, gender in names.items():
        counts = get_series(name, gender)
        ts_plot.plot(years, counts, '-o', label=name)
        ac_plot.plot(autocorr(counts), '-', label=name)

    # Titles and axis labels do not depend on the name, so set them once
    # instead of on every pass through the loop.
    ts_plot.set_title("Baby Names")
    ts_plot.set_xlabel("Years")
    ts_plot.set_ylabel("Number")
    ac_plot.set_title('Autocorrelation')
    ac_plot.set_xlabel("Lags")
    ac_plot.set_ylabel("Correlation")

    # A legend is only meaningful when at least one line was drawn.
    if names:
        ts_plot.legend(loc='best')
        ac_plot.legend(loc='best')

## In-Class Exercise 1

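Use `autocorr_names` for several boy names and girl names, then explain the resulting plots. The function takes a dictionary that maps each name to its gender code, for example `autocorr_names({'Catherine': 'F'})`.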

In [13]:
# Cross-correlation function

def crosscorr(x,y):
    """Return the cross correlation of 2 time series
    at different lags.

    Only the lags 0, 1, ..., len(x) - 1 are returned, where lag k pairs
    x[n + k] with y[n]. The values are divided by the largest of them.

    Parameters
    ----------
    x, y : one-dimensional array-likes of numbers; they may differ in length

    Returns
    -------
    numpy.ndarray
        One float per non-negative lag, starting at lag 0.
    """
    xcorrelation = np.correlate(x, y, mode='full')
    # In the 'full' output the zero-lag term sits at index len(y) - 1. That
    # is the middle of the array only when both inputs have equal length.
    zero_lag = np.size(y) - 1
    positive_lags = xcorrelation[zero_lag:]
    return positive_lags / positive_lags.max()

In [14]:
# Plot of  the time series and crosscorrelation
# of baby names

from sklearn.preprocessing import StandardScaler

# Shared scaler instance; crosscorr_names() calls fit_transform on it for
# each of the two series before computing their cross-correlation.
scaler = StandardScaler()

def crosscorr_names(names):
    """Plot two baby-name series and their cross-correlation.

    Parameters
    ----------
    names : dict
        Maps first names to the gender value passed on to get_series().
        Only the first two entries are used; fewer than two entries raise
        IndexError.

    Draws one figure with two stacked panels: the yearly counts of both
    names on top and, below, the cross-correlation of the two series after
    each has been passed through the module-level ``scaler``. Returns
    nothing.
    """
    # Pick the two names before creating the figure, so that a too-short
    # dictionary does not leave an empty figure behind.
    keys = list(names)
    name1 = keys[0]
    name2 = keys[1]
    name1_series = get_series(name1, names[name1])
    name2_series = get_series(name2, names[name2])

    fig, (ts_plot, xc_plot) = plt.subplots(2, 1, figsize=(14, 10))

    # Time series
    ts_plot.plot(years, name1_series, '-o', label=name1)
    ts_plot.plot(years, name2_series, '-o', label=name2)
    ts_plot.set_title("Baby Names")
    ts_plot.legend(loc='best')
    ts_plot.set_xlabel("Years")
    ts_plot.set_ylabel("Number")

    # Cross-correlation
    # fit_transform returns a 2-D array with one column; take that column.
    x = scaler.fit_transform(pd.DataFrame(name1_series))[:, 0]
    y = scaler.fit_transform(pd.DataFrame(name2_series))[:, 0]
    # xcorr requires maxlags to be smaller than the series length, so cap
    # the 50-lag window when fewer years are available.
    max_lag = min(50, len(x) - 1)
    # The label gives legend() something to show; without a labelled artist
    # the call only produces a warning.
    xc_plot.xcorr(x, y, usevlines=True, maxlags=max_lag, normed=True, lw=2,
                  label='{} vs. {}'.format(name1, name2))
    xc_plot.legend(loc='best')
    xc_plot.set_xlabel("Lags")
    xc_plot.set_title('Cross - correlation')
    xc_plot.set_ylabel("Correlation")


## In-Class Exercise 2

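Use `crosscorr_names` for several boy and girl names, then explain the resulting plots. The function takes a dictionary that maps each name to its gender code and compares the first two names in it.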