In [1]:
from notebook.services.config import ConfigManager
# Update the 'livereveal' section of the notebook config:
# width/height of 1024x768 with scrolling turned on.
cm = ConfigManager()
cm.update('livereveal', dict(width=1024, height=768, scroll=True))

{'width': 1024, 'height': 768, 'scroll': True}

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import altair as alt

%matplotlib inline

# Short, snake_case names that replace the column labels read from the CSV header
gapminder_columns = [
    'country',
    'continent',
    'year',
    'life_exp',
    'pop',
    'gdp_per_cap',
]

df = pd.read_csv('gapminder_data.csv')
df.columns = gapminder_columns
df.head()

Unnamed: 0,country,continent,year,life_exp,pop,gdp_per_cap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [3]:
# BEGIN SOLUTION
# Scatter-plot matrix: every field is plotted against every other field,
# using only the rows from the year 2000 onwards.
raw_fields = ['life_exp', 'pop', 'gdp_per_cap']

alt.Chart(df.query('year >= 2000')).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
).properties(width=150, height=150).repeat(
    row=raw_fields,
    column=raw_fields,
)
# END SOLUTION


In [5]:
# Add base-10 log versions of population and GDP per capita to df
for raw_col in ('pop', 'gdp_per_cap'):
    df['log_' + raw_col] = np.log10(df[raw_col])

# Separate frame that keeps the log columns but not the untransformed ones
df_logged = df.drop(columns=['pop', 'gdp_per_cap'])

In [6]:



# BEGIN SOLUTION
# Same scatter-plot matrix as before, but on the log-transformed columns,
# coloured by continent and with pan/zoom enabled.
log_fields = ['life_exp', 'log_pop', 'log_gdp_per_cap']

alt.Chart(df.query('year >= 2000')).mark_circle().encode(
    x=alt.X(alt.repeat("column"), type='quantitative'),
    y=alt.Y(alt.repeat("row"), type='quantitative'),
    color=alt.Color("continent"),
).properties(width=150, height=150).repeat(
    row=log_fields,
    column=log_fields,
).interactive()
# END SOLUTION


In [7]:
from sklearn import datasets
# Load the "mpg" example dataset through seaborn and preview the first rows
cars_df = sns.load_dataset(name="mpg")
cars_df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [10]:
# Pairwise correlations between the numeric columns only. `origin` and `name`
# hold strings: pandas >= 2.0 raises on them in DataFrame.corr() instead of
# silently dropping them, so select the numeric columns explicitly.
corr = cars_df.select_dtypes(include='number').corr()

# Reshape the square matrix to long form: one row per (index, variable) pair,
# which is the layout the Altair heatmap below expects.
cor_data = corr.reset_index().melt(id_vars='index')
cor_data['value'] = np.round(cor_data['value'], 2)
cor_data.head()

Unnamed: 0,index,variable,value
0,mpg,mpg,1.0
1,cylinders,mpg,-0.78
2,displacement,mpg,-0.8
3,horsepower,mpg,-0.78
4,weight,mpg,-0.83


https://altair-viz.github.io/gallery/layered_heatmap_text.html

In [11]:
# Shared x/y encodings: one grid cell per (index, variable) pair
base = alt.Chart(cor_data).encode(
    alt.X('index:O'),
    alt.Y('variable:O'),
)

# Text layer with the correlation values as labels.
# Labels are white where |value| > 0.5 and black elsewhere, for readability.
label_color = alt.condition(
    abs(alt.datum.value) > 0.5,
    alt.value('white'),
    alt.value('black'),
)
text = base.mark_text().encode(text='value', color=label_color)

# The correlation heatmap itself; the colour domain is listed from 1 down to -1
heat_scale = alt.Scale(scheme='redyellowblue', domain=[1, -1])
cor_plot = base.mark_rect().encode(
    color=alt.Color('value:Q', scale=heat_scale),
)

# '+' overlays the text layer on top of the rect layer
tmp = cor_plot + text
tmp.properties(width=500, height=500)

## PCA on weather data

We'll look at daily temperature data (in tenths of degrees C) from meteorological observations at nearly 3000 weather stations in the US for the year 2012, provided by NOAA (National Oceanic and Atmospheric Administration).



In [12]:
def pca(x, num_pcs = None, center=True, scale=True):
    """Principal component analysis of a data matrix via SVD.

    Parameters
    ----------
    x : array-like of shape (n_rows, n_columns)
        Data matrix. Rows are treated as observations and columns as
        features (standardisation is applied per column).
    num_pcs : int, optional
        Number of leading components to keep in ``"pc"``. ``None`` (default)
        keeps ``min(x.shape)``; larger values keep every available component.
    center : bool
        Passed to ``StandardScaler`` as ``with_mean``.
    scale : bool
        Passed to ``StandardScaler`` as ``with_std``.

    Returns
    -------
    dict
        ``"pc"``      : scores ``U D`` for the first ``num_pcs`` components.
        ``"loading"`` : ``V`` for *all* components (not truncated to ``num_pcs``).
        ``"pve"``     : ``d**2 / sum(d**2)`` for *all* components, i.e. the
                        proportion of variance explained by each one.

    Raises
    ------
    ValueError
        If ``num_pcs`` is negative.
    """
    if num_pcs is None:
        num_pcs = min(np.shape(x))
    elif num_pcs < 0:
        # A negative count would be used as a negative slice bound below and
        # silently drop trailing components instead of failing.
        raise ValueError("num_pcs must be non-negative, got {}".format(num_pcs))

    from sklearn.preprocessing import StandardScaler

    # Per-column standardisation: subtract the mean when `center` is set,
    # divide by the standard deviation when `scale` is set.
    x = StandardScaler(with_mean=center, with_std=scale).fit_transform(x)

    # PCA is just a singular value decomposition (SVD) of the data matrix:
    # X = U D V^T
    u, d, vt = np.linalg.svd(x, full_matrices=False)

    # Scores are U D; scaling the columns of U by d equals U @ diag(d)
    pc = u[:, :num_pcs] * d[:num_pcs]

    # Proportion of variance explained by each component
    pve = d**2 / np.sum(d**2)

    return {"pc": pc, "loading": vt.T, "pve": pve}


In [13]:
# Read the readings (tenths of degrees C), convert them to Fahrenheit
# (F = 1.8 * C + 32 with C = value / 10, hence the 0.18 factor), then
# transpose so that the CSV's columns become the rows.
weather_data = (
    pd.read_csv("weather_data.csv")
    .mul(0.18)
    .add(32)
    .T
)

# Peek at the first two and last two rows
weather_data.iloc[[0, 1, 48, 49]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2801,2802,2803,2804,2805,2806,2807,2808,2809,2810
11,56.84,60.44,81.097143,55.04,50.9,56.3,54.86,56.12,63.32,57.92,...,42.08,37.04,46.04,39.02,91.58,95.9,50.9,63.5,86.0,82.04
16,63.5,61.16,81.097143,50.36,53.96,57.92,48.38,53.06,90.412093,63.5,...,31.1,29.12,39.92,44.06,86.9,89.6,39.92,60.26,88.16,80.96
352,72.5,68.18,70.16,61.88,58.1,82.385,58.1,66.38,72.5,81.406897,...,24.26,11.12,28.22,34.88,92.48,95.0,64.4,66.2,89.06,85.080435
359,63.14,63.68,61.16,62.6,59.0,62.78,51.8,55.4,68.9,66.2,...,27.14,13.1,22.1,26.42,89.06,91.58,23.54,63.68,88.7,82.04


In [16]:
# Number of principal components to keep
npcs = 10

# PCA with the rows of weather_data as observations
pca_dict = pca(x=weather_data, num_pcs=npcs)

pd.DataFrame(data=pca_dict['pc']).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-51.888264,4.757847,6.488253,6.503399,1.498827,-10.033422,11.455039,-10.062687,9.665869,-3.090402
1,-66.253619,13.570636,-7.523106,-4.60504,-10.720325,-0.872461,6.329045,-10.292856,-1.27672,-1.523634
2,-64.847297,-1.256051,14.040171,2.530285,-8.533727,13.753562,10.442809,11.189631,4.998141,-4.166126
3,-67.268661,-3.603625,-0.357652,-7.971315,-13.874287,7.117774,15.36487,7.561481,2.017979,4.78577
4,-54.700853,7.047284,2.032487,13.828841,-2.799714,-1.429142,3.535311,2.885356,0.835209,-3.142361


In [17]:
# Label the score columns PC1 ... PC<npcs>
pc_names = ["PC{}".format(k) for k in range(1, npcs + 1)]
pca_df = pd.DataFrame(pca_dict['pc'], columns=pc_names)

# Sanity check: same number of rows before and after the reduction
print(weather_data.shape)
print(pca_df.shape)
pca_df.head()

(50, 2811)
(50, 10)


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,-51.888264,4.757847,6.488253,6.503399,1.498827,-10.033422,11.455039,-10.062687,9.665869,-3.090402
1,-66.253619,13.570636,-7.523106,-4.60504,-10.720325,-0.872461,6.329045,-10.292856,-1.27672,-1.523634
2,-64.847297,-1.256051,14.040171,2.530285,-8.533727,13.753562,10.442809,11.189631,4.998141,-4.166126
3,-67.268661,-3.603625,-0.357652,-7.971315,-13.874287,7.117774,15.36487,7.561481,2.017979,4.78577
4,-54.700853,7.047284,2.032487,13.828841,-2.799714,-1.429142,3.535311,2.885356,0.835209,-3.142361


In [19]:
# Attach the row labels of weather_data, converted to int, as a `day` column
pca_df['day'] = list(map(int, weather_data.index.values))
pca_df.head()


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,day
0,-51.888264,4.757847,6.488253,6.503399,1.498827,-10.033422,11.455039,-10.062687,9.665869,-3.090402,11
1,-66.253619,13.570636,-7.523106,-4.60504,-10.720325,-0.872461,6.329045,-10.292856,-1.27672,-1.523634,16
2,-64.847297,-1.256051,14.040171,2.530285,-8.533727,13.753562,10.442809,11.189631,4.998141,-4.166126,18
3,-67.268661,-3.603625,-0.357652,-7.971315,-13.874287,7.117774,15.36487,7.561481,2.017979,4.78577,19
4,-54.700853,7.047284,2.032487,13.828841,-2.799714,-1.429142,3.535311,2.885356,0.835209,-3.142361,27


In [20]:

# First principal component score against the day column
alt.Chart(pca_df).mark_point().encode(alt.X("day"), alt.Y("PC1"))

In [21]:
# One row per component: its position (`index`) and its share of the variance
pve_df = pd.Series(pca_dict['pve'], name="Explained Variance").reset_index()

alt.Chart(pve_df).mark_bar(width=10).encode(
    alt.X('index'),
    alt.Y('Explained Variance'),
)


In [22]:
# Second principal component score against the day column
alt.Chart(pca_df).mark_point().encode(alt.X("day"), alt.Y("PC2"))

n = day of the year, p = 3000 weather stations across the US.

Why did I structure my data that way?

n = 3000 weather stations, and p is day of the year.  Why not do this?


In [None]:

# PC1 against PC2, with each point coloured by its day value
day_color = alt.Color("day:Q", scale=alt.Scale(scheme='spectral'))

alt.Chart(pca_df).mark_point().encode(
    alt.X("PC1"),
    alt.Y("PC2"),
    color=day_color,
)

## What are the features and what are the observations?

Why should days be "observations" and stations be "variables"?

What would happen if we transposed the matrix and then ran PCA?  This time we are doing dimension reduction on the stations, with the day of the year as the feature.

In [24]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Re-run PCA on the transposed matrix: the rows of weather_data become the
# features and its columns become the observations. Standardisation happens
# inside pca().
# NOTE: this overwrites the pca_dict / pca_df built from the untransposed data.
pca_dict = pca(weather_data.T, num_pcs=npcs)

# Name the columns from npcs (not a literal 10) so they always match the
# number of components requested above.
pca_df = pd.DataFrame(pca_dict['pc'],
                      columns=["PC" + str(i+1) for i in range(npcs)])


print(weather_data.T.shape)
print(pca_df.shape)
pca_df.head()

(2811, 50)
(2811, 10)


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,-10.078964,2.441923,-0.549752,-1.75836,1.028389,-0.473659,0.22557,-0.483711,0.450994,-0.038312
1,-4.542554,-1.220554,-0.749874,-1.208144,0.393133,-0.454902,-0.403074,-0.096968,-0.079035,-0.009012
2,-7.027061,2.156477,-0.889231,-1.175112,-0.739677,-1.069822,-1.319947,0.018745,0.332442,-0.644729
3,-5.542014,3.077907,-0.464565,-1.719175,0.285435,-0.264357,-0.837573,-0.176203,-0.106717,-1.208483
4,-6.301912,1.290459,0.235218,-0.953825,0.810762,-0.300342,-0.637082,0.048128,-0.830428,-0.314624


In [None]:
Suppose you want to know the weather, every day of the year, at a particular station.  
What piece(s) of information would you want to know?

In [27]:
# Only a cell's last expression is rendered automatically. Show the table
# preview explicitly and keep the chart as the final expression, so the
# scatter plot is displayed instead of being built and discarded.
display(weather_data.head())

alt.Chart(pca_df.reset_index()).mark_point(size=1).encode(
    x="PC1",
    y="PC2",
).interactive()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2801,2802,2803,2804,2805,2806,2807,2808,2809,2810
11,56.84,60.44,81.097143,55.04,50.9,56.3,54.86,56.12,63.32,57.92,...,42.08,37.04,46.04,39.02,91.58,95.9,50.9,63.5,86.0,82.04
16,63.5,61.16,81.097143,50.36,53.96,57.92,48.38,53.06,90.412093,63.5,...,31.1,29.12,39.92,44.06,86.9,89.6,39.92,60.26,88.16,80.96
18,63.14,59.72,59.36,50.0,49.46,59.36,41.0,43.16,90.412093,81.406897,...,27.14,-5.8,26.96,36.14,90.5,94.28,30.92,77.0,88.52,80.96
19,61.52,60.62,61.34,51.98,79.442857,54.86,42.8,44.6,59.54,51.98,...,6.26,-2.74,12.02,14.54,88.34,93.02,33.26,74.12,88.34,80.06
27,71.24,61.7,61.34,54.5,59.9,55.22,52.16,54.86,66.92,81.406897,...,37.04,32.0,37.04,45.14,91.04,93.2,42.8,53.06,88.88,78.98



<br>
<br>
<br>
<br>
<br>















Hunch: the station number probably carries some important geographic information.  Let's set the color to the index.

In [32]:
# Same PC1/PC2 scatter, with each point coloured by its row position (`index`)
index_color = alt.Color("index:Q", scale=alt.Scale(scheme="redyellowblue"))

alt.Chart(pca_df.reset_index()).mark_point(size=1).encode(
    alt.X("PC1"),
    alt.Y("PC2"),
    color=index_color,
).interactive()

In [30]:
# One row per component: its position (`index`) and its share of the variance
pve_df = pd.Series(pca_dict['pve'], name="Explained Variance").reset_index()

alt.Chart(pve_df).mark_line().encode(
    alt.X('index'),
    alt.Y('Explained Variance'),
).interactive()