In [1]:
import plotly 
import matplotlib_venn
# import packages
import re, sklearn, warnings, math, pandas as pd, numpy as np
import seaborn as sns, matplotlib.pyplot as plt, matplotlib
import plotly.graph_objs as go, plotly.offline as py, plotly.tools as tls
from matplotlib_venn import venn2
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# import settings
%matplotlib inline
matplotlib.rcParams.update({'font.size': 12})
py.init_notebook_mode(connected=True)
warnings.filterwarnings('ignore')

In [3]:
# Load the cleaned dataset; the file is Latin-1 (ISO-8859-1) encoded.
df = pd.read_csv(
    "Life_data_full_cleaned_dataset.csv",
    encoding='ISO-8859-1',
)

# Preview the first five rows.
df.head(5)

Unnamed: 0.1,Unnamed: 0,state,state_ab,city,place,type,lat,lng,ALand,AWater,...,debt_cdf,male_age_mean,male_age_median,female_age_mean,female_age_median,pct_own,married,separated,divorced,single
0,0,Alaska,AK,Unalaska,Unalaska City,City,53.621091,-166.770979,2823180000.0,3101986247,...,0.30304,38.45838,39.25,32.78177,31.91667,0.25053,0.77522,0.03443,0.09802,0.09233
1,1,Alaska,AK,Eagle River,Anchorage,City,61.17425,-149.284329,509234900.0,1859309,...,0.23925,37.26216,39.33333,38.97956,39.66667,0.94989,0.54158,0.00782,0.13575,0.31485
2,2,Alaska,AK,Jber,Anchorage,City,61.284745,-149.653973,270593000.0,66534601,...,1.0,21.96291,22.25,22.20427,23.16667,0.00759,0.57135,0.01,0.01838,0.40027
3,3,Alaska,AK,Anchorage,Point Mackenzie,City,61.22956,-149.893037,2371512.0,9966854,...,0.18899,35.81912,34.0,37.0075,34.0,0.20247,0.50361,0.0,0.21563,0.28076
4,4,Alaska,AK,Anchorage,Anchorage,City,61.217082,-149.767214,1979230.0,0,...,0.43422,34.1311,30.16667,34.96611,31.75,0.56936,0.59349,0.06731,0.08711,0.25209


In [4]:
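# Disabled: these identifier/geography columns are intentionally kept,
# since the analysis below selects the columns it needs explicitly.
# del df['Unnamed: 0']
# del df['state']
# del df['state_ab']
# del df['city']
# del df['place']
# del df['type']
# del df['lat']
# del df['lng']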

In [5]:
# Re-display the first five rows of the frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,state,state_ab,city,place,type,lat,lng,ALand,AWater,...,debt_cdf,male_age_mean,male_age_median,female_age_mean,female_age_median,pct_own,married,separated,divorced,single
0,0,Alaska,AK,Unalaska,Unalaska City,City,53.621091,-166.770979,2823180000.0,3101986247,...,0.30304,38.45838,39.25,32.78177,31.91667,0.25053,0.77522,0.03443,0.09802,0.09233
1,1,Alaska,AK,Eagle River,Anchorage,City,61.17425,-149.284329,509234900.0,1859309,...,0.23925,37.26216,39.33333,38.97956,39.66667,0.94989,0.54158,0.00782,0.13575,0.31485
2,2,Alaska,AK,Jber,Anchorage,City,61.284745,-149.653973,270593000.0,66534601,...,1.0,21.96291,22.25,22.20427,23.16667,0.00759,0.57135,0.01,0.01838,0.40027
3,3,Alaska,AK,Anchorage,Point Mackenzie,City,61.22956,-149.893037,2371512.0,9966854,...,0.18899,35.81912,34.0,37.0075,34.0,0.20247,0.50361,0.0,0.21563,0.28076
4,4,Alaska,AK,Anchorage,Anchorage,City,61.217082,-149.767214,1979230.0,0,...,0.43422,34.1311,30.16667,34.96611,31.75,0.56936,0.59349,0.06731,0.08711,0.25209


In [6]:
# Feature groups used as inputs to the factor analysis.
pop_data = ['pop', 'male_pop', 'female_pop', 'male_age_median', 'female_age_median']
marital_status = ['married', 'separated', 'divorced', 'single']
real_data = ['AWater', 'ALand']
fa_features = pop_data + real_data + marital_status

# Keep only rows with every feature present. The explicit copy guarantees
# that adding columns to `dat` below never writes into a view of `df`.
dat = df[fa_features].dropna().copy()

# Standardize features by removing the mean and scaling to unit variance.
x = StandardScaler().fit_transform(dat[fa_features].values)

# Fit a two-factor model on the standardized features.
FA = FactorAnalysis(n_components=2).fit(x)

# Model-implied covariance matrix: loadings^T * loadings + diag(psi).
# np.matrix is kept so `*` stays a matrix product for any later cell
# that consumes `loadings` or `cov`.
loadings = np.matrix(FA.components_)                # estimated loadings
diag_psi = np.matrix(np.diag(FA.noise_variance_))   # diagonal noise variances (psi)
cov = loadings.T * loadings + diag_psi

# Latent scores. The model was fitted on the standardized matrix `x`, so the
# scores must be computed from `x` as well (not from the raw values), and
# only once for both factors.
scores = FA.transform(x)
dat['latent_1'] = scores[:, 0]
dat['latent_2'] = scores[:, 1]

# Attach the scores to the main frame by index. Dropping any previous copy
# of the columns first makes this cell safe to re-run (a plain join would
# raise on the overlapping column names).
latent_cols = ['latent_1', 'latent_2']
df = df.drop(columns=latent_cols, errors='ignore').join(dat[latent_cols])

In [8]:
# Display names for the features shown in the loading plot.
flds = {'pop': 'Population', 'male_pop': 'Male Population', 'female_pop': 'Female Population',
        'male_age_median': 'Male Age Median', 'female_age_median': 'Female Age Median',
        'married': 'Married', 'divorced': 'Divorced', 'separated': 'Separated', 'single': 'Single',
        'AWater': 'Amount of Water by State', 'ALand': 'Amount of Land by State'}

# Plot constants
C1 = 'rgba(44, 62, 80, 1)'    # axis colour
C2 = 'rgba(44, 62, 80, .2)'   # grid / zero-line colour
MAX = 300                     # number of translucent overlay discs
TRANSPARENT_LINE = {'color': 'rgba(0, 0, 0,0)'}

# Backdrop: a dark unit disc centred on the origin, drawn below the traces.
shapes = [{'type': 'circle', 'layer': 'below', 'xref': 'x', 'yref': 'y',
           'x0': -1, 'y0': -1, 'x1': 1, 'y1': 1,
           'fillcolor': 'rgba(44, 62, 80, .35)',
           'line': TRANSPARENT_LINE}]

# Stack MAX near-white, mostly transparent discs on top of the backdrop.
# Radii grow cubically from 0 towards 1, so many discs overlap near the
# centre and few reach the rim.
for i in range(MAX):
    radius = i**3 / MAX**3
    shapes.append({'type': 'circle', 'layer': 'below', 'xref': 'x', 'yref': 'y',
                   'x0': -radius, 'y0': -radius, 'x1': radius, 'y1': radius,
                   'fillcolor': 'rgba(250,250,250, .1)',
                   'line': TRANSPARENT_LINE})

# One line per feature, from the origin to its (factor 1, factor 2) loading.
# Names come from the explicit feature list used to fit the model rather than
# from dat.columns, which also carries the appended latent_* columns.
feature_names = pop_data + real_data + marital_status
trace = []
for i in range(loadings.shape[1]):
    col_name = flds[feature_names[i]]
    trace.append(go.Scatter(x=[0, loadings[0, i]],
                            y=[0, loadings[1, i]],
                            line={'width': 3},
                            marker=dict(size=8),
                            name=col_name))

# A plain dict is used for the margin instead of the deprecated go.Margin.
# The axis titles now close their bold tag with </b>.
layout = go.Layout(shapes=shapes, width=700, height=700,
                   margin=dict(l=50, r=50, b=100, t=100, pad=4),
                   xaxis=dict(zerolinecolor=C2, gridcolor=C2, range=[-1.25, 1.25],
                              color=C1, title='<b>Latent Factor<sub>1</sub></b>'),
                   yaxis=dict(zerolinecolor=C2, gridcolor=C2, range=[-1.25, 1.25],
                              color=C1, title='<b>Latent Factor<sub>2</sub></b>'),
                   font=dict(family='Open Sans', size=14),
                   title='<b>Factor Analysis: LF<sub>1</sub> & LF<sub>2</sub></b>')

fig = go.Figure(data=trace, layout=layout)

# Write the figure to a standalone HTML file. The extension is spelled out so
# plotly does not have to warn and append it; the target file is unchanged.
# NOTE(review): 'shape' does not look like a documented plotly.js config
# option - confirm whether it has any effect before relying on it.
py.plot(fig, filename='basic-line.html',
        config={'displayModeBar': False, 'showLink': False,
                'shape': {'layer': 'below', 'hoverinfo': 'none'}})


'basic-line.html'