## Preprocessing

In [0]:
import numpy as np
import pandas as pd

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# Load the USArrests CSV through Spark and shape it into a typed pandas frame.
url = "abfss://training@sa8451learningdev.dfs.core.windows.net/interpretable_machine_learning/eml_data/USArrests.csv"

# Target dtype per column; the casts below are applied in one pass.
int_cols = ["Assault", "UrbanPop"]
float_cols = ["Murder", "Rape"]
column_types = {name: int for name in int_cols}
column_types.update({name: float for name in float_cols})

USArrests = (
    spark.read.option("header", "true").csv(url)
    .toPandas()
    # '_c0' is presumably the label given to the header-less first CSV
    # column (TODO confirm); it becomes the row index.
    .rename(columns={'_c0': 'Index'})
    .set_index("Index")
    .astype(column_types)
)

In [0]:
# Preview the first rows of the loaded frame (head() defaults to five rows).
USArrests.head()

In [0]:
# Iterating a DataFrame yields its column labels, so this lists the variables.
list(USArrests)

In [0]:
# Per-column means of the raw (unscaled) variables.
USArrests.mean()

In [0]:
# Per-column variances of the raw variables. PCA is sensitive to scale, which
# is why the data is standardized before the decomposition below.
USArrests.var()

### Principal Components Analysis

In [0]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [0]:
import warnings

# NOTE(review): this silences *every* warning for the rest of the session.
# Kept for backward compatibility; consider narrowing it to the specific
# category/module that was noisy once that is known.
warnings.filterwarnings('ignore')

# Standardize each variable to zero mean / unit variance. StandardScaler
# returns a bare NumPy array, so the row labels and column names are
# re-attached here rather than being lost (or patched in by a later cell).
df = pd.DataFrame(
    StandardScaler().fit_transform(USArrests),
    index=USArrests.index,
    columns=USArrests.columns,
)

In [0]:
# Label the standardized columns with the original variable names, then
# preview the result.
df.columns = USArrests.columns
df.head()

In [0]:
# Dtypes and non-null counts of the standardized frame.
df.info()

In [0]:
# Sanity-check the standardization: one row per variable with its mean and
# standard deviation. Standard deviations slightly above 1 are expected,
# because pandas' std() defaults to ddof=1 while StandardScaler scales by the
# population standard deviation (ddof=0).
df_mean = df.mean().to_frame(name='mean')
df_std = df.std().to_frame(name='standard deviation')
df_moments = df_mean.join(df_std)
df_moments

In [0]:
# Fit a 4-component PCA on the standardized data and keep the projected
# observations (scores) in a frame with one column per component.
pc_labels = ['PC{}'.format(k) for k in range(1, 5)]  # 'PC1' .. 'PC4'

pca = PCA(n_components=4)
pca_data = pca.fit_transform(df)

principalDf = pd.DataFrame(pca_data, columns=pc_labels)
principalDf.head()

In [0]:
# Dtypes and non-null counts of the principal-component score table.
principalDf.info()

In [0]:
# pca.components_ is laid out as (component, variable); transpose it so each
# row is an original variable and each column a principal component.
loadings = np.transpose(pca.components_)
loadings_df = pd.DataFrame(
    data=loadings,
    index=df.columns,
    columns=principalDf.columns,
)
loadings_df

In [0]:
# (rows, columns) of the principal-component score table.
principalDf.shape

In [0]:
def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    Parameters
    ----------
    v0 : sequence of two floats
        Arrow tail; passed to ``annotate`` as the text position.
    v1 : sequence of two floats
        Arrow head; passed to ``annotate`` as the annotated point.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
    """
    if ax is None:
        ax = plt.gca()
    # Only shrinkA/shrinkB are valid arrow properties; the previous
    # shrinkC/shrinkD entries are rejected by Matplotlib when the arrow
    # patch is created, so they are not passed.
    arrowprops = dict(arrowstyle='->', linewidth=2, shrinkA=0, shrinkB=0)
    # Empty label: the annotation is used purely for its arrow.
    ax.annotate('', v1, v0, arrowprops=arrowprops)

In [0]:
# Biplot: observations as PC1/PC2 scores, plus one arrow per original
# variable showing how that variable loads on the two components.
plt.xkcd()  # hand-drawn style; note this changes rcParams for later figures too
fig, ax = plt.subplots(figsize=(25, 10))
ax.scatter(principalDf['PC1'], principalDf['PC2'], alpha=0.25, s=200, color='green')
ax.set_title('principal components', fontsize=30, color='m')
ax.set_xlabel('principal component 1', fontsize=20, color='c')
ax.set_ylabel('principal component 2', fontsize=20, color='c')

# Rows of pca.components_ are principal axes and columns are the original
# variables, so column j of the first two rows gives variable j's coordinates
# in the PC1/PC2 plane. (The previous loop instead took the first two
# *variable* coordinates of every principal axis, which do not live on these
# axes.) Each coordinate is scaled by 2 * the component's standard deviation,
# the same visual scale factor as before.
pc_std = np.sqrt(pca.explained_variance_[:2])
for feature, weights in zip(df.columns, pca.components_[:2].T):
    dx, dy = 2 * weights * pc_std
    # Arrows start at the origin: the scores are centered, so no mean offset
    # belongs in the displacement.
    ax.arrow(0, 0, dx, dy,
             width=0.09, facecolor='orange', edgecolor='orange', alpha=0.5)
    ax.annotate(feature, xy=(dx, dy), xytext=(8, 8),
                textcoords='offset points', fontsize=20)

In [0]:
# Standard deviation of each principal component: the square root of the
# variance explained by that component.
PSTD = np.sqrt(pca.explained_variance_)
PSTD

In [0]:
# Variance explained by each principal component.
PEV = pca.explained_variance_
PEV

In [0]:
# Proportion of total variance explained by each principal component.
PVE = pca.explained_variance_ratio_
PVE

In [0]:
# Cumulative proportion of variance explained by the first k components.
plt.xkcd()  # hand-drawn style; note this changes rcParams for later figures too
fig, ax = plt.subplots(figsize=(25, 10))

# Plot against 1..k so the x axis reads as component numbers; without explicit
# x values the points would sit at 0..k-1 and PC1 would be labelled "0".
component_numbers = np.arange(1, len(PVE) + 1)
ax.plot(component_numbers, np.cumsum(PVE), lw=5.0, ls='-.', color='g',
        marker='o', markersize=15, markerfacecolor='orange')
ax.set_xticks(component_numbers)  # integer ticks only: one per component
ax.set_xlabel('principal component', fontsize=20, color='c')
ax.set_ylabel('cumulative proportion of variance explained', fontsize=20, color='c')
ax.set_title('principal components cumulative explained variance', fontsize=30, color='m');