In [None]:
# This code is prepared by Orhan Erdem

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pylab as plt

%matplotlib inline

In [None]:
# Load the Boston housing data; the path is relative to the notebook's working directory.
housing_df = pd.read_csv('BostonHousing.csv')

In [None]:
# Preview the first 10 rows of the housing data.
housing_df.head(10)

In [None]:
# Summary statistics (pandas `describe`) for the housing data.
housing_df.describe()

In [None]:
# Summary statistics for the CRIM column only.
housing_df['CRIM'].describe()

In [None]:
# Mean of the CRIM column (unrounded).
housing_df['CRIM'].mean()

In [None]:
# Report the mean of the CRIM column, rounded to two decimals.
crim_mean = housing_df['CRIM'].mean().round(2)
print('The mean value of the crime variable is', crim_mean)

In [None]:
# Count missing values per column.
housing_df.isnull().sum()

In [None]:
# Pairwise correlation matrix of the housing columns, rounded to two decimals.
housing_df.corr().round(2)

In [None]:
# Correlation matrix of the housing data (rounded to one decimal),
# drawn as an annotated heatmap with the column names on both axes.
corr = housing_df.corr().round(1)
sns.heatmap(
    corr,
    annot=True,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)
plt.show()


# Principal Component Analysis with Cereal Data

In [None]:
#This data is from here: http://lib.stat.cmu.edu/datasets/1993.expo/cereal

In [None]:
# Load the cereal data, using the first spreadsheet column as the row index,
# then preview the first rows.
cereal_df = pd.read_excel('cereal.xlsx', index_col=0)
cereal_df.head()

In [None]:
# Summary statistics (pandas `describe`) for the cereal data.
cereal_df.describe()

In [None]:
# Before calculating the correlation coefficients, we need to drop the non-numeric columns.
# errors='ignore' keeps this cell re-runnable: because the drop happens in place,
# a second execution would otherwise raise KeyError (the columns are already gone).
cereal_df.drop(['mfr','Type'], axis=1, inplace=True, errors='ignore')

In this context, `axis=1` indicates that the operation targets columns, while `axis=0` would target rows. To remove multiple columns at once, such as 'mfr' and 'Type', pass them as a list: `cereal_df.drop(['mfr', 'Type'], axis=1, inplace=True)`.



The `inplace` parameter specifies whether the operation modifies the original DataFrame or returns a new DataFrame with the changes. When `inplace=True`, the original DataFrame (`cereal_df`) is modified directly: after the command runs, it no longer has the columns 'mfr' and 'Type'. If `inplace` were set to `False` (or left unspecified, since `False` is the default), the operation would return a new DataFrame without those columns, and the original `cereal_df` would remain unchanged.

In [None]:
# Correlation matrix of the remaining cereal columns (rounded to one decimal),
# drawn as an annotated heatmap with the column names on both axes.
corr = cereal_df.corr().round(1)
sns.heatmap(
    corr,
    annot=True,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)
plt.show()

In [None]:
# Single-component PCA on the standardized 'calories' and 'sugar' columns.
calories_sugar_scaled = preprocessing.scale(cereal_df[['calories','sugar']])
pca = PCA(n_components=1)
pca.fit(calories_sugar_scaled)

In [None]:
# Summary table for the one-component PCA: one row per statistic, one column per component.
explained_ratio = pca.explained_variance_ratio_
pcaSummary = pd.DataFrame(
    [
        np.sqrt(pca.explained_variance_),
        explained_ratio,
        np.cumsum(explained_ratio),
    ],
    index=['Standard Deviation', 'Proportion of variance', 'Cumulative proportion'],
    columns=['PC1'],
)
pcaSummary.round(4)

In [None]:
# Project the cereals onto PC1. `pca` was fitted on the standardized
# calories/sugar columns, so the same standardization has to be applied before
# transform(); feeding the raw columns would give scores on the wrong scale.
calories_sugar_scaled = preprocessing.scale(cereal_df[['calories','sugar']])
scores = pd.DataFrame(pca.transform(calories_sugar_scaled),
                      columns=['PC1'])
scores.head()

# PCA with 12 variables

In [None]:
# Fit a 5-component PCA on the standardized columns from position 3 onward,
# using only the rows that have no missing values.
# NOTE(review): 'mfr' and 'Type' were already dropped earlier in the notebook, so
# iloc[:, 3:] also skips the first three remaining columns — confirm this is the
# intended variable set.
pca2_features = cereal_df.iloc[:, 3:].dropna(axis=0)
pca2 = PCA(n_components=5)
pca2.fit(preprocessing.scale(pca2_features))

In [None]:
# Summary table for pca2: one row per statistic, one column per component.
# Column labels are derived from the fitted model so the table stays valid
# if the number of components requested for pca2 is changed.
pc_labels = [f'PC{i}' for i in range(1, pca2.n_components_ + 1)]
pca2Summary = pd.DataFrame(
    [
        np.sqrt(pca2.explained_variance_),
        pca2.explained_variance_ratio_,
        np.cumsum(pca2.explained_variance_ratio_),
    ],
    index=['Standard Deviation', 'Proportion of variance', 'Cumulative proportion'],
    columns=pc_labels,
)
pca2Summary.round(4)

In [None]:
# Component loadings: one row per principal component, one column per input variable.
loading_columns = cereal_df.columns[3:]
df_comp = pd.DataFrame(data=pca2.components_, columns=loading_columns)
df_comp

In [None]:
# Scores on the pca2 components. pca2 was fitted on the standardized rows that
# have no missing values, so transform() needs the same preprocessing: raw values
# would give wrongly scaled scores, and scikit-learn's PCA rejects input containing NaN.
pca2_input = cereal_df.iloc[:,3:].dropna(axis=0)
scores = pd.DataFrame(pca2.transform(preprocessing.scale(pca2_input)),
                      index=pca2_input.index,  # keep row labels, since dropna may remove rows
                      columns=['PC1','PC2','PC3','PC4', 'PC5'])
scores.head()

In [None]:
# Heatmap of the component loadings (rows: components, columns: variables).
sns.heatmap(df_comp, cmap='plasma')
plt.show()  # render the figure without echoing the Axes repr, like the other heatmap cells

# Extra Material

In [None]:
%%capture
fig,axes=plt.subplots(1,2,figsize=(15,6))
ticks=np.arange(pca2.n_components_)+1
ax=axes[0]
ax.plot(ticks,
       pca2.explained_variance_ratio_,
       marker='o')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Proportion of Variance Explained')
ax.set_ylim([0,1])
ax.set_xticks(ticks)
fig

In [None]:
# Right-hand panel: cumulative proportion of variance explained.
# Reuses `fig`, `axes` and `ticks` created in the previous cell.
cumulative_ratio = pca2.explained_variance_ratio_.cumsum()
ax = axes[1]
ax.plot(ticks, cumulative_ratio, marker='o')
ax.set_xticks(ticks)
ax.set_ylim([0, 1])
ax.set_xlabel('Principal Component')
ax.set_ylabel('Cumulative Proportion of Variance Explained')
fig  # last expression: displays the completed two-panel figure