***
***
***
***
***
<br><h1>A3: Timed Unsupervised Learning Project</h1>
<h2> Machine Learning </h2><br><br>
Hult International Business School<br><br><br>

***
***
***
***
***

<strong>Importing the necessary python packages that are needed for the analysis</strong>

In [None]:
import pandas                as pd                      # data science essentials
import matplotlib.pyplot     as plt                     # fundamental data visualization
import seaborn               as sns                     # enhanced visualization
from sklearn.preprocessing   import StandardScaler      # standard scaler
from sklearn.decomposition   import PCA                 # pca
from scipy.cluster.hierarchy import dendrogram, linkage # dendrograms
from sklearn.cluster         import KMeans              # k-means clustering

<br><br><br><strong>Running the User defined functions that will be used throughout our analysis</strong>

In [None]:
########################################
# scree_plot
########################################
def scree_plot(pca_object, export = False):
    """
    Draws a scree plot: the explained variance ratio of each principal
    component of a fitted PCA object, in component order.

    PARAMETERS
    ----------
    pca_object : fitted PCA object; its n_components_ and
                 explained_variance_ratio_ attributes are read
    export     : bool, when True the figure is also saved as
                 'scree_plot.png' before being shown, default False
    """

    # one x position per principal component (0-based)
    component_ids = range(pca_object.n_components_)

    # creating the figure and drawing the variance curve on its axes
    fig, ax = plt.subplots(figsize = (10, 8))
    ax.plot(component_ids,
            pca_object.explained_variance_ratio_,
            linewidth       = 2,
            marker          = 'o',
            markersize      = 10,
            markeredgecolor = 'black',
            markerfacecolor = 'grey')

    # titles, axis labels and one tick per component
    ax.set_title('Scree Plot')
    ax.set_xlabel('PCA feature')
    ax.set_ylabel('Explained Variance')
    ax.set_xticks(component_ids)

    # optionally writing the figure to disk
    if export == True:
        plt.savefig('scree_plot.png')

    # displaying the plot
    plt.show()

    
########################################
# inertia plot
########################################
def inertia_plot(data, max_clust = 50, random_state = 222):
    """
    Fits one k-means model per candidate number of clusters and plots the
    inertia of each model against k.

    PARAMETERS
    ----------
    data         : DataFrame, data from which to build clusters. Dataset should be scaled
    max_clust    : int, exclusive upper bound of the cluster counts to check
                   (k runs from 1 to max_clust - 1), default 50
    random_state : int or None, seed passed to every KMeans model so the curve
                   is the same on every run; the default 222 is the seed value
                   this notebook also passes to its PCA objects
    """

    # k-means cannot build more clusters than there are observations, so the
    # candidate range is capped at the number of rows in data
    upper_bound = min(max_clust, len(data) + 1)
    ks = range(1, upper_bound)

    # INSTANTIATING and FITTING one seeded model per k, keeping its inertia
    inertias = [KMeans(n_clusters   = k,
                       random_state = random_state).fit(data).inertia_
                for k in ks]

    # plotting ks vs inertias
    fig, ax = plt.subplots(figsize = (12, 8))
    ax.plot(ks, inertias, '-o')

    # labeling and displaying the plot
    ax.set_xlabel('number of clusters, k')
    ax.set_ylabel('inertia')
    ax.set_xticks(ks)
    plt.show()

<br><br><br>
***
***
***
<h3>Exploratory Data Analysis</h3>

***
***
***

<br><br><br><strong>Loading the original dataset</strong>

In [None]:
#Reading the survey workbook into a dataframe
original_df = pd.read_excel(io = 'Survey_Data_Final_Exam.xlsx')

#Displaying the loaded dataset
original_df

In [None]:
#Printing the summary that info() produces for the dataframe
original_df.info() 



In [None]:
#Counting how many columns contain at least one missing value:
#isnull().any() flags each column that has a gap and sum() counts the flagged columns
#(this is a count of affected columns, not of individual missing cells)
original_df.isnull().any().sum()


In [None]:
#Showing the ethnicity entries that are missing, together with their row labels
original_df.loc[original_df['What is your ethnicity?'].isnull(), 'What is your ethnicity?']


In [None]:
#Looking at all the other columns of every row whose ethnicity is missing.
#The rows are located from the data itself rather than a hard-coded position,
#so the cell keeps pointing at the right observation if the file changes.
missing_ethnicity = original_df['What is your ethnicity?'].isnull()

for row_label in original_df.index[missing_ethnicity]:
    print(original_df.loc[row_label, :])


<strong>Question: </strong>How is this guy 45 years old? Fake observation or someone just playing around?

In [None]:
#Filling the missing ethnicity answer with an explicit 'Unanswered' label.
#Only the ethnicity column is targeted, so a gap in a numeric column would not be
#silently replaced by a string, and the result is re-assigned instead of mutating
#the dataframe in place.
original_df = original_df.fillna({'What is your ethnicity?': 'Unanswered'})


In [None]:
#Rechecking how many columns still contain at least one missing value
original_df.isnull().any().sum()


In [None]:
#Descriptive statistics of the dataset, rounded to two decimals
original_df.describe().round(decimals = 2)


In [None]:
#Renaming the columns for easier identification and usage.
#The assignment is positional: each name below replaces the existing column
#label at the same position, so the order of this list must match the file.
original_df.columns = ['surveyID', 
                       # items selected further below as the Big 5 personality subset
                       'life_of_the_party', 
                       'unconcerned_for_others', 
                       'always_prepared', 
                       'stressed_out_easily', 
                       'have_rich_vocabulary', 
                       'less_talkative', 
                       'interested_in_people', 
                       'leave_belongings_around', 
                       'relaxed', 
                       'difficulty_understanding_abstract_ideas', 
                       'comfortable_around_people', 
                       'insult_people', 
                       'attention_to_detail',
                       'worrisome', 
                       'vivid_imagination', 
                       'keep_in_the_background', 
                       'sympathetic', 
                       'messy_person', 
                       'seldom_feel_blue', 
                       'uninterested_in_abstract_ideas', 
                       'starts_conversations', 
                       'uninterested_in_peoples_problems', 
                       'get_chores_done', 
                       'easily_disturbed', 
                       'have_excellent_ideas', 
                       'have_little_to_say', 
                       'soft_hearted', 
                       'forget_to_put_things_back', 
                       'get_upset_easily', 
                       'no_good_imagination', 
                       'socialize_at_parties', 
                       'uninterested_in_others', 
                       'like_order', 
                       'moody', 
                       'quick_at_understanding_things', 
                       'averts_attention_to_self', 
                       'take_time_out_for_others', 
                       'avoids_duties', 
                       'mood_swings',
                       'use_difficult_words', 
                       'likes_being_center_of_attention', 
                       'feel_others_emotions', 
                       'follows_schedule', 
                       'irritated_easily', 
                       'reflects_on_things', 
                       'quite_around_strangers', 
                       'make_people_feel_at_ease', 
                       'exacting_at_work', 
                       'feels_blue', 
                       'full_of_ideas', 
                       # items selected further below as the Hult DNA subset
                       # (three questions appear twice and carry a "(duplicate)" suffix)
                       'sees_underlying_patters_in_complex_situations', 
                       'dont_generate_new_ideas',
                       'aware_of_personal_strengths/weakness', 
                       'growth_mindset', 
                       'responds_effectively_to_priorities', 
                       'takes_initiative', 
                       'encourages_direct/open_discussions', 
                       'responds_effectively_to_priorities(duplicate)', 
                       'takes_initiative(duplicate)', 
                       'encourages_direct/open_discussions(duplicate)', 
                       'good_listener', 
                       'dont_persuasively_sell_vision/idea', 
                       'builds_cooperative_relationships', 
                       'works_well_with_people_from_diverse_cultures', 
                       'effectively_negotiates_interests/resources/roles', 
                       'cant_rally_team_towards_common_goal', 
                       'translates_ideas_into_plans_that_are_organized/realistic', 
                       'resolves_conflicts_constructively', 
                       'seeks/uses_teammates_feedback', 
                       'coaches_teammates_for_performance/growth', 
                       'drive_for_results', 
                       # laptop and demographic columns (dropped before scaling further below)
                       'current_laptop', 
                       'preferred_laptop', 
                       'program', 
                       'age', 
                       'gender',
                       'nationality', 
                       'ethnicity']

In [None]:
#Printing the value counts of each categorical variable, separated by blank lines
categorical_columns = ['current_laptop',
                       'preferred_laptop',
                       'program',
                       'gender',
                       'nationality',
                       'ethnicity']

for position, column in enumerate(categorical_columns):

    #separator between consecutive tables (not before the first one)
    if position > 0:
        print("\n")

    print(original_df.loc[:, column].value_counts())


In [None]:
#Analysis of current laptop brand per answer

current_laptop = original_df.loc[:, 'current_laptop']

#Tallying the answers: every answer that is neither "Windows laptop" nor
#'Chromebook' is counted as a Macbook
current_total      = len(current_laptop)
current_windows    = int((current_laptop == "Windows laptop").sum())
current_chromebook = int((current_laptop == 'Chromebook').sum())
current_macbook    = current_total - current_windows - current_chromebook

#Share of each brand as a percentage of all answers, one decimal
perc_windows, perc_macbook, perc_chromebook = (
    round((brand_count/current_total) * 100, 1)
    for brand_count in (current_windows, current_macbook, current_chromebook)
)

print(f"""
        Total computers:           {current_total}
        Total current Windows:     {current_windows}, a {perc_windows}% of the total.
        Total current Macbooks:    {current_macbook}, a {perc_macbook}% of the total.
        Total current Chromebooks: {current_chromebook}, a {perc_chromebook}% of the total.
        _____________________________________________________
        """)


In [None]:
#Analysis of preferred laptop brand per answer

preferred_laptop_laptop = original_df.loc[:, 'preferred_laptop']

#Tallying the answers: every answer that is neither "Windows laptop" nor
#'Chromebook' is counted as a Macbook
preferred_laptop_total      = len(preferred_laptop_laptop)
preferred_laptop_windows    = int((preferred_laptop_laptop == "Windows laptop").sum())
preferred_laptop_chromebook = int((preferred_laptop_laptop == 'Chromebook').sum())
preferred_laptop_macbook    = (preferred_laptop_total
                               - preferred_laptop_windows
                               - preferred_laptop_chromebook)

#Change in units between preferred and current brand
diff_wind = preferred_laptop_windows - current_windows
diff_mac  = preferred_laptop_macbook - current_macbook

#The same change expressed as a percentage of the current count, one decimal
change_windows = round((diff_wind/current_windows) * 100, 1)
change_macbook = round((diff_mac/current_macbook) * 100, 1)


print(f"""
        Total computers:             {preferred_laptop_total}
        Total preferred Windows:     {preferred_laptop_windows}, a change of {diff_wind} units ({change_windows}%).
        Total preferred Macbooks:    {preferred_laptop_macbook}, a change of {diff_mac} units ({change_macbook}%).
        Total preferred Chromebooks:  {preferred_laptop_chromebook}, a new player!
        _____________________________________________________
        """)


<strong>INSIGHT 1:</strong><br>
When it comes to brand and operating system, the sample is evenly distributed: 51% of respondents own a Macbook (200 people) and 49% own a Windows laptop. However, if all devices had the same retail price, the distribution would change as follows: Macbook would increase by 19 units (56% of total), Windows would decrease by 29 units (41.5% of total), and Chromebook would have 10 units (2.5%). Note that although Macbook would increase in total units, it would lose 16 users to Windows and 3 users to Chromebook.

<br><br><br><strong>To maintain the similarity between features as required in Unsupervised learning, dropping variables that do not match with the survey questions, where people gave a rating between 1 and 5</strong>

In [None]:
#Removing the identifier, laptop and demographic columns so that only the
#rated survey items remain
non_rating_columns = ['surveyID',
                      'current_laptop',
                      'preferred_laptop',
                      'program',
                      'age',
                      'gender',
                      'nationality',
                      'ethnicity']

survey_answers = original_df.drop(columns = non_rating_columns)

#Viewing the new dataset
survey_answers


<br><br><strong>Separating columns that relate to Big 5 personality traits and Hult DNA.</strong>


In [None]:
#Subsetting the Big5 personality traits.
#In survey_answers these 50 items form one contiguous run of columns, from
#'life_of_the_party' through 'full_of_ideas', so an inclusive label slice
#selects exactly the same columns, in the same order, as naming each one.
#.copy() keeps the subset independent of survey_answers.
survey_answers_big5 = survey_answers.loc[:, 'life_of_the_party':'full_of_ideas'].copy()

#Checking the results
survey_answers_big5


In [None]:
#Choosing columns with Hult DNA.
#In survey_answers these 21 items form one contiguous run of columns, from
#'sees_underlying_patters_in_complex_situations' through 'drive_for_results',
#so an inclusive label slice selects exactly the same columns, in the same
#order, as naming each one. .copy() keeps the subset independent of survey_answers.
survey_answers_hultdna = survey_answers.loc[:, 'sees_underlying_patters_in_complex_situations':
                                               'drive_for_results'].copy()

#Checking the dataset
survey_answers_hultdna


In [None]:
#print(survey_answers_hultdna['responds_effectively_to_priorities'].corr(survey_answers_hultdna['responds_effectively_to_priorities(duplicate)']))
#print("\n")
#print(survey_answers_hultdna['takes_initiative'].corr(survey_answers_hultdna['takes_initiative(duplicate)']))
#print("\n")
#print(survey_answers_hultdna['encourages_direct/open_discussions'].corr(survey_answers_hultdna['encourages_direct/open_discussions(duplicate)']))

<strong>Note:</strong> There are 3 repeated questions, labeled as (duplicates). We will use them as is, since they look highly correlated and the PCA algorithm works fine with highly correlated explanatory variables.

<br><br><br><strong>Scaling the explanatory variables, so that they can be used in PCA and k-means clustering methods</strong>

<h4>Scaling the whole survey_answers explanatory variables</h4>

In [None]:
#Instantiating a StandardScaler object
scaler = StandardScaler()

#Fitting the scaler object to the explanatory data
scaler.fit(survey_answers)

#Transforming the fit data
survey_answers_scaled = scaler.transform(survey_answers)

#Converting into a dataframe object that keeps the original column names
survey_answers_scaled_df = pd.DataFrame(survey_answers_scaled,
                                        columns = survey_answers.columns)

#Checking the variance before and after scaling.
#DataFrame.var(ddof = 0) gives the population variance of each column; it replaces
#pd.np.var, because the pd.np alias is no longer available in current pandas versions
print(survey_answers.var(ddof = 0))
print("\n")
print(survey_answers_scaled_df.var(ddof = 0))


<h4>Scaling the Big5 personality traits explanatory variables</h4>

In [None]:
#Instantiating a StandardScaler object
scaler = StandardScaler()

#Fitting the scaler object to the explanatory data
scaler.fit(survey_answers_big5)

#Transforming the fit data
survey_answers_big5_scaled = scaler.transform(survey_answers_big5)

#Converting into a dataframe object that keeps the original column names
survey_answers_big5_scaled_df = pd.DataFrame(survey_answers_big5_scaled,
                                             columns = survey_answers_big5.columns)

#Checking the variance before and after scaling.
#DataFrame.var(ddof = 0) gives the population variance of each column; it replaces
#pd.np.var, because the pd.np alias is no longer available in current pandas versions
print(survey_answers_big5.var(ddof = 0))
print("\n")
print(survey_answers_big5_scaled_df.var(ddof = 0))


<h4>Scaling the Hult DNA explanatory variables</h4>

In [None]:
#Instantiating a StandardScaler object
scaler = StandardScaler()

#Fitting the scaler object to the explanatory data
scaler.fit(survey_answers_hultdna)

#Transforming the fit data
survey_answers_hultdna_scaled = scaler.transform(survey_answers_hultdna)

#Converting into a dataframe object that keeps the original column names
survey_answers_hultdna_scaled_df = pd.DataFrame(survey_answers_hultdna_scaled,
                                                columns = survey_answers_hultdna.columns)

#Checking the variance before and after scaling.
#DataFrame.var(ddof = 0) gives the population variance of each column; it replaces
#pd.np.var, because the pd.np alias is no longer available in current pandas versions
print(survey_answers_hultdna.var(ddof = 0))
print("\n")
print(survey_answers_hultdna_scaled_df.var(ddof = 0))

<br><br><br><strong>Creating a correlation heatmap</strong>

In [None]:
#Setting plot size
#fig, ax = plt.subplots(figsize = (12, 12))


#Developing a correlation matrix object
#survey_answers_scaled_df_corr = survey_answers_scaled_df.corr().round(2).iloc[0:19, 0:19]


#Creating a correlation heatmap
#sns.heatmap(survey_answers_scaled_df_corr,
#            cmap   = 'coolwarm',
#            square = True,
#            annot  = True)

#Display the heatmap
#plt.show()

In [None]:
#Setting plot size
#fig, ax = plt.subplots(figsize = (12, 12))


#Developing a correlation matrix object
#survey_answers_scaled_df_corr = survey_answers_scaled_df.corr().round(2).iloc[20:39, 20:39]


#Creating a correlation heatmap
#sns.heatmap(survey_answers_scaled_df_corr,
#            cmap   = 'coolwarm',
#            square = True,
#            annot  = True)

#Display the heatmap
#plt.show()

In [None]:
#Setting plot size
#fig, ax = plt.subplots(figsize = (12, 12))


#Developing a correlation matrix object
#survey_answers_scaled_df_corr = survey_answers_scaled_df.corr().round(2).iloc[40:59, 40:59]


#Creating a correlation heatmap
#sns.heatmap(survey_answers_scaled_df_corr,
#            cmap   = 'coolwarm',
#            square = True,
#            annot  = True)

#Display the heatmap
#plt.show()

In [None]:
#Setting plot size
#fig, ax = plt.subplots(figsize = (12, 12))


#Developing a correlation matrix object
#survey_answers_scaled_df_corr = survey_answers_scaled_df.corr().round(2).iloc[60:71, 60:71]


#Creating a correlation heatmap
#sns.heatmap(survey_answers_scaled_df_corr,
#            cmap   = 'coolwarm',
#            square = True,
#            annot  = True)

#Display the heatmap
#plt.show()

<strong>As can be seen from the correlation heatmaps, there are only a few correlations that are above 0.5 or below -0.5. Implying that we could explain high degree of variance using only a few Principal Components in PCA.</strong>

<br><br><br>
***
***
***
<h3>Principal Component Analysis</h3>

***
***
***

<h4>PCA performed on the whole survey_answers_scaled explanatory variables</h4>

In [None]:
#Instantiating a PCA object without specific mention of components
#pca = PCA(n_components = None,
#          random_state = 222)

#Fitting and Transforming the scaled survey data
#pca_survey = pca.fit_transform(survey_answers_scaled)

#Comparing shapes
#print("Original shape:", survey_answers_scaled.shape)
#print("PCA shape     :", pca_survey.shape)


<br><br><br><strong>Plotting a Scree plot to visually detect the number of principal components to be used.</strong>

In [None]:
#Calling the scree plot function
#scree_plot(pca_object=pca)


<strong>Based on the scree plot, 7 principal components are a good choice: the elbow of the plot is at PCA feature 7, after which there is very little change in explained variance.</strong>

In [None]:
#Instanting a PCA object with just the first 7 PC's
#pca7 = PCA(n_components = 7,
#           random_state = 222)

#Fitting and Transforming the scaled survey data
#pca7_survey = pca7.fit_transform(survey_answers_scaled)


<br><br><strong>Checking each components explained Variance ratio</strong>

In [None]:
#Component number counter
#component_number = 0


#Looping over each principal component
#for variance in pca7.explained_variance_ratio_:
#    component_number += 1
#    print(f"PC {component_number} : {variance.round(3)}")


<br><br><br><strong>Understanding the meaning of each principal component by analyzing its factor loading</strong>

In [None]:
#Transposing pca components
#factor_loadings_df = pd.DataFrame(pd.np.transpose(pca7.components_))


#Naming rows as original features
#factor_loadings_df = factor_loadings_df.set_index(survey_answers_scaled_df.columns)


#Checking the result
#print(f"""
#---------------------------------------
#FACTOR LOADING: PRINCIPAL COMPONENT 1
#---------------------------------------""")
#print(factor_loadings_df.iloc[:,0][factor_loadings_df.iloc[:,0]>0.1].sort_values(ascending = False))
#print("\n")
#print(factor_loadings_df.iloc[:,0][factor_loadings_df.iloc[:,0]<-0.18].sort_values(ascending = True))
#print("\n\n")

#print(f"""
#---------------------------------------
#FACTOR LOADING: PRINCIPAL COMPONENT 2
#---------------------------------------""")
#print(factor_loadings_df.iloc[:,1][factor_loadings_df.iloc[:,1]>0.25].sort_values(ascending = False))
#print("\n")
#print(factor_loadings_df.iloc[:,1][factor_loadings_df.iloc[:,1]<-0.07].sort_values(ascending = True))
#print("\n\n")

#print(f"""
#---------------------------------------
#FACTOR LOADING: PRINCIPAL COMPONENT 3
#---------------------------------------""")
#print(factor_loadings_df.iloc[:,2][factor_loadings_df.iloc[:,2]>0.15].sort_values(ascending = False))
#print("\n")
#print(factor_loadings_df.iloc[:,2][factor_loadings_df.iloc[:,2]<-0.16].sort_values(ascending = True))
#print("\n\n")

#print(f"""
#---------------------------------------
#FACTOR LOADING: PRINCIPAL COMPONENT 4
#---------------------------------------""")
#print(factor_loadings_df.iloc[:,3][factor_loadings_df.iloc[:,3]>0.19].sort_values(ascending = False))
#print("\n")
#print(factor_loadings_df.iloc[:,3][factor_loadings_df.iloc[:,3]<-0.16].sort_values(ascending = True))
#print("\n\n")

#print(f"""
#---------------------------------------
#FACTOR LOADING: PRINCIPAL COMPONENT 5
#---------------------------------------""")
#print(factor_loadings_df.iloc[:,4][factor_loadings_df.iloc[:,4]>0.19].sort_values(ascending = False))
#print("\n")
#print(factor_loadings_df.iloc[:,4][factor_loadings_df.iloc[:,4]<-0.11].sort_values(ascending = True))
#print("\n\n")

#print(f"""
#---------------------------------------
#FACTOR LOADING: PRINCIPAL COMPONENT 6
#---------------------------------------""")
#print(factor_loadings_df.iloc[:,5][factor_loadings_df.iloc[:,5]>0.18].sort_values(ascending = False))
#print("\n")
#print(factor_loadings_df.iloc[:,5][factor_loadings_df.iloc[:,5]<-0.14].sort_values(ascending = True))
#print("\n\n")

#print(f"""
#---------------------------------------
#FACTOR LOADING: PRINCIPAL COMPONENT 7
#---------------------------------------""")
#print(factor_loadings_df.iloc[:,6][factor_loadings_df.iloc[:,6]>0.18].sort_values(ascending = False))
#print("\n")
#print(factor_loadings_df.iloc[:,6][factor_loadings_df.iloc[:,6]<-0.14].sort_values(ascending = True))
#print("\n\n")


In [None]:
#Naming the Principal Components based on the factor loadings 
#factor_loadings_df.columns = ['uninterested_folks',
#                              'moody_folks',
#                              'Introverts',
#                              'people-centric/sympathetic_folks',
#                              'relaxed/self-centric_folks',
#                              'unimaginative_folks',
#                              'commanding_speakers']

#factor_loadings_df


<br><br><br><strong>Analyzing how each survey respondent fits into each of the Principal Components</strong>

In [None]:
#Converting the earlier fit and transformed pca7 object into a Dataframe
#pca7_survey_df = pd.DataFrame(pca7_survey)

#Renaming columns
#pca7_survey_df.columns = factor_loadings_df.columns

#Displaying the results
#pca7_survey_df


<br><br><br><h4>PCA performed on the survey_answers_big5_scaled explanatory variables</h4>

In [None]:
#Building a PCA object that keeps every component (n_components = None)
pca = PCA(n_components = None, random_state = 222)

#Fitting the PCA to the scaled Big5 answers and projecting them onto the components
pca_survey_big5 = pca.fit_transform(survey_answers_big5_scaled)

#Comparing the shape of the data before and after the transformation
print(f"Original shape: {survey_answers_big5_scaled.shape}")
print(f"PCA shape     : {pca_survey_big5.shape}")


<br><br><br><strong>Plotting a Scree plot to visually detect the number of principal components to be used.</strong>

In [None]:
#Drawing the scree plot for the PCA fitted on the Big5 answers (not exported to file)
scree_plot(pca_object = pca, export = False)


<strong>Based on the scree plot, 7 principal components are a good choice: the elbow of the plot is at PCA feature 7, after which there is very little change in explained variance.</strong>

In [None]:
#Instantiating a PCA object that keeps only the first 7 principal components
pca7 = PCA(n_components = 7, random_state = 222)

#Fitting the PCA to the scaled Big5 answers and projecting them onto the 7 components
pca7_survey_big5 = pca7.fit_transform(survey_answers_big5_scaled)


<br><br><strong>Checking each component's explained variance ratio</strong>

In [None]:
#Printing the explained variance ratio of each retained component,
#numbering the components from 1
for component_number, variance in enumerate(pca7.explained_variance_ratio_, start = 1):
    print(f"PC {component_number} : {variance.round(3)}")
    

<br><br><br><strong>Understanding the meaning of each principal component by analyzing its factor loading</strong>

In [None]:
#Transposing pca components so that each row is an original feature and each
#column is a principal component; rows are labelled with the feature names.
#(.T replaces pd.np.transpose, because the pd.np alias is no longer available
#in current pandas versions)
factor_loadings_big5_df = pd.DataFrame(pca7.components_.T,
                                       index = survey_answers_big5_scaled_df.columns)


#Display cutoffs per principal component, in component order:
#(show loadings above this value, show loadings below this value)
loading_cutoffs = [(0.16, -0.17),
                   (0.25, -0.07),
                   (0.15, -0.16),
                   (0.19, -0.16),
                   (0.19, -0.11),
                   (0.18, -0.1),
                   (0.18, -0.14)]


#Checking the result: for every component, the strongest positive loadings
#(largest first) followed by the strongest negative loadings (most negative first)
for component_position, (upper_cutoff, lower_cutoff) in enumerate(loading_cutoffs):

    component_loadings = factor_loadings_big5_df.iloc[:, component_position]

    print(f"""
---------------------------------------
FACTOR LOADING: PRINCIPAL COMPONENT {component_position + 1}
---------------------------------------""")
    print(component_loadings[component_loadings > upper_cutoff].sort_values(ascending = False))
    print("\n")
    print(component_loadings[component_loadings < lower_cutoff].sort_values(ascending = True))
    print("\n\n")


In [None]:
#Persona names for the principal components, listed in component order
#(first name labels the first component, and so on); the names were chosen
#from the factor loadings printed above
big5_persona_names = ['prefer_isolation', 'moody', 'non_party_person',
                      'strong_verbal_aptitude', 'relaxed', 'systematic',
                      'creative']

#Applying the persona names as the column labels of the factor loadings
factor_loadings_big5_df.columns = big5_persona_names

#Displaying the labelled factor loadings
factor_loadings_big5_df

<br><br><br><strong>Analyzing how each survey respondent scores on each of the Principal Components</strong>

In [None]:
#Wrapping the earlier fit-and-transformed pca7 output in a DataFrame whose
#columns carry the persona names given to the factor loadings
#NOTE(review): assumes pca7_survey_big5 is the plain array returned by
#fit_transform (not already a DataFrame) - confirm against the earlier cell
pca7_survey_big5_df = pd.DataFrame(data    = pca7_survey_big5,
                                   columns = factor_loadings_big5_df.columns)

#Displaying the results
pca7_survey_big5_df

<br><br><br><h4>PCA performed on the survey_answers_hultdna_scaled explanatory variables</h4>

In [None]:
#Instantiating a PCA object that keeps every component (n_components = None)
pca = PCA(n_components = None, random_state = 206)

#Fitting the PCA on the scaled Hult DNA survey data and projecting the same
#data onto the fitted components
pca_survey_hultdna = pca.fit_transform(survey_answers_hultdna_scaled)

#Comparing shapes before and after the projection
print("Original shape:", survey_answers_hultdna_scaled.shape)
print("PCA shape     :", pca_survey_hultdna.shape)

<br><br><br><strong>Plotting a Scree plot to visually detect the number of principal components to be used.</strong>

In [None]:
#Drawing the scree plot for the PCA object fitted with all components
scree_plot(pca)

<strong>Based on the scree plot, going with 4 principal components is a good choice: the elbow of the plot is at PCA feature 4, after which there is very little change in explained variance.</strong>

In [None]:
#Instantiating a PCA object restricted to the first 4 principal components
pca4 = PCA(n_components = 4, random_state = 222)

#Fitting the 4-component PCA on the scaled survey data and projecting the
#same data onto those components
pca4_survey_hultdna = pca4.fit_transform(survey_answers_hultdna_scaled)

<br><br><strong>Checking each components explained Variance ratio</strong>

In [None]:
#Printing the explained variance ratio of each retained principal component,
#rounded to 3 decimals and numbered from 1
for component_number, variance in enumerate(pca4.explained_variance_ratio_,
                                            start = 1):
    print(f"PC {component_number} : {variance.round(3)}")

<br><br><br><strong>Understanding the meaning of each principal component by analyzing its factor loading</strong>

In [None]:
#Transposing pca components so that rows are the original features and
#columns are the principal components.
#The array's own .T attribute is used instead of pd.np.transpose because the
#pd.np alias was deprecated in pandas 1.0 and removed in pandas 2.0.
factor_loadings_hultdna_df = pd.DataFrame(pca4.components_.T)


#Naming rows as original features
factor_loadings_hultdna_df = factor_loadings_hultdna_df.set_index(survey_answers_hultdna_scaled_df.columns)


#Display cut-offs per principal component, in component order:
#(show loadings above this value, show loadings below this value)
hultdna_loading_cutoffs = [(0.05, -0.23),
                           (0.25, -0.07),
                           (0.15, -0.1),
                           (0.19, -0.16)]


#Checking the result: for each component, the loadings above the upper
#cut-off (largest first) followed by those below the lower cut-off
#(most negative first)
for pc_position, (upper_cutoff, lower_cutoff) in enumerate(hultdna_loading_cutoffs):

    pc_loadings = factor_loadings_hultdna_df.iloc[:, pc_position]

    print(f"""
---------------------------------------
FACTOR LOADING: PRINCIPAL COMPONENT {pc_position + 1}
---------------------------------------""")
    print(pc_loadings[pc_loadings > upper_cutoff].sort_values(ascending = False))
    print("\n")
    print(pc_loadings[pc_loadings < lower_cutoff].sort_values(ascending = True))
    print("\n\n")

In [None]:
#Persona names for the 4 principal components, listed in component order
hultdna_persona_names = ['not_so_bright', 'growth_mindset',
                         'unconvincing', 'inefficient']

#Applying the persona names as the column labels of the factor loadings
factor_loadings_hultdna_df.columns = hultdna_persona_names

#Displaying the labelled factor loadings
factor_loadings_hultdna_df

<br><br><br><strong>Analyzing how each survey respondent scores on each of the Principal Components</strong>

In [None]:
#Wrapping the earlier fit-and-transformed pca4 output in a DataFrame whose
#columns carry the persona names given to the factor loadings
pca4_survey_hultdna_df = pd.DataFrame(data    = pca4_survey_hultdna,
                                      columns = factor_loadings_hultdna_df.columns)

#Displaying the results
pca4_survey_hultdna_df

<br><br><br>
***
***
***
<h3>Clustering</h3>

***
***
***

<h4>Clustering using the chosen 7 Principal components of the survey answers explanatory variables</h4>

In [None]:
#Checking the variance of the pca7_survey object
#pd.np.var(pca7_survey_df)

<strong>Rescaling our data, as the variance among features is no longer the same in the new dataset.</strong>

In [None]:
#Instantiating a Standard Scaler object
#scaler = StandardScaler()

#Fitting the scaler object to the dataset
#scaler.fit(pca7_survey_df)

#Transforming the fit data
#pca7_survey_scaled = scaler.transform(pca7_survey_df)

#Converting it into a dataframe
#pca7_survey_scaled_df = pd.DataFrame(pca7_survey_scaled)

#Renaming the columns
#pca7_survey_scaled_df.columns = factor_loadings_df.columns

#Checking the variance
#pd.np.var(pca7_survey_scaled_df)

<br><br><strong>Creating a dendrogram to determine the number of clusters for KMeans modeling</strong>

In [None]:
#Grouping data based on Ward distance
#standard_mergings_ward = linkage(y = pca7_survey_scaled_df,
#                                 method = 'ward')


#Setting plot size
#fig, ax = plt.subplots(figsize=(12, 12))

#Developing a dendrogram
#dendrogram(Z = standard_mergings_ward,
#           leaf_rotation = 90,
#           leaf_font_size = 6)

#plt.show()

<strong>Plotting an inertia plot to see viable candidates for determining the number of clusters to be used in KMeans modeling</strong>

In [None]:
#Calling the inertia_plot function
#inertia_plot(data = pca7_survey_scaled_df)

<strong>Based on the inertia plot, let's use 5 clusters for building the KMeans model</strong>

In [None]:
#INSTANTIATING a k-Means object with five clusters
#survey_answers_kmeans_pca = KMeans(n_clusters = 5,
#                              random_state = 222)


#Fitting the object to the data
#survey_answers_kmeans_pca.fit(pca7_survey_scaled_df)


#Converting the clusters to a DataFrame
#survey_answers_kmeans_pca_df = pd.DataFrame({'Cluster': survey_answers_kmeans_pca.labels_})


#Checking the results
#print(survey_answers_kmeans_pca_df.iloc[: , 0].value_counts())

<strong>Finding the centroids of each cluster, in an attempt to explain the underlying story of ideal members of each cluster.</strong>

In [None]:
#Creating an object to store the cluster centers
#centroids_survey_answers_kmeans_pca = survey_answers_kmeans_pca.cluster_centers_

#Converting cluster centers into a dataframe
#centroids_survey_answers_kmeans_pca_df = pd.DataFrame(centroids_survey_answers_kmeans_pca)

#Renaming the columns
#centroids_survey_answers_kmeans_pca_df.columns = pca7_survey_scaled_df.columns

#Checking the results
#centroids_survey_answers_kmeans_pca_df

In [None]:
#Concatenating the cluster and PCA components
#cluster_pca_df = pd.concat([survey_answers_kmeans_pca_df,
#                           pca7_survey_df],
#                           axis = 1)

#Checking the results
#cluster_pca_df

In [None]:
#Concatenating the earlier eliminated columns
#final_cluster_pca_df = pd.concat([original_df.loc[:, ['target','current_laptop', 'surveyID', 'program', 'age', 'gender', 'nationality', 'ethnicity']],
#                                 cluster_pca_df], 
#                                 axis = 1)

#Checking the results
#final_cluster_pca_df

<h4>Clustering using the chosen 7 Principal components of the survey answers big5 personality traits explanatory variables</h4>

In [None]:
#Checking the variance of each principal component in pca7_survey_big5_df.
#DataFrame.var(ddof = 0) is the population variance that numpy's var
#computes; the former pd.np alias is not used because it was deprecated in
#pandas 1.0 and removed in pandas 2.0.
pca7_survey_big5_df.var(ddof = 0)

<strong>Rescaling our data, as the variance among features is no longer the same in the new dataset.</strong>

In [None]:
#Instantiating a Standard Scaler object
scaler = StandardScaler()

#Fitting the scaler object to the principal component scores
scaler.fit(pca7_survey_big5_df)

#Transforming the fit data
pca7_survey_big5_scaled = scaler.transform(pca7_survey_big5_df)

#Converting it into a dataframe
pca7_survey_big5_scaled_df = pd.DataFrame(pca7_survey_big5_scaled)

#Renaming the columns with the persona names of the factor loadings
pca7_survey_big5_scaled_df.columns = factor_loadings_big5_df.columns

#Checking the variance after scaling.
#DataFrame.var(ddof = 0) is the population variance that numpy's var
#computes; the former pd.np alias is not used because it was deprecated in
#pandas 1.0 and removed in pandas 2.0.
pca7_survey_big5_scaled_df.var(ddof = 0)

<br><br><strong>Creating a dendrogram to determine the number of clusters for KMeans modeling</strong>

In [None]:
#Building the hierarchical merge history of the scaled Big 5 principal
#component scores using Ward linkage
standard_mergings_ward = linkage(y = pca7_survey_big5_scaled_df, method = 'ward')


#Creating a 12 x 12 figure to draw on
fig, ax = plt.subplots(figsize = (12, 12))

#Drawing the dendrogram from the merge history
dendrogram(Z = standard_mergings_ward, leaf_rotation = 90, leaf_font_size = 6)

#Displaying the plot
plt.show()

<strong>Plotting an inertia plot to see viable candidates for determining the number of clusters to be used in KMeans modeling</strong>

In [None]:
#Drawing the inertia plot for the scaled Big 5 principal component scores
inertia_plot(pca7_survey_big5_scaled_df)

<strong>Based on the inertia plot, let's use 3 clusters for building the KMeans model</strong>

In [None]:
#INSTANTIATING a k-Means object with 3 clusters.
#n_init is pinned to 10 (the long-standing default) because scikit-learn 1.4
#changed the default to 'auto'; leaving it unset would make the cluster
#assignments depend on the installed scikit-learn version.
survey_answers_big5_kmeans_pca = KMeans(n_clusters   = 3,
                                        n_init       = 10,
                                        random_state = 222)


#Fitting the object to the scaled principal component scores
survey_answers_big5_kmeans_pca.fit(pca7_survey_big5_scaled_df)


#Converting the cluster label of each row to a one-column DataFrame
survey_answers_big5_kmeans_pca_df = pd.DataFrame({'Cluster': survey_answers_big5_kmeans_pca.labels_})


#Checking the results: number of rows assigned to each cluster
print(survey_answers_big5_kmeans_pca_df.iloc[: , 0].value_counts())

<strong>Finding the centroids of each cluster, in an attempt to explain the underlying story of ideal members of each cluster.</strong>

In [None]:
#Keeping a reference to the cluster centers of the fitted k-Means model
centroids_survey_answers_big5_kmeans_pca = survey_answers_big5_kmeans_pca.cluster_centers_

#Converting the cluster centers into a dataframe labelled with the same
#columns as the data the model was fitted on
centroids_survey_answers_big5_kmeans_pca_df = pd.DataFrame(
    data    = centroids_survey_answers_big5_kmeans_pca,
    columns = pca7_survey_big5_scaled_df.columns)

#Checking the results
centroids_survey_answers_big5_kmeans_pca_df

In [None]:
#Placing the cluster labels side by side with the (unscaled) principal
#component scores; rows are matched on the index of both frames
cluster_pca_big5_df = pd.concat(objs = [survey_answers_big5_kmeans_pca_df,
                                        pca7_survey_big5_df],
                                axis = 'columns')

#Checking the results
cluster_pca_big5_df

In [None]:
#Columns of original_df to bring back next to the clusters and components
respondent_info_columns = ['preferred_laptop', 'current_laptop', 'surveyID',
                           'program', 'age', 'gender', 'nationality',
                           'ethnicity']

#Concatenating the earlier eliminated columns with the cluster/PCA frame.
#NOTE(review): pd.concat matches rows on the index, so this presumably
#relies on original_df having the same 0..n-1 index - confirm
final_cluster_pca_big5_df = pd.concat(objs = [original_df.loc[:, respondent_info_columns],
                                              cluster_pca_big5_df],
                                      axis = 'columns')

#Checking the results
final_cluster_pca_big5_df

In [None]:
#Gaining greater insights: keeping only the rows whose
#strong_verbal_aptitude score is above 1.0, with the columns of interest
has_high_score = final_cluster_pca_big5_df['strong_verbal_aptitude'] > 1.0

strong_verbal_aptitude = final_cluster_pca_big5_df.loc[has_high_score,
                                                       ['strong_verbal_aptitude', 'Cluster', 'current_laptop', 'preferred_laptop']]

print(strong_verbal_aptitude)

#Counting the current and preferred laptops within that subset
for laptop_column in ['current_laptop', 'preferred_laptop']:
    print("\n\n")
    print(strong_verbal_aptitude[laptop_column].value_counts())

<strong>INSIGHT:</strong><br>People who score highly on the principal component associated with strong verbal aptitude are most likely to own a Macbook, and also say they would prefer a Macbook over a PC.<br>This is also backed up by research conducted by <a href="https://www.hongkiat.com/blog/mac-vs-pc-myth-busting-consumer-guide/">Nina Krimly</a> and research by <a href="https://www.huffpost.com/entry/mac-vs-pc-what-your-os-says-about-you_n_852170">Thomas Houston</a>

In [None]:
#Gaining greater insights: keeping only the rows whose non_party_person
#score is above 1.0, with the columns of interest
has_high_score = final_cluster_pca_big5_df['non_party_person'] > 1.0

non_party_person = final_cluster_pca_big5_df.loc[has_high_score,
                                                 ['non_party_person', 'Cluster', 'current_laptop', 'preferred_laptop']]

print(non_party_person)

#Counting the current and preferred laptops within that subset
for laptop_column in ['current_laptop', 'preferred_laptop']:
    print("\n\n")
    print(non_party_person[laptop_column].value_counts())

<strong>INSIGHT 3:</strong><br>People who don't like partying or even throwing a party, are likely to currently own a Windows laptop and probably would prefer a Windows Laptop. However the count of people not interested in parties and who prefer a Windows or a Mac is almost the same, if price for both is equal.<br>This is backed up by external research from <a href="https://www.hongkiat.com/blog/mac-vs-pc-myth-busting-consumer-guide/">Nina Krimly</a> and research by <a href="https://www.huffpost.com/entry/mac-vs-pc-what-your-os-says-about-you_n_852170">Thomas Houston</a>

<h4>Clustering using the chosen 4 Principal components of the survey answers hult dna explanatory variables</h4>

In [None]:
#Checking the variance of each principal component in pca4_survey_hultdna_df.
#DataFrame.var(ddof = 0) is the population variance that numpy's var
#computes; the former pd.np alias is not used because it was deprecated in
#pandas 1.0 and removed in pandas 2.0.
pca4_survey_hultdna_df.var(ddof = 0)

<strong>Rescaling our data, as the variance among features is no longer the same in the new dataset.</strong>

In [None]:
#Instantiating a Standard Scaler object
scaler = StandardScaler()

#Fitting the scaler object to the principal component scores
scaler.fit(pca4_survey_hultdna_df)

#Transforming the fit data
pca4_survey_hultdna_scaled = scaler.transform(pca4_survey_hultdna_df)

#Converting it into a dataframe
pca4_survey_hultdna_scaled_df = pd.DataFrame(pca4_survey_hultdna_scaled)

#Renaming the columns with the persona names of the factor loadings
pca4_survey_hultdna_scaled_df.columns = factor_loadings_hultdna_df.columns

#Checking the variance after scaling.
#DataFrame.var(ddof = 0) is the population variance that numpy's var
#computes; the former pd.np alias is not used because it was deprecated in
#pandas 1.0 and removed in pandas 2.0.
pca4_survey_hultdna_scaled_df.var(ddof = 0)

<br><br><strong>Creating a dendrogram to determine the number of clusters for KMeans modeling</strong>

In [None]:
#Building the hierarchical merge history of the scaled Hult DNA principal
#component scores using Ward linkage
standard_mergings_ward = linkage(y = pca4_survey_hultdna_scaled_df, method = 'ward')


#Creating a 12 x 12 figure to draw on
fig, ax = plt.subplots(figsize = (12, 12))

#Drawing the dendrogram from the merge history
dendrogram(Z = standard_mergings_ward, leaf_rotation = 90, leaf_font_size = 6)

#Displaying the plot
plt.show()

<strong>Plotting an inertia plot to see viable candidates for determining the number of clusters to be used in KMeans modeling</strong>

In [None]:
#Drawing the inertia plot for the scaled Hult DNA principal component scores
inertia_plot(pca4_survey_hultdna_scaled_df)

<strong>Based on the inertia plot, let's use 4 clusters for building the KMeans model</strong>

In [None]:
#INSTANTIATING a k-Means object with four clusters.
#n_init is pinned to 10 (the long-standing default) because scikit-learn 1.4
#changed the default to 'auto'; leaving it unset would make the cluster
#assignments depend on the installed scikit-learn version.
survey_answers_hultdna_kmeans_pca = KMeans(n_clusters   = 4,
                                           n_init       = 10,
                                           random_state = 222)


#Fitting the object to the scaled principal component scores
survey_answers_hultdna_kmeans_pca.fit(pca4_survey_hultdna_scaled_df)


#Converting the cluster label of each row to a one-column DataFrame
survey_answers_hultdna_kmeans_pca_df = pd.DataFrame({'Cluster': survey_answers_hultdna_kmeans_pca.labels_})


#Checking the results: number of rows assigned to each cluster
print(survey_answers_hultdna_kmeans_pca_df.iloc[: , 0].value_counts())

<strong>Finding the centroids of each cluster, in an attempt to explain the underlying story of ideal members of each cluster.</strong>

In [None]:
#Keeping a reference to the cluster centers of the fitted k-Means model
centroids_survey_answers_hultdna_kmeans_pca = survey_answers_hultdna_kmeans_pca.cluster_centers_

#Converting the cluster centers into a dataframe labelled with the same
#columns as the data the model was fitted on
centroids_survey_answers_hultdna_kmeans_pca_df = pd.DataFrame(
    data    = centroids_survey_answers_hultdna_kmeans_pca,
    columns = pca4_survey_hultdna_scaled_df.columns)

#Checking the results
centroids_survey_answers_hultdna_kmeans_pca_df

In [None]:
#Placing the cluster labels side by side with the (unscaled) principal
#component scores; rows are matched on the index of both frames
cluster_pca_hultdna_df = pd.concat(objs = [survey_answers_hultdna_kmeans_pca_df,
                                           pca4_survey_hultdna_df],
                                   axis = 'columns')

#Checking the results
cluster_pca_hultdna_df

In [None]:
#Columns of original_df to bring back next to the clusters and components
respondent_info_columns = ['preferred_laptop', 'current_laptop', 'surveyID',
                           'program', 'age', 'gender', 'nationality',
                           'ethnicity']

#Concatenating the earlier eliminated columns with the cluster/PCA frame.
#NOTE(review): pd.concat matches rows on the index, so this presumably
#relies on original_df having the same 0..n-1 index - confirm
final_cluster_pca_hultdna_df = pd.concat(objs = [original_df.loc[:, respondent_info_columns],
                                                 cluster_pca_hultdna_df],
                                         axis = 'columns')

#Checking the results
final_cluster_pca_hultdna_df

In [None]:
#Gaining greater insights: keeping only the rows whose growth_mindset
#score is above 1.0, with the columns of interest
has_high_score = final_cluster_pca_hultdna_df['growth_mindset'] > 1.0

growth_mindset = final_cluster_pca_hultdna_df.loc[has_high_score,
                                                  ['growth_mindset', 'Cluster', 'current_laptop', 'preferred_laptop']]

print(growth_mindset)

#Counting clusters, current laptops and preferred laptops within that subset
for counted_column in ['Cluster', 'current_laptop', 'preferred_laptop']:
    print("\n\n")
    print(growth_mindset[counted_column].value_counts())

<strong>INSIGHT 2:</strong><br> People who score highly on the Growth Mindset persona (principal component score greater than 1.0), most of whom are in Cluster 1, currently own a Windows laptop. But given a choice where prices are equal for both Macbook and Windows, people in the Growth Mindset persona would marginally prefer a Macbook over a Windows laptop. As such, Microsoft should target students who carry a growth mindset, as they are most likely to currently own a Windows laptop and would preferably stick with it in the future.<br><br><br>

<br><br><br>
***
***
***
<h3>Boxplots</h3>

***
***
***

<h4>Creating boxplots of the Principal components for the survey answers regarding Hult DNA, with regard to the current laptop that the respondents possess</h4>

In [None]:
#not_so_bright_kid
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'not_so_bright',
#            hue  = 'Cluster',
#            data = final_cluster_pca_hultdna_df)


#plt.tight_layout()
#plt.show()

In [None]:
#growth_mindset
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'growth_mindset',
#            hue  = 'Cluster',
#            data = final_cluster_pca_hultdna_df)


#plt.tight_layout()
#plt.show()

In [None]:
#unconvincing
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'unconvincing',
#            hue  = 'Cluster',
#            data = final_cluster_pca_hultdna_df)


#plt.tight_layout()
#plt.show()

In [None]:
#inefficient
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'inefficient',
#            hue  = 'Cluster',
#            data = final_cluster_pca_hultdna_df)


#plt.tight_layout()
#plt.show()

<h4>Creating boxplots of the Principal components for the survey answers regarding Hult DNA, separated by future laptop choice</h4>

In [None]:
#not_so_bright
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'not_so_bright',
#            hue  = 'Cluster',
#            data = final_cluster_pca_hultdna_df)


#plt.tight_layout()
#plt.show()

In [None]:
#growth_mindset
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'growth_mindset',
#            hue  = 'Cluster',
#            data = final_cluster_pca_hultdna_df)


#plt.tight_layout()
#plt.show()

<strong>INSIGHT:</strong><br>For the "growth_mindset" principal component, there is a big difference in the median between the respondents who want to buy a Chromebook and the ones who want to buy a Macbook/Windows laptop, when asked this question. This implies that people belonging to this cluster as well as this particular principal component have a desire to buy a Chromebook more than any other laptop.<br><br><br>

In [None]:
#unconvincing
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'unconvincing',
#            hue  = 'Cluster',
#            data = final_cluster_pca_hultdna_df)


#plt.tight_layout()
#plt.show()

<strong>INSIGHT:</strong><br>For the "unconvincing" principal component, for clusters 1 and 2, there tend to be fewer Chromebook buyers than Macbook/Windows buyers, as seen by the lower median line.<br><br><br>

In [None]:
#inefficient
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'inefficient',
#            hue  = 'Cluster',
#            data = final_cluster_pca_hultdna_df)


#plt.tight_layout()
#plt.show()

<strong>INSIGHT:</strong><br>For the "inefficient" principal component, people belonging to cluster 1 tend to show a greater desire to buy a Chromebook than a Macbook/Windows laptop, as seen by the higher median.<br><br><br>

<h4>Creating boxplots of the Principal components for the big5 personality traits survey answers, separated by current laptop.</h4>

In [None]:
#prefer_isolation
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'prefer_isolation',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#moody
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'moody',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#non_party_person
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'non_party_person',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#strong_verbal_aptitude
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'strong_verbal_aptitude',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#relaxed
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'relaxed',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#systematic
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'systematic',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#creative
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'current_laptop',
#            y    = 'creative',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

<h4>Creating boxplots of the Principal components for the big5 personality traits survey answers, separated by future laptop choice</h4>

In [None]:
#prefer_isolation
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'prefer_isolation',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#moody
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'moody',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

<strong>INSIGHT:</strong><br>People belonging to the "moody" principal component and cluster 0 show a big variance; however, they prefer to buy a Chromebook.<br><br><br>

In [None]:
#non_party_person
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'non_party_person',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

<strong>INSIGHT:</strong><br>People belonging to the "non_party_person" component and to clusters 0 and 1 do not prefer to buy the Chromebook and would like to buy the Windows laptop or the Macbook.<br><br><br>

In [None]:
#strong_verbal_aptitude
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'strong_verbal_aptitude',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#relaxed
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'relaxed',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#systematic
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'systematic',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()

In [None]:
#creative
#fig, ax = plt.subplots(figsize = (12, 8))
#sns.boxplot(x    = 'preferred_laptop',
#            y    = 'creative',
#            hue  = 'Cluster',
#            data = final_cluster_pca_big5_df)


#plt.tight_layout()
#plt.show()