# PersonalityGroup

## Importing and setting up the Dataset

In [None]:
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.graph_objs as go
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.cluster import *
from sklearn.cluster import DBSCAN

In [None]:
# Location of the tab-separated survey export; adjust to the local copy.
DATA_PATH = 'C:/Users/aleja/Downloads/516764_951745_bundle_archive/IPIP-FFM-data-8Nov2018/data-final.csv'
# Number of rows to work with; a subset keeps the later steps tractable.
SAMPLE_SIZE = 80000
# Fixed seed so the sampled rows, and everything derived from them, are reproducible.
RANDOM_STATE = 42

traits_survey_complete = pd.read_csv(DATA_PATH, sep='\t')
# Cap the sample at the available row count: sample() raises when n exceeds it.
traits_survey = traits_survey_complete.sample(
    n=min(SAMPLE_SIZE, len(traits_survey_complete)),
    random_state=RANDOM_STATE,
)
traits_survey.head()

In [None]:
# Renumber the sampled rows 0..n-1. drop=True discards the old labels directly
# instead of materialising them as an 'index' column and deleting it afterwards.
traits_survey = traits_survey.reset_index(drop=True)
print(traits_survey.shape)
traits_survey.head()

## Exploratory Data Analysis

In [None]:
# Show the dtype pandas inferred for every column.
traits_survey.dtypes

In [None]:
# Zero-based position of the 'OPN10' column. Later cells slice up to this
# position inclusive (hence their "+ 1"), so it is a position, not a count.
number_traits_questions = traits_survey.columns.get_loc('OPN10')
number_traits_questions

In [None]:
# Column labels from the first column through 'OPN10' (inclusive).
pers_traits_questions = traits_survey.columns[: number_traits_questions + 1]
pers_traits_questions

In [None]:
# Print how often each distinct answer occurs, one question column at a time.
for col in pers_traits_questions:
    print(traits_survey[col].value_counts())

The first 50 columns of traits_survey correspond to the questions asked for the Big Five personality traits, so their values (responses) are 0., 1., 2., 3., 4. or 5., since the answers were given on that scale.

In [None]:
# Keep only the question columns and summarise them.
traits_questions_data = traits_survey[pers_traits_questions]
traits_questions_data.describe()

The following 50 columns record how long each person took to answer each question.

In [None]:
# The timing columns start right after 'OPN10' and run through 'OPN10_E' (inclusive).
first_time_col = number_traits_questions + 1
last_time_col = traits_survey.columns.get_loc('OPN10_E')
time_col_labels = traits_survey.columns[first_time_col: last_time_col + 1]
times_per_questions = traits_survey[time_col_labels]
times_per_questions

In [None]:
# Summary statistics of the per-question timing columns.
times_per_questions.describe()

In [None]:
# One distribution plot per trait, using the first question of each trait.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# migrating to histplot/displot once the installed version is confirmed.
first_questions = ['EXT1', 'EST1', 'AGR1', 'CSN1', 'OPN1']
fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(17, 6))
for axis, question in zip(ax, first_questions):
    sns.distplot(traits_survey[question], ax=axis)
plt.show()

In [None]:
# Frequency of each distinct value in the 'EXT1_E' timing column.
times_per_questions['EXT1_E'].value_counts()

In [None]:
# Positions of the integer-typed columns. Comparing dtypes against the builtin
# `int` only matches the platform's default integer width, so integer columns
# stored with another width were silently missed; is_integer_dtype matches
# integers of any width.
int_data = np.flatnonzero(
    [pd.api.types.is_integer_dtype(dtype) for dtype in traits_survey.dtypes]
)
traits_survey.iloc[:, int_data]

In [None]:
# Cast these two columns to float; later cells pick columns by float dtype.
for column in ('endelapse', 'IPC'):
    traits_survey[column] = traits_survey[column].astype('float')

In [None]:
# Earlier attempt kept as an inert string literal; none of it is executed.
'''numeric_data = np.array(np.where(traits_survey.dtypes == float)).flatten()
numeric_data'''

In [None]:
# Names of all float-typed columns. A single pass over the dtypes replaces the
# previous loop, which rebuilt a DataFrame of the dtypes twice per column.
f_cols = [
    column
    for column, dtype in traits_survey.dtypes.items()
    if dtype == 'float'
]
f_cols

In [None]:
# Float columns only, followed by their pairwise correlations.
# NOTE: numeric_data is selected at this point in the notebook; later in-place
# drops on traits_survey (columns or rows) are not applied to it.
numeric_data = traits_survey[f_cols]
numeric_data.corr()

In [None]:
# Annotated heat map of the pairwise correlations on a 30x30-inch figure.
plt.figure(figsize=(30.0, 30.0))
sns.heatmap(numeric_data.corr(), annot=True)

In [None]:
# Columns located after 'OPN10_E', i.e. everything past the timing columns.
traits_survey.columns[traits_survey.columns.get_loc('OPN10_E') + 1:]

In [None]:
# Frequency of each distinct 'dateload' value.
traits_survey['dateload'].value_counts()

In [None]:
# The dateload column will be dropped because the timestamp is not a meaningful feature for the outcome

In [None]:
# Remove the 'dateload' column from traits_survey in place.
traits_survey.drop(columns=['dateload'], inplace=True)

In [None]:
# Print the number of distinct screen widths, then box-plot the column.
# whis=3.5 is passed to seaborn to control how far the whiskers extend.
print(len(traits_survey['screenw'].value_counts()))
sns.boxplot(traits_survey['screenw'], whis=3.5)

In [None]:
# screenw and screenh are dropped as well: they hold the width and height (in
# pixels) of the screen on which the survey was taken.

traits_survey.drop(columns=['screenw', 'screenh'], inplace=True)

In [None]:
# Earlier attempt kept as an inert string literal; none of it is executed.
'''ctgr_cols_ind = np.where(traits_survey.dtypes != float)
traits_survey.columns[ctgr_cols_ind[0][0] : ctgr_cols_ind[0][0] + 3]'''

In [None]:
# Labels of every column whose dtype is not float; these are treated as the
# categorical columns further down.
non_float_positions = np.flatnonzero(traits_survey.dtypes != float)
ctgr_cols = traits_survey.columns[non_float_positions]
# Kept as a one-element list so the name stays available, as before.
cols_list = [ctgr_cols]
ctgr_cols

In [None]:
country_counts = traits_survey.country.value_counts()
print(country_counts)
# NOTE(review): assumes 'country' holds category labels rather than numbers.
# A box plot summarises a numeric distribution and does not apply to labels,
# so plot respondent counts for the 20 most frequent countries instead.
top_countries = country_counts.index[:20]
sns.countplot(y=traits_survey['country'], order=top_countries)

In [None]:
# Number of missing values per column. The previous expression selected the
# 'index' column after reset_index(), which displayed only the column names
# and discarded the counts it had just computed.
traits_survey.isnull().sum()

In [None]:
# Drop, in place, every row that has at least one missing value, then show
# the remaining shape. Row labels are not renumbered afterwards.
traits_survey.dropna(inplace=True)
traits_survey.shape

## Data Preprocessing

In [None]:
# Print the labels of the columns whose dtype is object.
object_positions = np.where(traits_survey.dtypes == object)[0]
print(traits_survey.columns[object_positions])

In [None]:
# Non-float columns of the cleaned survey (selected after the dropna above).
ctgr_data = traits_survey[ctgr_cols]
ctgr_data

In [None]:
# Dummy (indicator) encoding of the categorical columns via pd.get_dummies.
dummy_ctgr_data = pd.get_dummies(ctgr_data)
dummy_ctgr_data

In [None]:
# Standardise the float columns for exactly the rows and columns still present
# in traits_survey. numeric_data was captured before columns were dropped and
# before dropna(), so scaling it directly kept the removed columns and rows
# (including their NaNs) and replaced the column names with integers.
numeric_cols = numeric_data.columns.intersection(traits_survey.columns)
numeric_clean = traits_survey[numeric_cols]
stand_data = pd.DataFrame(
    preprocessing.scale(numeric_clean),
    index=numeric_clean.index,
    columns=numeric_clean.columns,
)
stand_data

In [None]:
# Feature matrix: scaled numeric columns plus the dummy-encoded categoricals.
# The raw categorical frame (ctgr_data) can hold non-numeric values, which
# K-Means cannot consume, so dummy_ctgr_data is used instead. join='inner'
# keeps only rows present in both frames, so rows removed by dropna() do not
# come back as NaN-filled rows.
X = pd.concat([stand_data, dummy_ctgr_data], axis=1, join='inner')
# scikit-learn expects feature names to be of a single type; make them all strings.
X.columns = X.columns.astype(str)
X.head()

## K-Means Clustering

In [None]:
# Two-cluster K-Means with randomly chosen initial centroids.
model = KMeans(n_clusters=2, init='random')
# `clusters` holds whatever fit() returns and is used for prediction below.
clusters = model.fit(X)

In [None]:
# Peek at the first rows of the feature matrix.
X.head()

In [None]:
# Cluster label predicted for every row of X.
cluster_numbers = clusters.predict(X)
cluster_numbers

In [None]:
# Attach the labels to the survey rows and show the size of each cluster.
# Requires cluster_numbers to have one entry per row of traits_survey.
traits_survey['Cluster'] = cluster_numbers
traits_survey['Cluster'].value_counts()

In [None]:
# First rows assigned to cluster 1.
traits_survey[traits_survey['Cluster'] == 1].head()

In [None]:
def clust_scatterplot(df, x, y, hue):
    """Draw a 10x8-inch scatter plot of column ``y`` against column ``x`` of ``df``.

    ``hue`` is forwarded to seaborn as both the ``hue`` and the ``style``
    grouping of the points.
    """
    plt.figure(figsize=(10, 8))
    plot_kwargs = {'data': df, 'x': x, 'y': y, 'hue': hue, 'style': hue}
    sns.scatterplot(**plot_kwargs)

In [None]:
# Each set of ten questions is averaged to obtain one score
# representing each 'Big Five Personality Trait'


def recurs(trait, n=10, df=None):
    """Return the element-wise sum of columns ``trait + '1'`` .. ``trait + str(n)``.

    Args:
        trait: Column-name prefix, e.g. ``'EXT'``.
        n: Number of the last question column to include; must be at least 1.
        df: DataFrame holding the columns. Defaults to the module-level
            ``traits_survey`` so existing ``recurs(trait)`` calls keep working.

    Returns:
        The row-wise sum of the selected columns.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # The recursive version ran past its base case here and went on to
        # look up columns such as trait + '0'.
        raise ValueError("n must be at least 1")
    if df is None:
        df = traits_survey
    total = df[trait + '1']
    # Add the remaining columns in ascending order, which matches the order in
    # which the recursive version accumulated them.
    for question in range(2, n + 1):
        total = df[trait + str(question)] + total
    return total


# Store, for each trait prefix, the sum returned by recurs() divided by 10
# in a new '<trait>_m' column.
for trait in ['EXT', 'EST', 'AGR', 'CSN', 'OPN']:
    traits_survey[trait + '_m'] = recurs(trait)/10

In [None]:
# Scatter EXT_m against OPN_m, grouped by the predicted cluster labels.
clust_scatterplot(traits_survey, 'EXT_m', 'OPN_m', cluster_numbers)

### Determining optimal number of clusters

In [None]:
# Scaled numeric columns plus the dummy-encoded categoricals. The raw
# categorical frame (ctgr_data) can hold non-numeric values that K-Means
# cannot consume, so dummy_ctgr_data is used. join='inner' keeps only rows
# present in both frames.
X1 = pd.concat([stand_data, dummy_ctgr_data], axis=1, join='inner')
# scikit-learn expects feature names to be of a single type; make them all strings.
X1.columns = X1.columns.astype(str)
clusters1 = KMeans(n_clusters=2, n_init=10, init="random").fit(X1)

In [None]:
# Inertia reported by the fitted two-cluster model.
clusters1.inertia_

In [None]:
# The feature matrix does not depend on the cluster count, so build it once
# rather than on every iteration. It uses the dummy-encoded categoricals
# because the raw categorical frame can hold non-numeric values; join='inner'
# keeps only rows present in both frames.
Xn = pd.concat([stand_data, dummy_ctgr_data], axis=1, join='inner')
# scikit-learn expects feature names to be of a single type; make them all strings.
Xn.columns = Xn.columns.astype(str)

# Fit K-Means for 1..9 clusters and record the inertia of each fit.
results = []
for i in range(1, 10):
    clustersn = KMeans(n_clusters=i, n_init=10, init="random").fit(Xn)
    results.append({"n": i, "inertia": clustersn.inertia_})
elbow_plot = pd.DataFrame(results)

In [None]:
# Inertia per cluster count, as a table.
elbow_plot

In [None]:
# The same records as a raw list of dicts.
results

In [None]:
# Elbow curve: inertia (y) against the number of clusters (x).
plt.figure(figsize=(12,8))
sns.lineplot(data=elbow_plot, x="n", y="inertia")
plt.title("Optimal Number of Clusters")

In [None]:
def plot_elbow(df, dims):
    """Plot K-Means inertia against the number of clusters (1 through 9).

    Args:
        df: DataFrame containing the feature columns.
        dims: List of column labels in ``df`` to cluster on.
    """
    # The features do not change with the cluster count; select them once.
    features = df[dims]
    results = []
    for n_clusters in range(1, 10):
        fitted = KMeans(n_clusters=n_clusters, n_init=10, init="random").fit(features)
        # Record the cluster count that was actually fitted. It was previously
        # stored as i + 1, which shifted the whole curve one step to the right.
        results.append({"n": n_clusters, "inertia": fitted.inertia_})
    elbow_plot = pd.DataFrame(results)

    plt.figure(figsize=(12, 8))
    sns.lineplot(data=elbow_plot, x="n", y="inertia")
    plt.title("Optimal Number of Clusters")

In [None]:
# `df` is not defined anywhere in this notebook; the '<trait>_m' columns are
# created on traits_survey, so that frame is passed instead.
plot_elbow(traits_survey, ['EXT_m', 'EST_m', 'CSN_m', 'AGR_m', 'OPN_m'])

In [None]:
def cluster_2d(df, x, y, n_clusters):
    """Cluster ``df`` on the two columns ``x`` and ``y`` and plot the result.

    Fits K-Means with ``n_clusters`` clusters on ``df[[x, y]]`` and draws a
    scatter plot of the two columns grouped by the predicted cluster label.
    """
    features = df[[x, y]]
    estimator = KMeans(init="random", n_init=10, n_clusters=n_clusters)
    estimator.fit(features)
    labels = estimator.predict(features)
    clust_scatterplot(df, x, y, labels)

In [None]:
# Four-cluster K-Means on the EXT_m / OPN_m plane, plotted by cluster.
cluster_2d(traits_survey, 'EXT_m', 'OPN_m', 4)