# Data Cleaning for the Personality - Hassles model

3 Datasets:
1. 'Base' - Base dataset for personality and hassles
2. 'Categorical' - Separated on the category of hassles (each file contains 1 category of hassle)
3. 'Individual' - Separated on each hassle (each file contains 1 hassle)

## Section 1: Import Librares and Reading the Dataset

In [136]:
import numpy as np
import pandas as pd
from pathlib import Path

In [137]:
personality_df = pd.read_csv('data/source/personality.csv')
hassles_df = pd.read_csv('data/source/hassles.csv')

In [138]:
display(personality_df.head())
display(hassles_df.head())

Unnamed: 0,User,Age,Gender,O,C,E,A,N
0,ruzel_khyvin_te@dlsu.edu.ph,21,Male,31,27,17,33,24
1,stacy_selena_kalaw@dlsu.edu.ph,21,Female,40,27,20,18,27
2,kailey_sy@dlsu.edu.ph,19,Female,30,21,19,32,24
3,audrey_salamat@dlsu.edu.ph,20,Female,34,25,13,26,26
4,jhersey_elyza_cheng@dlsu.edu.ph,19,Female,39,17,22,25,36


Unnamed: 0,User,Misplacing or losing things,Silly practical mistakes,Trouble with pets,Difficulties with friends,Regrets over past decision/s,Concerned about the meaning of life,Being lonely,Inability to express oneself,Fear of rejection,...,Side effects of medication,Concerns about health in general,Concerns about bodily functions,Dissatisfaction with academic performance,Challenges with instructors,Discontent with current academic responsibilities,Concerns regarding academic transitions,Difficulties with peers or classmates,Challenges in managing group projects,Getting late to class
0,ruzel_khyvin_te@dlsu.edu.ph,2,1,3,4,3,4,2,3,4,...,1,3,3,3,2,4,2,3,3,1
1,stacy_selena_kalaw@dlsu.edu.ph,3,2,1,2,2,1,2,1,2,...,1,2,1,1,1,1,1,1,1,1
2,kailey_sy@dlsu.edu.ph,3,3,3,4,4,2,2,2,5,...,4,4,4,4,4,4,3,3,3,3
3,audrey_salamat@dlsu.edu.ph,2,1,1,4,3,4,2,3,4,...,2,2,2,2,3,3,3,2,3,3
4,hannah_regine_fong@dlsu.edu.ph,4,3,4,5,5,5,5,4,5,...,4,4,4,5,5,5,3,5,5,4


## Section 2: Data Cleaning

In [139]:
hassles_df = pd.melt(hassles_df, id_vars = ['User'], var_name = 'Hassle', value_name = 'Severity')

In [140]:
hassles_df.head()

Unnamed: 0,User,Hassle,Severity
0,ruzel_khyvin_te@dlsu.edu.ph,Misplacing or losing things,2
1,stacy_selena_kalaw@dlsu.edu.ph,Misplacing or losing things,3
2,kailey_sy@dlsu.edu.ph,Misplacing or losing things,3
3,audrey_salamat@dlsu.edu.ph,Misplacing or losing things,2
4,hannah_regine_fong@dlsu.edu.ph,Misplacing or losing things,4


In [141]:
df = pd.merge(personality_df, hassles_df, on = 'User', how = 'inner')

In [142]:
df.drop('User', axis = 1, inplace =  True)

In [143]:
df.head()

Unnamed: 0,Age,Gender,O,C,E,A,N,Hassle,Severity
0,21,Male,31,27,17,33,24,Misplacing or losing things,2
1,21,Male,31,27,17,33,24,Silly practical mistakes,1
2,21,Male,31,27,17,33,24,Trouble with pets,3
3,21,Male,31,27,17,33,24,Difficulties with friends,4
4,21,Male,31,27,17,33,24,Regrets over past decision/s,3


### 2.1: Base Dataset

In [144]:
df.to_csv('data/base/personality_hassles.csv')

### 2.2: Categorical Dataset

In [145]:
directory = 'data/category'

In [146]:
categories = {
    'General_hassles': ['Misplacing or losing things', 'Silly practical mistakes', 'Trouble with pets', 'Difficulties with friends'],
    'Inner_concerns': ['Regrets over past decision/s', 'Concerned about the meaning of life', 'Being lonely', 'Inability to express oneself', 'Fear of rejection', 'Trouble making decisions', 'Physical appearance', 'Not seeing people', "Troubling thoughts about ones future", 'Not enough personal energy', 'Concerns about getting ahead', 'Fear of confrontation', 'Wasting time'],
    'Financial_concerns': ['Not enough money for basic necessities (food, clothing, transportation, housing, healthcare etc.)', 'Not enough money for wants (entertainment and recreation)', 'Concerns about owing money', 'Concerns about money for emergencies', 'Financial security'],
    'Time_Pressures': ['Not enough time to do things one needs to', 'Too many responsibilities', 'Not getting enough rest', 'Too many interruptions', 'Not enough time for entertainment and recreation', 'Too many meetings', 'Social obligations', 'Concerns about meeting high standards', 'Noise'],
    'Environmental_Hassles': ['Pollution', 'Crime', 'Traffic', 'Concerns about news events', 'Rising prices of common goods', 'Concerns about accidents'],
    'Family_Hassles': ['Yardwork or outside home maintenance', 'Overloaded with family responsibilities', 'Home maintenance (inside)'],
    'Health_Hassles': ['Concerns about medical treatment', 'Physical illness', 'Side effects of medication', 'Concerns about health in general', 'Concerns about bodily functions'],
    'Academic_Hassles': ['Dissatisfaction with academic performance', 'Challenges with instructors', 'Discontent with current academic responsibilities', 'Concerns regarding academic transitions', 'Difficulties with peers or classmates', 'Challenges in managing group projects', 'Getting late to class']
}


In [147]:
for category, hassles in categories.items():
    filtered_df = df[df['Hassle'].isin(hassles)]
    filename = f'{category}.csv'
    filepath = Path(directory) / filename
    filtered_df.to_csv(filepath, index=False)

### 2.3 Individual Dataset

In [148]:
directory = 'data/individual'

In [149]:
def split_dataset_by_hassle(data):
    grouped = data.groupby('Hassle')
    i = 1;
    
    for hassle, group in grouped:
        filename = f'{i}.csv'
        group.drop('Hassle', axis = 1, inplace = True)
        filepath = Path(directory) / filename
        group.to_csv(filepath, index = False)
        i+= 1

In [150]:
split_dataset_by_hassle(df)

In [151]:
# from sklearn.preprocessing import LabelEncoder
# df['Gender'] = LabelEncoder().fit_transform(df['Gender'])
# df['Hassle'] = LabelEncoder().fit_transform(df['Hassle'])
# from sklearn.preprocessing import MinMaxScaler
# df['O'] = MinMaxScaler(feature_range = (10, 50)).fit_transform(df[['O']])
# df['C'] = MinMaxScaler(feature_range = (9, 45)).fit_transform(df[['C']])
# df['E'] = MinMaxScaler(feature_range = (8, 40)).fit_transform(df[['E']])
# df['A'] = MinMaxScaler(feature_range = (9, 45)).fit_transform(df[['A']])
# df['N'] = MinMaxScaler(feature_range = (8, 40)).fit_transform(df[['N']])