# Personality - Categorical Hassles notebook


## Section 1: Reading the dataset and data preparation

In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from scaler import CustomScaler


In [13]:
filenames = ['General_hassles', 'Inner_concerns', 'Financial_concerns', 'Time_Pressures']
extension = '.csv'
dfs = {}
test_dfs = {}
for filename in filenames:
    df = pd.read_csv(f'../data/category/{filename}.csv')
    df = df.sample(frac = 1, random_state = 42)

    df['Hassle'] = LabelEncoder().fit_transform(df['Hassle'])

    # scaler_O = MinMaxScaler(feature_range=(10, 50))
    # scaler_C = MinMaxScaler(feature_range=(9, 45))
    # scaler_E = MinMaxScaler(feature_range=(8, 40))
    # scaler_A = MinMaxScaler(feature_range=(9, 45))
    # scaler_N = MinMaxScaler(feature_range=(8, 40))

    # # Fit and transform specific columns
    # df['O'] = scaler_O.fit_transform(df[['O']])
    # df['C'] = scaler_C.fit_transform(df[['C']])
    # df['E'] = scaler_E.fit_transform(df[['E']])
    # df['A'] = scaler_A.fit_transform(df[['A']])
    # df['N'] = scaler_N.fit_transform(df[['N']])
    
    #drop age and gender for now
    df.drop(columns = ['Age', 'Gender'], inplace = True)

    
    dfs[filename] = df

## Section 2: Column Transformer

In [14]:
from sklearn.compose import ColumnTransformer

In [15]:
ct = ColumnTransformer(
    [
        ('oScaler', CustomScaler(data_range = [10, 50]), ['O']), 
        ('cScaler', CustomScaler(data_range = [9, 45]), ['C']), 
        ('eScaler', CustomScaler(data_range = [8, 40]), ['E']), 
        ('aScaler', CustomScaler(data_range = [9, 45]), ['A']), 
        ('nScaler', CustomScaler(data_range = [8, 40]), ['N']), 
    ]
)

## Section 3: Model and Hyperparameters Initialization

### 3.1 Model Initialization

In [16]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor


In [17]:
linreg = LinearRegression()
ridge = Ridge(max_iter = -1)
lasso = Lasso(max_iter = -1)
elasticnet = ElasticNet()

svr = SVR(verbose = 0, max_iter = -1)
nusvr = NuSVR(verbose = 0, max_iter = -1)
lsvr = LinearSVR(verbose = 0, max_iter = -1)

dt = DecisionTreeRegressor()

rfr = RandomForestRegressor( random_state = 42)
gbr = GradientBoostingRegressor( random_state = 42)
vr = VotingRegressor(estimators = RandomForestRegressor(random_state = 42))
br = BaggingRegressor( random_state = 42)
abr = AdaBoostRegressor(random_state = 42)

nn = MLPRegressor(random_state = 42, verbose = 0)


### 3.2 Pipleline initialization

In [18]:
from sklearn.pipeline import Pipeline

In [19]:
pipe = Pipeline([('preprocessor', ct), ('regressor', nn)])

### 3.3 Hyperparams Initialization

In [20]:
hyperparameters = [
    {
        'regressor__alpha': [0, 0.5, 1, 2],
        'regressor__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
        'regressor__tol': [0.0001, 0.00001, 0.001],
        'regressor': [ridge]
    },
    {
        'regressor__alpha': [0, 0.5, 1, 2],
        'regressor__tol': [0.0001, 0.00001, 0.001],
        'regressor__selection': ['cyclic', 'random'],
        'regressor': [lasso]
    },
    {
        'regressor__alpha': [0, 0.5, 1, 2],
        'regressor__l1_ratio': [0.5, 0.05, 0.005],
        'regressor__tol': [0.0001, 0.00001, 0.001],
        'regressor__selection': ['cyclic', 'random'],
        'regressor': [elasticnet]
    },
    # {
    #     'regressor__kernel': ['rbf', 'linear', 'poly'],
    #     'regressor__degree': [3, 7, 11],
    #     'regressor__C': [1.0, 1.5, 0.5],
    #     'regressor__epsilon': [0.1, 0.01, 0.001],
    #     'regressor': [svr]
    # },
    # {
    #     'regressor__kernel': ['rbf', 'linear', 'poly'],
    #     'regressor__degree': [3, 7, 11],
    #     'regressor__C': [1.0, 1.5, 0.5],
    #     'regressor__nu': [0.01, 0.5, 1],
    #     'regressor': [nusvr]
    # },
    # {
    #     'regressor__epsilon': [0.0, 0.5, 0.05],
    #     'regressor__C': [1.0, 0.5, 1.5],
    #     'regressor__tol': [0.0001, 0.00001, 0.001],
    #     'regressor__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
    #     'regressor': [lsvr]
    # },
    {
        'regressor__criterion': ['squared_error', 'poisson', 'friedman_mse', 'absolute_error'],
        'regressor__splitter': ['best', 'random'],
        'regressor__max_depth': [4, 8, 12],
        'regressor__min_samples_split': [4, 8, 12],
        'regressor': [dt]
    },
    {
        'regressor__n_estimators': [15, 25, 45, 75, 100, 125],
        'regressor__criterion': ['mse', 'mae', 'friedman_mse', 'poisson'],
        'regressor__max_depth': [4, 5, 8, 12],
        'regressor__min_samples_split': [4, 8, 12],
        'regressor__bootstrap': [True, False],
        'regressor': [rfr]
    },
    {
        'regressor__n_estimators': [75, 100, 125],
        'regressor__max_depth': [4, 8, 12],
        'regressor__min_samples_split': [4, 8, 12],
        'regressor__learning_rate': [1.0, 0.1, 0.01],
        'regressor__loss': ['ls', 'lad', 'huber', 'quantile'],
        'regressor': [gbr]
    },
    {
        'regressor__base_estimator': [DecisionTreeRegressor(random_state = 42), RandomForestRegressor(random_state = 42,  n_estimators = 20) ],
        'regressor__n_estimators': [10, 20, 5],
        'regressor__bootstrap': [True, False],
        'regressor': [br]
    },
    {
        'regressor__base_estimator': [DecisionTreeRegressor(random_state = 42, max_depth = 10), RandomForestRegressor(random_state = 42, n_estimators = 20) ],
        'regressor__n_estimators': [25, 50, 75],
        'regressor__learning_rate': [1.0, 0.1, 0.05],
        'regressor__loss': ['linear', 'square', 'exponential'],
        'regressor': [abr]
    },
    {
        'regressor__hidden_layer_sizes' : [(20, 20, 20, 20, 20), (15, 15, 15, 15, 15, 15, 15), (10, 10, 10, 10, 10, 10, 10, 10, 10, 10)],
        'regressor__alpha': [0.0001, 0.001, 0.00001],
        'regressor__learning_rate_init' : [0.0001, 0.001, 0.01],
        'regressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
        'regressor__solver' : ['adam', 'sgd', 'lbfgs'],
        'regressor__activation' : ['relu', 'tanh'],
        'regressor': [nn]
    },

]


## Section 4: Model Training and Prediction

PIPELINE: Iterate through each df -> initialize a grid search -> train models -> get predictions -> save best settings and model -> display hassle, model, r2 score

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from pathlib import Path
import joblib
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

In [22]:
# Iterate through each df
for df_name, df in dfs.items():
    
    # Ignore warnings
    import warnings
    warnings.filterwarnings("ignore")

    # Separate df to X and y
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    

    # Perform train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Iterate through each model and the hyperparams
    grid = GridSearchCV(pipe, hyperparameters)
    grid.fit(X_train, y_train)
    display(df.head())
    grid.best_params_

    # Get best model and hyperparams
    best_model = grid.best_estimator_
    best_params = grid.best_params_

    # Get best model name
    name = type(best_model).__name__

    # Define directory where model and hyperparams will be saved
    hyperparams_directory = Path('../models/specs/')
    model_directory = Path('../models/model/')

    
    # Generate filename
    hyperparams_file = hyperparams_directory / f"{df_name}_{name}_hyperparams.pkl"
    model_file =model_directory / f"{df_name}_{name}_model.pkl"

    # Save model
    with open(model_file, 'wb') as f:
        joblib.dump(best_model, f)

    # Save hyperparams
    with open(hyperparams_file, 'wb') as f:
        joblib.dump(best_params, f)

    # Make predictions
    y_pred = best_model.predict(X_test)

    # Get r2 score
    r2 = r2_score(y_test, y_pred)

    # Print r2 score
    print(df_name, " ", best_model, " R2 score: ", r2)

Unnamed: 0,O,C,E,A,N,Hassle,Severity
40,31,26,21,26,30,1,3
65,48,27,23,33,28,2,4
4,40,27,20,18,27,1,3
47,28,32,28,34,20,0,3
42,31,26,21,26,30,3,1


General_hassles   Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('oScaler',
                                                  CustomScaler(data_range=[10,
                                                                           50]),
                                                  ['O']),
                                                 ('cScaler',
                                                  CustomScaler(data_range=[9,
                                                                           45]),
                                                  ['C']),
                                                 ('eScaler',
                                                  CustomScaler(data_range=[8,
                                                                           40]),
                                                  ['E']),
                                                 ('aScaler',
                                                  Cus

Unnamed: 0,O,C,E,A,N,Hassle,Severity
72,40,26,23,25,28,9,5
110,38,25,24,36,19,0,1
298,39,31,28,28,27,3,5
108,38,25,24,36,19,9,1
277,35,30,21,31,26,1,3


Inner_concerns   Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('oScaler',
                                                  CustomScaler(data_range=[10,
                                                                           50]),
                                                  ['O']),
                                                 ('cScaler',
                                                  CustomScaler(data_range=[9,
                                                                           45]),
                                                  ['C']),
                                                 ('eScaler',
                                                  CustomScaler(data_range=[8,
                                                                           40]),
                                                  ['E']),
                                                 ('aScaler',
                                                  Cust

Unnamed: 0,O,C,E,A,N,Hassle,Severity
108,33,29,31,29,30,0,5
67,43,37,29,27,30,1,5
31,40,26,23,25,28,4,2
119,35,30,21,31,26,2,3
42,43,39,21,31,25,1,1


Financial_concerns   Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('oScaler',
                                                  CustomScaler(data_range=[10,
                                                                           50]),
                                                  ['O']),
                                                 ('cScaler',
                                                  CustomScaler(data_range=[9,
                                                                           45]),
                                                  ['C']),
                                                 ('eScaler',
                                                  CustomScaler(data_range=[8,
                                                                           40]),
                                                  ['E']),
                                                 ('aScaler',
                                                  

Unnamed: 0,O,C,E,A,N,Hassle,Severity
165,33,25,20,28,19,6,2
6,31,27,17,33,24,5,2
111,42,28,20,39,19,6,3
172,32,31,27,35,23,8,2
115,42,28,20,39,19,0,3


Time_Pressures   Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('oScaler',
                                                  CustomScaler(data_range=[10,
                                                                           50]),
                                                  ['O']),
                                                 ('cScaler',
                                                  CustomScaler(data_range=[9,
                                                                           45]),
                                                  ['C']),
                                                 ('eScaler',
                                                  CustomScaler(data_range=[8,
                                                                           40]),
                                                  ['E']),
                                                 ('aScaler',
                                                  Cust