# Ablation: Personality Hassles notebook


## Section 1: Reading the dataset and data preparation

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from scaler import CustomScaler

In [2]:
# Alternative dataset selections used for other ablation runs:
# filenames = ['General_hassles', 'Inner_concerns', 'Financial_concerns', 'Time_Pressures']
# filenames = ['Inner_concerns', 'Financial_concerns', 'Time_Pressures']
# filenames = ['General_hassles', 'Inner_concerns']
filenames = ['Financial_concerns', 'Time_Pressures']
extension = '.csv'
dfs = {}
test_dfs = {}
for filename in filenames:
    # Build the path from `extension` so the suffix is configured in one place.
    df = pd.read_csv(f'../data/category/{filename}{extension}')
    # Shuffle the rows; the fixed seed keeps the order reproducible.
    df = df.sample(frac = 1, random_state = 42)

    # Replace the Hassle labels with integer codes (fitted per file).
    df['Hassle'] = LabelEncoder().fit_transform(df['Hassle'])

    # No per-trait scaling happens here: the ColumnTransformer in Section 2
    # attaches a CustomScaler with the matching data_range to each of the
    # O, C, E, A and N columns.

    # drop age and gender for now
    # (non in-place, so re-running the cell never mutates a frame that is
    # already stored in `dfs`)
    df = df.drop(columns = ['Age', 'Gender'])

    dfs[filename] = df

## Section 2: Column Transformer

In [3]:
from sklearn.compose import ColumnTransformer

### 2.1 Scaler

In [4]:
# Fixed transformer covering all five trait columns: every column gets its own
# CustomScaler configured with that column's data_range.
ct1 = ColumnTransformer(
    transformers = [
        (step_name, CustomScaler(data_range = bounds), [column])
        for step_name, bounds, column in (
            ('oScaler', [10, 50], 'O'),
            ('cScaler', [9, 45], 'C'),
            ('eScaler', [8, 40], 'E'),
            ('aScaler', [9, 45], 'A'),
            ('nScaler', [8, 40], 'N'),
        )
    ],
)

In [5]:
# This function dynamically creates a column transformer object and sets it on the pipeline

# Takes in the array of columns and pipe
def create_column_transformer(cols, pipe):
    """Build a ColumnTransformer for ``cols`` and install it as the pipeline's first step.

    Parameters
    ----------
    cols : iterable of str
        Trait columns to scale. Only 'O', 'C', 'E', 'A' and 'N' are supported.
    pipe : sklearn.pipeline.Pipeline
        Pipeline whose first step (``pipe.steps[0]``) is replaced in place by
        ``('transformer', <ColumnTransformer>)``.

    Raises
    ------
    ValueError
        If ``cols`` contains a column with no scaler configuration. The
        pipeline is left untouched in that case.
    """
    # Step name and CustomScaler data_range for each supported column.
    scaler_specs = {
        'O': ('oScaler', [10, 50]),
        'C': ('cScaler', [9, 45]),
        'E': ('eScaler', [8, 40]),
        'A': ('aScaler', [9, 45]),
        'N': ('nScaler', [8, 40]),
    }

    # Validate up front so a bad column fails loudly instead of silently
    # leaving the previous transformer on the pipeline.
    unsupported = [col for col in cols if col not in scaler_specs]
    if unsupported:
        raise ValueError(f"No scaler configured for column(s): {unsupported}")

    scaler_list = []
    for col in cols:
        step_name, data_range = scaler_specs[col]
        # Copy the range so every scaler owns an independent list.
        scaler_list.append((step_name, CustomScaler(data_range = list(data_range)), [col]))

    ct = ColumnTransformer(transformers = scaler_list)

    pipe.steps[0] = ('transformer', ct)

### 2.2 Encoder

In [6]:
# LabelEncoder is meant for 1-D targets (its fit/transform accept a single
# argument), so it cannot be used as a ColumnTransformer step on a feature
# column. OrdinalEncoder is the feature-side equivalent that maps each
# category to an integer code.
ct2 = ColumnTransformer(
    [
        ('encoder', OrdinalEncoder(), ['Hassle'])
    ]
)

## Section 3: Model and Hyperparameters Initialization

### 3.1 Model Initialization

In [7]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor


In [8]:
# Candidate regressors. Every estimator that accepts a seed gets
# random_state=42 so repeated runs are comparable.

# Linear models
linreg = LinearRegression()
ridge = Ridge(random_state=42)
lasso = Lasso(random_state=42)
elasticnet = ElasticNet(random_state=42)

# Support vector regressors
svr = SVR(verbose = 0)
nusvr = NuSVR(verbose = 0)
lsvr = LinearSVR(verbose = 0, random_state = 42)

# Single tree
dt = DecisionTreeRegressor(random_state=42)

# Ensembles
rfr = RandomForestRegressor( random_state = 42)
gbr = GradientBoostingRegressor( random_state = 42)
# VotingRegressor expects a list of (name, estimator) tuples, not a bare estimator.
vr = VotingRegressor(estimators = [('rfr', RandomForestRegressor(random_state = 42))])
br = BaggingRegressor( random_state = 42)
abr = AdaBoostRegressor(random_state = 42)

# Neural network
nn = MLPRegressor(random_state = 42, verbose = 0)


### 3.2 Pipeline initialization

In [9]:
from sklearn.pipeline import Pipeline

In [10]:
# Two-step pipeline: the full trait transformer followed by a placeholder
# linear regression.
pipe = Pipeline(
    steps = [
        ('preprocessor1', ct1),
        ('regressor', linreg),
    ]
)

### 3.3 Hyperparams Initialization

In [11]:
# Search space for GridSearchCV: one dict per candidate regressor. The
# 'regressor' key swaps the pipeline's final step; 'regressor__*' keys tune
# that estimator's own parameters.
hyperparameters = [
    # Linear regression (no tunable parameters searched)
    {
        'regressor': [linreg]
    },
    # Ridge
    {
        'regressor__alpha': [0, 0.5, 1, 2],
        'regressor__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
        'regressor__tol': [0.0001, 0.00001, 0.001],
        'regressor': [ridge]
    },
    # Lasso
    {
        'regressor__alpha': [0, 0.5, 1, 2],
        'regressor__tol': [0.0001, 0.00001, 0.001],
        'regressor__selection': ['cyclic', 'random'],
        'regressor': [lasso]
    },
    # Elastic net
    {
        'regressor__alpha': [0, 0.5, 1, 2],
        'regressor__l1_ratio': [0.5, 0.05, 0.005],
        'regressor__tol': [0.0001, 0.00001, 0.001],
        'regressor__selection': ['cyclic', 'random'],
        'regressor': [elasticnet]
    },
    # Decision tree
    {
        'regressor__criterion': ['squared_error', 'poisson', 'friedman_mse', 'absolute_error'],
        'regressor__splitter': ['best'],
        'regressor__max_depth': [4, 8, 12],
        'regressor__min_samples_split': [4, 8, 12],
        'regressor': [dt]
    },
    # Random forest
    {
        'regressor__n_estimators': [10, 15, 20, 25, 30],
        # Same criterion spellings as the decision-tree grid above; the older
        # 'mse'/'mae' aliases are not accepted by current scikit-learn.
        'regressor__criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
        'regressor__max_depth': [4, 5, 8, 12],
        'regressor__min_samples_split': [4, 8, 12],
        'regressor__bootstrap': [True, False],
        'regressor': [rfr]
    },
    # Gradient boosting
    {
        'regressor__n_estimators': [10, 15, 20, 25, 30],
        'regressor__max_depth': [4, 8, 12],
        'regressor__min_samples_split': [4, 8, 12],
        'regressor__learning_rate': [1.0, 0.1, 0.01],
        # 'ls'/'lad' were renamed to 'squared_error'/'absolute_error'.
        'regressor__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
        'regressor': [gbr]
    },
    # Bagging
    {
        # NOTE(review): newer scikit-learn releases call this parameter
        # 'estimator' instead of 'base_estimator' -- confirm against the
        # installed version before re-running.
        'regressor__base_estimator': [
            DecisionTreeRegressor(random_state = 42, max_depth = 8), 
            DecisionTreeRegressor(random_state = 42, max_depth = 12), 
            RandomForestRegressor(random_state = 42,  n_estimators = 8),
            RandomForestRegressor(random_state = 42,  n_estimators = 12),
            ],
        'regressor__n_estimators': [10, 20, 5],
        'regressor__bootstrap': [True, False],
        'regressor': [br]
    },
    # AdaBoost
    {
        # NOTE(review): same 'base_estimator' vs 'estimator' caveat as the
        # bagging grid above.
        'regressor__base_estimator': [
            DecisionTreeRegressor(random_state = 42, max_depth = 8), 
            DecisionTreeRegressor(random_state = 42, max_depth = 12), 
            RandomForestRegressor(random_state = 42, n_estimators = 8), 
            RandomForestRegressor(random_state = 42, n_estimators = 12), 
            ],
        'regressor__n_estimators': [25, 50, 75],
        'regressor__learning_rate': [1.0, 0.1, 0.05],
        'regressor__loss': ['linear', 'square', 'exponential'],
        'regressor': [abr]
    },
    # Multi-layer perceptron
    {
        'regressor__hidden_layer_sizes' : [(20, 20, 20, 20, 20), (15, 15, 15, 15, 15, 15, 15), (10, 10, 10, 10, 10, 10, 10, 10, 10, 10)],
        'regressor__alpha': [0.0001, 0.001, 0.00001],
        'regressor__learning_rate_init' : [0.0001, 0.001, 0.01],
        'regressor__learning_rate': ['constant', 'invscaling', 'adaptive'],
        'regressor__solver' : ['adam', 'sgd', 'lbfgs'],
        'regressor__activation' : ['relu', 'tanh'],
        'regressor': [nn]
    },

]


## Section 4: Model Training and Prediction

PIPELINE: Iterate through each df -> Iterate through each possible combination of OCEAN -> initialize a grid search -> train models -> get predictions -> store subset, r2 scores and best model in an array -> after each iteration of df -> get the highest r2 value in the array

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from pathlib import Path
import joblib
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
import itertools

In [319]:
# Ignore warnings
# NOTE(review): with no category argument this filter silences every warning
# for the rest of the kernel session, so any deprecation or convergence
# warnings emitted during the grid search below will not be shown.
import warnings
warnings.filterwarnings("ignore")

In [320]:
# Ablation study: for every loaded dataset, grid-search each non-empty subset
# of the feature columns, keep the best cross-validated R2 per subset, then
# persist the overall winner for that dataset.
for df_name, df in dfs.items():
    # Candidate features are all columns except the last two.
    feature_columns = df.columns[:-2]

    # The target is the last column; it does not depend on the subset, so it
    # is extracted once per dataset.
    y = df.iloc[:, -1]

    # Initialize a subset, r2 scores and models array
    subsets = []
    r2scores = []
    models = []
    params = []


    # Iterate through each possible subset
    for L in range(1, len(feature_columns) + 1):
        for subset in itertools.combinations(feature_columns, L):
            selected_features = list(subset)
            print("Current selected features: ", selected_features)
            # Initialize X in a single selection so it is a fresh frame rather
            # than a slice that gets a column assigned afterwards.
            # NOTE(review): the transformer built below lists only the subset
            # columns and leaves `remainder` at its default, so 'Hassle' is
            # presumably dropped before the regressor sees it -- confirm
            # whether that is intended.
            X = df[selected_features + ['Hassle']]

            # # Train test split
            # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


            # Change the columns in column transformer
            create_column_transformer(subset, pipe)

            # Iterate through each model and the hyperparams
            grid = GridSearchCV(pipe, hyperparameters, n_jobs= -1, scoring = 'r2')
            grid.fit(X, y)

            # Get best model and hyperparams
            best_model = grid.best_estimator_
            best_params = grid.best_params_
            best_r2 = grid.best_score_

            # # Make predictions
            # y_pred = best_model.predict(X_test)

            # # Get r2 score
            # r2 = r2_score(y_test, y_pred)

            #Save subsets, r2score, params and model
            subsets.append(subset)
            r2scores.append(best_r2)
            params.append(best_params)
            models.append(best_model)

    # Get the index of the highest r2 score on the current dataset
    highest_r2_index = r2scores.index(max(r2scores))
    highest_r2_score = r2scores[highest_r2_index]

    # Get the corresponding subset, params and model
    corr_subset = subsets[highest_r2_index]
    corr_params = params[highest_r2_index]
    corr_model = models[highest_r2_index]

    # Show the results
    print(df_name)
    print("Subset: ", corr_subset)
    print("R2 score: ", highest_r2_score)
    display(corr_model)

    # Define directory where params and model will be saved
    params_directory = Path('models/specs/')
    model_directory = Path('models/model/')

    # Create the output directories on first use so a fresh checkout does not
    # fail at the very end of a long grid search.
    params_directory.mkdir(parents = True, exist_ok = True)
    model_directory.mkdir(parents = True, exist_ok = True)

    # Get model name
    # (corr_model is the grid search's best_estimator_, so this is the class
    # name of the whole pipeline object, not of the regressor inside it)
    name = type(corr_model).__name__

    # Generate file
    params_file = params_directory / f"{df_name}_{name}_hyperparams.pkl"
    model_file = model_directory / f"{df_name}_{name}_model.pkl"

    # Save hyperparams
    with open(params_file, 'wb') as f:
        joblib.dump(corr_params, f)

    # Save Model
    with open(model_file, 'wb') as f:
        joblib.dump(corr_model, f)



Current selected features:  ['O']


Current selected features:  ['C']
Current selected features:  ['E']
Current selected features:  ['A']
Current selected features:  ['N']
Current selected features:  ['O', 'C']
Current selected features:  ['O', 'E']
Current selected features:  ['O', 'A']
Current selected features:  ['O', 'N']
Current selected features:  ['C', 'E']
Current selected features:  ['C', 'A']
Current selected features:  ['C', 'N']
Current selected features:  ['E', 'A']
Current selected features:  ['E', 'N']
Current selected features:  ['A', 'N']
Current selected features:  ['O', 'C', 'E']
Current selected features:  ['O', 'C', 'A']
Current selected features:  ['O', 'C', 'N']
Current selected features:  ['O', 'E', 'A']
Current selected features:  ['O', 'E', 'N']
Current selected features:  ['O', 'A', 'N']
Current selected features:  ['C', 'E', 'A']
Current selected features:  ['C', 'E', 'N']
Current selected features:  ['C', 'A', 'N']
Current selected features:  ['E', 'A', 'N']
Current selected features:  ['O', 

Current selected features:  ['O']
Current selected features:  ['C']
Current selected features:  ['E']
Current selected features:  ['A']
Current selected features:  ['N']
Current selected features:  ['O', 'C']
Current selected features:  ['O', 'E']
Current selected features:  ['O', 'A']
Current selected features:  ['O', 'N']
Current selected features:  ['C', 'E']
Current selected features:  ['C', 'A']
Current selected features:  ['C', 'N']
Current selected features:  ['E', 'A']
Current selected features:  ['E', 'N']
Current selected features:  ['A', 'N']
Current selected features:  ['O', 'C', 'E']
Current selected features:  ['O', 'C', 'A']
Current selected features:  ['O', 'C', 'N']
Current selected features:  ['O', 'E', 'A']
Current selected features:  ['O', 'E', 'N']
Current selected features:  ['O', 'A', 'N']
Current selected features:  ['C', 'E', 'A']
Current selected features:  ['C', 'E', 'N']
Current selected features:  ['C', 'A', 'N']
Current selected features:  ['E', 'A', 'N']


In [321]:
# `r2scores` is re-initialised for every dataset inside the training loop, so
# this shows the per-subset scores of the last dataset processed only.
display(r2scores)

[0.04967863632709255,
 0.10026002883953308,
 0.16501264937169707,
 0.14950487413117128,
 0.23086899477958633,
 0.32183989805928015,
 0.34236858130074443,
 0.3282161872371328,
 0.3405802573760292,
 0.331105571886361,
 0.28939576263924194,
 0.3342757818236769,
 0.3226040904606832,
 0.33999603794285693,
 0.30014332601292193,
 0.3346921835579253,
 0.3318532201412035,
 0.33385092340097,
 0.3415571313739855,
 0.33340919918625256,
 0.33867232564078736,
 0.33208659186857525,
 0.33939725916339664,
 0.3356480392227703,
 0.33224268668230394,
 0.34470539551477686,
 0.33537957299183213,
 0.33495189538263537,
 0.33228986586597087,
 0.3390341802469118,
 0.3357710200190136]

In [22]:
# Reload a previously saved pipeline and render it.
# NOTE(review): this file belongs to 'Inner_concerns', which is not in the
# `filenames` list active at the top of this notebook, so it must already
# exist on disk from an earlier run. joblib.load unpickles its input -- only
# load files from a trusted source.
model = joblib.load('models/model/Inner_concerns_Pipeline_model.pkl')
display(model)