# Hyperparameter Tuning

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

### Read in Data

In [2]:
data = pd.read_csv("C:\\Users\\danij\\Documents\\UC3M\\TFG\\DATA\\data_cleaned_features.csv")

data.head()

Unnamed: 0,tweet_id,therapy,label,cleaned_text,avg_word_length,sia_positive_word_rate,sia_negative_word_rate,neutral_score,stopword_count,body_len,compound_score,punct%,positive_score,negative_score,neutral_score.1
0,1454224517895688192,adderall,neutral,wait get adderall prescription imma time every...,4.692308,0.153846,0.0,1.0,29,61,0.0,1.6,0.0,0.0,1.0
1,1426258820376842243,oxycodone,negative,sassychickie kellyrdc fentanyl oxycontin oxyco...,6.846154,0.230769,0.0,1.0,30,89,0.0,10.1,0.0,0.0,1.0
2,1473007602170798082,cbd,neutral,fun juggling act mine taking adderall drinking...,4.545455,0.136364,0.0,0.571,43,100,0.6249,1.0,0.331,0.097,0.571
3,1561156143405502466,percocet,neutral,percocet roxycodone xanax crushed dust elevate...,4.16,0.08,0.0,0.781,57,105,-0.4215,0.0,0.0,0.219,0.781
4,1559923718578741248,adderall,negative,first day adderall feel,4.75,0.375,0.0,1.0,14,38,0.0,0.0,0.0,0.0,1.0


### Convert label to numeric

In [3]:
sentiment_label = {'neutral': 0, 'positive': 1, 'negative': 2}

data['num_label'] = data['label'].map(sentiment_label)

data.head()

Unnamed: 0,tweet_id,therapy,label,cleaned_text,avg_word_length,sia_positive_word_rate,sia_negative_word_rate,neutral_score,stopword_count,body_len,compound_score,punct%,positive_score,negative_score,neutral_score.1,num_label
0,1454224517895688192,adderall,neutral,wait get adderall prescription imma time every...,4.692308,0.153846,0.0,1.0,29,61,0.0,1.6,0.0,0.0,1.0,0
1,1426258820376842243,oxycodone,negative,sassychickie kellyrdc fentanyl oxycontin oxyco...,6.846154,0.230769,0.0,1.0,30,89,0.0,10.1,0.0,0.0,1.0,2
2,1473007602170798082,cbd,neutral,fun juggling act mine taking adderall drinking...,4.545455,0.136364,0.0,0.571,43,100,0.6249,1.0,0.331,0.097,0.571,0
3,1561156143405502466,percocet,neutral,percocet roxycodone xanax crushed dust elevate...,4.16,0.08,0.0,0.781,57,105,-0.4215,0.0,0.0,0.219,0.781,0
4,1559923718578741248,adderall,negative,first day adderall feel,4.75,0.375,0.0,1.0,14,38,0.0,0.0,0.0,0.0,1.0,2


### Split into train, validation, and test set

In [19]:
# Divide variables into features and labels

# Keep only numeric features
features = data[['tweet_id', 'num_label', 
     'avg_word_length', 'sia_positive_word_rate', 'sia_negative_word_rate', 'neutral_score', 'stopword_count', 
     'body_len', 'compound_score', 'punct%', 'positive_score', 'negative_score', 'neutral_score']]

labels = data['num_label']

# First split into train(60%) and test(40%), as we can only split dataset into 2
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.4, random_state=42)

# Now split test(40%) into test(20%) and validation(20%)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [20]:
print(len(X_train))
print(len(y_train))

1805
1805


In [21]:
# Check if we splitted correctly
for dataset in [y_train, y_val, y_test]:
    print(round(len(dataset) / len(labels), 2))

0.6
0.2
0.2


### Perform GridSearchCV

In [22]:
# Function to show performance of hyperparameters
def print_results(results):
    # Print the best parameters found during the cross-validation
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    # Extract the mean and standard deviation of the test scores from the results object
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']

    # Iterate over the means, stds, and params simultaneously using the zip function
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        # Print the mean score, the range (mean +/- 2 * std), and the corresponding parameters
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [23]:
# Try different models and see their performance

# Create an instance of the Random Forest Classifier
rf = RandomForestClassifier()

# Define the parameters to be tuned in the grid search
parameters = {
    'n_estimators': [5, 50, 100, 200],
    'max_depth': [2, 10, 20, None]
}

# Create an instance of GridSearchCV with the Random Forest Classifier and parameter grid
cv = GridSearchCV(rf, parameters, cv=5)

# Fit the training features and labels to the grid search cross-validation
cv.fit(X_train, y_train.values.ravel())

# Print the results of the grid search cross-validation
print_results(cv)

BEST PARAMS: {'max_depth': 10, 'n_estimators': 100}

0.837 (+/-0.192) for {'max_depth': 2, 'n_estimators': 5}
0.781 (+/-0.083) for {'max_depth': 2, 'n_estimators': 50}
0.801 (+/-0.092) for {'max_depth': 2, 'n_estimators': 100}
0.807 (+/-0.079) for {'max_depth': 2, 'n_estimators': 200}
0.975 (+/-0.036) for {'max_depth': 10, 'n_estimators': 5}
0.999 (+/-0.002) for {'max_depth': 10, 'n_estimators': 50}
1.0 (+/-0.0) for {'max_depth': 10, 'n_estimators': 100}
1.0 (+/-0.0) for {'max_depth': 10, 'n_estimators': 200}
0.977 (+/-0.022) for {'max_depth': 20, 'n_estimators': 5}
1.0 (+/-0.0) for {'max_depth': 20, 'n_estimators': 50}
1.0 (+/-0.0) for {'max_depth': 20, 'n_estimators': 100}
1.0 (+/-0.0) for {'max_depth': 20, 'n_estimators': 200}
0.988 (+/-0.02) for {'max_depth': None, 'n_estimators': 5}
1.0 (+/-0.0) for {'max_depth': None, 'n_estimators': 50}
1.0 (+/-0.0) for {'max_depth': None, 'n_estimators': 100}
1.0 (+/-0.0) for {'max_depth': None, 'n_estimators': 200}


In [24]:
cv.best_estimator_

RandomForestClassifier(max_depth=10)