In [1]:
import models
import tensorflow as tf
import pandas as pd 
import numpy as np
import kagglehub

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, PredefinedSplit
import itertools

from typing import List, Dict, Any

%load_ext autoreload
%autoreload

## General Setup

In [2]:
# Load data
# Download the Kaggle sentiment dataset; `path` is the location returned by kagglehub.
path = kagglehub.dataset_download('abhi8923shriv/sentiment-analysis-dataset')
train_dataset = f"{path}/train.csv"
test_dataset = f"{path}/test.csv"
# Both CSV files are decoded as ISO-8859-1.
train_df, test_df = (
    pd.read_csv(csv_file, encoding='ISO-8859-1')
    for csv_file in (train_dataset, test_dataset)
)

In [3]:
# Basic preprocessing (TODO: replace with the shared preprocessor)
# Keep only rows where both the text and its label are present. The same column
# subset is used for train and test: a bare `dropna()` also drops rows that are
# missing a value in any *other* column, and filtering train on "text" alone
# would let an unlabeled row reach the label encoder.
REQUIRED_COLUMNS = ["text", "sentiment"]
train = train_df.dropna(subset=REQUIRED_COLUMNS)[REQUIRED_COLUMNS]
test = test_df.dropna(subset=REQUIRED_COLUMNS)[REQUIRED_COLUMNS]

# Integer-encode the sentiment labels, then make a stratified 70/30 train/validation split.
x = train["text"].values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train["sentiment"].values)
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, stratify=y, random_state=42, test_size=0.3, shuffle=True
)

# The TF-IDF vocabulary is fitted on the training split only; the validation
# split is transformed with that fitted vocabulary.
# NOTE(review): `.toarray()` turns the sparse TF-IDF matrices into dense arrays,
# which can be large - confirm the model really needs dense input.
vectorizer = TfidfVectorizer()
x_train_transformed = vectorizer.fit_transform(x_train).toarray()
x_valid_transformed = vectorizer.transform(x_valid).toarray()

# One-hot encode the integer labels (three sentiment classes, matching the
# 3-unit softmax output layer used later in the notebook).
y_train = tf.keras.utils.to_categorical(y_train, num_classes=3)
y_valid = tf.keras.utils.to_categorical(y_valid, num_classes=3)

In [15]:
# Sanity check: show the first five one-hot encoded training labels.
print(y_train[:5])

[[0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [None]:
%autoreload
# Base layer specs: each one is a (layer kind, keyword arguments) pair.
def _dense_spec(units):
    """Return a ("dense", kwargs) spec: ReLU activation with a fresh L2(0.01) kernel regularizer."""
    return (
        "dense",
        {
            "units": units,
            "kernel_regularizer": tf.keras.regularizers.l2(0.01),
            "activation": "relu",
        },
    )


# The input width equals the number of TF-IDF features.
input_layer = ("input", {"shape": (x_train_transformed.shape[1], )})
dense_256 = _dense_spec(256)
# NOTE(review): this is a plain string, not a 1-tuple - parentheses without a
# trailing comma do not create a tuple. Confirm models.TfModel expects a bare
# "dropout" marker here.
dropout = "dropout"
dense_128 = _dense_spec(128)
dense_64 = _dense_spec(64)
output_layer = ("output", {"units": 3, "activation": "softmax"})

# Example of how to assemble a model's layer list by hand:
# inner_list = [dense_256, dense_128, dense_64]
# l = [input_layer] + inner_list + [output_layer]

## Init permutations

In [5]:
# All ordered width triples drawn (with repetition) from the three dense sizes.
permutations = list(itertools.product([256, 128, 64], repeat=3))
# Keep only the triples whose total unit count is below 512.
permutations_lt_512 = [widths for widths in permutations if sum(widths) < 512]
permutations_lt_512

[(256, 128, 64),
 (256, 64, 128),
 (256, 64, 64),
 (128, 256, 64),
 (128, 128, 128),
 (128, 128, 64),
 (128, 64, 256),
 (128, 64, 128),
 (128, 64, 64),
 (64, 256, 128),
 (64, 256, 64),
 (64, 128, 256),
 (64, 128, 128),
 (64, 128, 64),
 (64, 64, 256),
 (64, 64, 128),
 (64, 64, 64)]

In [6]:
# Every on/off combination of three dropout flags (one flag per dense layer slot).
drop_pos = [mask for mask in itertools.product((False, True), repeat=3)]
drop_pos

[(False, False, False),
 (False, False, True),
 (False, True, False),
 (False, True, True),
 (True, False, False),
 (True, False, True),
 (True, True, False),
 (True, True, True)]

In [7]:
# Pair every width triple with every dropout mask and interleave them into a
# flat token list: [width_1, flag_1, width_2, flag_2, width_3, flag_3].
full_layer_permuations = [
    [token for pair in zip(widths, mask) for token in pair]
    for widths, mask in itertools.product(permutations_lt_512, drop_pos)
]

In [None]:
def create_layers_from_permutation(input_layer, output_layer, permutation):
    l = [input_layer]
    for i in range(len(permutation)):
        match permutation[i]:
            case 256:
                l += [dense_256]
            case 128:
                l += [dense_128]
            case 64:
                l += [dense_64]
            case True:
                l += [dropout]
            case False:
                pass
    l += [output_layer]
    return l


## Param Grid
Searching for layer permutations, batch sizes and dropout rates.

In [31]:
# Build the concrete layer lists straight from the raw width triples and dropout
# masks instead of from `full_layer_permuations` itself. Rebuilding the list
# from its own previous value made this cell non-idempotent: on a second run the
# already expanded layer specs were fed back in as tokens, matched nothing, and
# every entry collapsed to [input_layer, output_layer].
full_layer_permuations = [
    create_layers_from_permutation(
        input_layer,
        output_layer,
        list(itertools.chain.from_iterable(zip(p, d))),
    )
    for p in permutations_lt_512
    for d in drop_pos
]
# Search space: model architecture, batch size and dropout rate.
param_grid = {
    "layer_list": full_layer_permuations,
    "batch_size": [16, 32, 64, 128, 256],
    "dropout_rate": [0.1, 0.2, 0.3, 0.4]
}

In [9]:
# Peek at the first two assembled layer lists.
full_layer_permuations[:2]

[[('input', {'shape': (21156,)}),
  ('dense',
   {'units': 256,
    'kernel_regularizer': <keras.src.regularizers.regularizers.L2 at 0x210739f8790>,
    'activation': 'relu'}),
  ('dense',
   {'units': 128,
    'kernel_regularizer': <keras.src.regularizers.regularizers.L2 at 0x21073a79990>,
    'activation': 'relu'}),
  ('dense',
   {'units': 64,
    'kernel_regularizer': <keras.src.regularizers.regularizers.L2 at 0x21073c0bcd0>,
    'activation': 'relu'}),
  ('output', {'units': 3, 'activation': 'softmax'})],
 [('input', {'shape': (21156,)}),
  ('dense',
   {'units': 256,
    'kernel_regularizer': <keras.src.regularizers.regularizers.L2 at 0x210739f8790>,
    'activation': 'relu'}),
  ('dense',
   {'units': 128,
    'kernel_regularizer': <keras.src.regularizers.regularizers.L2 at 0x21073a79990>,
    'activation': 'relu'}),
  ('dense',
   {'units': 64,
    'kernel_regularizer': <keras.src.regularizers.regularizers.L2 at 0x21073c0bcd0>,
    'activation': 'relu'}),
  'dropout',
  ('outpu

## Random Search

In [32]:
# Manual validation set
# PredefinedSplit reads one fold id per row: -1 keeps a row out of every test
# fold (so it is always used for training), 0 assigns it to the single
# validation fold.
n_train_rows = len(x_train_transformed)
n_valid_rows = len(x_valid_transformed)
split_index = [-1] * n_train_rows + [0] * n_valid_rows
# Stack training rows first and validation rows second, in the same order as `split_index`.
X = np.concatenate((x_train_transformed, x_valid_transformed))
y = np.concatenate((y_train, y_valid))
pds = PredefinedSplit(test_fold=split_index)

In [28]:
%autoreload
# Randomized search over `param_grid`, scored by macro-F1 on the single
# predefined validation fold (`pds`). `random_state` is fixed so the 15 sampled
# configurations are the same on every run; without it each run explored a
# different subset and results could not be reproduced.
# The layer list and batch size passed to TfModel are only starting values -
# presumably the search overrides them with the sampled parameters (confirm
# TfModel supports get_params/set_params).
# NOTE(review): `y` is one-hot encoded at this point; confirm that
# models.TfModel.predict returns labels in a format the "f1_macro" scorer accepts.
rscv = RandomizedSearchCV(
    models.TfModel(full_layer_permuations[0], batch_size=32),
    param_grid,
    scoring="f1_macro",
    cv=pds,
    n_iter=15,
    random_state=42,
)

In [29]:
# Run the search: with `cv=pds`, each sampled configuration is trained on the
# rows marked -1 in `split_index` and scored on the rows marked 0.
rscv.fit(X, y)

Epoch 1/10
[1m1203/1203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 32ms/step - accuracy: 0.4072 - loss: 2.9265 - learning_rate: 1.0000e-04
Epoch 2/10
[1m1203/1203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 32ms/step - accuracy: 0.4588 - loss: 1.1807 - learning_rate: 9.0484e-05
Epoch 3/10
[1m1203/1203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 32ms/step - accuracy: 0.5884 - loss: 1.0345 - learning_rate: 8.1873e-05
Epoch 4/10
[1m1203/1203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 32ms/step - accuracy: 0.6735 - loss: 0.9849 - learning_rate: 7.4082e-05
Epoch 5/10
[1m1203/1203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 32ms/step - accuracy: 0.7000 - loss: 0.9511 - learning_rate: 6.7032e-05
Epoch 6/10
[1m1203/1203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 32ms/step - accuracy: 0.7116 - loss: 0.9396 - learning_rate: 6.0653e-05
Epoch 7/10
[1m1203/1203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 33ms/ste

In [30]:
# Show the estimator refitted with the best-scoring sampled parameters
# (available because `refit` was left at its default).
rscv.best_estimator_

## Findings
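Higher dropout rates and smaller batch sizes gave the better validation scores. The effect of the layer permutation is still inconclusive: 15 random-search iterations are too few to compare the 136 candidate layer lists.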