### Model 3: GRU Model met Dropout

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv('data/dataset/training.csv.gz')
#df_test = pd.read_csv('data/dataset/testing.csv.gz')
df_val = pd.read_csv('data/dataset/validation.csv.gz')

In [3]:
# Functie om het aantal woorden in een tekst te tellen
def count_words_in_text(text):
    return len(text.split())

# Functie om de data te filteren en alleen teksten met exact 600 woorden te behouden
def filter_exact_600_word_texts(df, word_limit=600):
    # Filter de data op teksten die exact 600 woorden bevatten
    filtered_df = df[df['txt'].apply(lambda x: count_words_in_text(x) == word_limit)]
    return filtered_df

In [4]:
# Pas de filterfunctie toe op df_train en df_val
df_train_filtered = filter_exact_600_word_texts(df_train)
df_val_filtered = filter_exact_600_word_texts(df_val)

In [5]:
# Stap 1: Tokenization en Padding voorbereiden
max_words = 10000
maxlen = 600  # Aangezien je alle teksten hebt gefilterd om precies 600 woorden te bevatten

# Tokenizer aanmaken en trainen op de gefilterde train data
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_train_filtered['txt'])

# Zet de tekst om in tokens en pad de sequences tot maxlen = 600
sequences_train = tokenizer.texts_to_sequences(df_train_filtered['txt'])
padded_train = pad_sequences(sequences_train, maxlen=maxlen)

# Doe hetzelfde voor de validatie dataset
sequences_val = tokenizer.texts_to_sequences(df_val_filtered['txt'])
padded_val = pad_sequences(sequences_val, maxlen=maxlen)

In [6]:
# Stap 2: Label Encoding voor de genre labels
label_encoder = LabelEncoder()
df_train_filtered.loc[:, 'genre_nr'] = label_encoder.fit_transform(df_train_filtered['main genre'])
df_val_filtered.loc[:, 'genre_nr'] = label_encoder.transform(df_val_filtered['main genre'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_filtered.loc[:, 'genre_nr'] = label_encoder.fit_transform(df_train_filtered['main genre'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val_filtered.loc[:, 'genre_nr'] = label_encoder.transform(df_val_filtered['main genre'])


In [7]:
import numpy as np
from tensorflow.keras.utils import Sequence

# Generator class voor batch-wise data loading
class DataGenerator(Sequence):
    def __init__(self, texts, labels, batch_size, tokenizer, maxlen=600, max_words=10000, shuffle=True):
        self.texts = texts
        self.labels = labels
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.max_words = max_words
        self.indexes = np.arange(len(self.texts))
        self.shuffle = shuffle
        self.on_epoch_end()
        
    def __len__(self):
        # Aantal batches per epoch
        return int(np.floor(len(self.texts) / self.batch_size))
    
    def __getitem__(self, index):
        # Haal een batch indexen op
        batch_indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        
        # Haal de corresponderende teksten en labels op
        batch_texts = [self.texts[k] for k in batch_indexes]
        batch_labels = [self.labels[k] for k in batch_indexes]
        
        # Tokenize en pad de teksten in deze batch
        sequences = self.tokenizer.texts_to_sequences(batch_texts)
        padded_sequences = pad_sequences(sequences, maxlen=self.maxlen)
        
        return np.array(padded_sequences), np.array(batch_labels)
    
    def on_epoch_end(self):
        # Schud de indexen aan het einde van elke epoch indien nodig
        if self.shuffle:
            np.random.shuffle(self.indexes)

# Initialiseer de tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_train_filtered['txt'])

# Maak de data generatoren voor training en validatie
train_generator = DataGenerator(
    texts=df_train_filtered['txt'].values, 
    labels=df_train_filtered['genre_nr'].values, 
    batch_size=32, 
    tokenizer=tokenizer
)

val_generator = DataGenerator(
    texts=df_val_filtered['txt'].values, 
    labels=df_val_filtered['genre_nr'].values, 
    batch_size=32, 
    tokenizer=tokenizer
)

In [11]:
# Stap 3: Maak het model
model = models.Sequential([
    layers.Embedding(input_dim=100_000, output_dim=7, input_length=600),
    layers.Dropout(.5),
    layers.Flatten(),
    layers.Dropout(.8),
    layers.Dense(7, activation='softmax')
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 600, 7)            700000    
                                                                 
 dropout_1 (Dropout)         (None, 600, 7)            0         
                                                                 
 flatten (Flatten)           (None, 4200)              0         
                                                                 
 dropout_2 (Dropout)         (None, 4200)              0         
                                                                 
 dense_2 (Dense)             (None, 7)                 29407     
                                                                 
Total params: 729,407
Trainable params: 729,407
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Stap 4: Compileer het model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [13]:
# Train het model met de generatoren
history = model.fit(
    train_generator,
    epochs=10,
    validation_data=val_generator
)

Epoch 1/10
Epoch 2/10

In [None]:
# Stap 6: Evalueer het model
val_loss, val_accuracy = model.evaluate(padded_val, df_val_filtered['genre_nr'])
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")