This notebook evaluates the trained RNN model on the held-out test set and saves the resulting loss, accuracy, and top-5 accuracy to a JSON log.

In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from keras import models, layers
from keras import preprocessing
from keras_preprocessing.image import ImageDataGenerator
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras.layers import Embedding, LSTM, Dropout, Dense, Input, Bidirectional, Flatten, Conv2D, MaxPooling2D, concatenate, Conv1D, MaxPooling1D

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# Full podcast dataset; the train/validation/test splits are carved out of this frame.
DATASET_CSV = '/content/drive/MyDrive/Colab_Notebooks/podcast/appledataset/podcast_final.csv'
df = pd.read_csv(DATASET_CSV)

In [3]:
from sklearn.utils import shuffle

# Rows [0, TRAIN_VALID_ROWS) of the shuffled frame are the train/validation
# pool; every row after that is the held-out test set.
TRAIN_VALID_ROWS = 28279
SPLIT_SEED = 42

# NOTE: this cell is not idempotent -- re-running it shuffles the already
# shuffled frame and wraps the genres in another list level. Reload `df` first.
df = shuffle(df, random_state=SPLIT_SEED)
# Each label becomes a one-element list, the per-row form later handed to
# MultiLabelBinarizer.
df['Primary Genre'] = [[genre] for genre in df['Primary Genre'].tolist()]

# .copy() gives every split its own data, so assigning columns on a split
# neither raises SettingWithCopyWarning nor risks writing through to `df`.
df_test = df[TRAIN_VALID_ROWS:].copy()
# random_state pins the train/validation partition so reruns are reproducible.
df_train, df_valid = train_test_split(
    df[:TRAIN_VALID_ROWS], shuffle=True, train_size=0.8, random_state=SPLIT_SEED
)
df_train = df_train.copy()
df_valid = df_valid.copy()

In [4]:
# Saved checkpoint of the RNN being evaluated in this notebook.
MODEL_CHECKPOINT = '/content/drive/MyDrive/Colab_Notebooks/podcast/my_RNN_1layer:checkpoint.h5'
model = keras.models.load_model(MODEL_CHECKPOINT)

In [6]:
stopw = stopwords.words('english')
# Set view of the stop words: O(1) membership tests instead of a list scan.
_stopw_set = frozenset(stopw)
# Deliberately NOT called `tokenizer`: a later cell binds that name to the
# Keras Tokenizer (which has no .tokenize method), so sharing the name made
# clean_text fail whenever it was called after that cell had run.
word_tokenizer = nltk.RegexpTokenizer(r"\w+")

lemmatizer = WordNetLemmatizer()
def clean_text(text):
  """Normalise one description string for the text model.

  Steps: lower-case and strip the text, split it into ``\\w+`` tokens (which
  drops punctuation), remove English stop words, lemmatize each remaining
  token, and join the result with single spaces.

  Args:
    text: the raw description; must be a ``str``.

  Returns:
    The cleaned text as a single space-separated string ("" if nothing is left).
  """
  tokens = word_tokenizer.tokenize(text.lower().strip())
  # Tokens are already lower-case here, so they can be compared directly.
  kept = [w for w in tokens if w not in _stopw_set]
  return " ".join(lemmatizer.lemmatize(w) for w in kept)

In [7]:
# Run clean_text over the description column of every split, replacing the
# raw text in place on each frame.
for split_frame in (df_train, df_valid, df_test):
  split_frame['Description'] = split_frame['Description'].map(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Learn the genre classes from the whole dataset so that all splits are
# encoded by one and the same fitted binarizer.
genre_lists = df['Primary Genre'].tolist()
mlb = MultiLabelBinarizer().fit(genre_lists)
mlb.classes_

array(['After Shows', 'Animation & Manga', 'Arts', 'Books', 'Business',
       'Comedy', 'Design', 'Documentary', 'Education', 'Fashion & Beauty',
       'Fiction', 'Food', 'Games', 'Health', 'Hobbies', 'Interviews',
       'Kids & Family', 'Music', 'Nature', 'News', 'Non-Profit',
       'Personal', 'Pets & Animals', 'Places & Travel', 'Politics',
       'Religion', 'Science', 'Sexuality', 'Society & Culture', 'Sports',
       'Stand-Up', 'TV & Film', 'Technology', 'True Crime', 'Vehicles'],
      dtype=object)

In [9]:
# Encode every row's genre list with the fitted binarizer and show the
# resulting (rows, classes) shape as a sanity check.
genres = list(df['Primary Genre'])
transformed_labels = mlb.transform(genres)
transformed_labels.shape

(29279, 35)

In [10]:
# One label matrix per split, all produced by the same fitted binarizer.
train_labels, val_labels, test_labels = [
  mlb.transform(split_frame['Primary Genre'].tolist())
  for split_frame in (df_train, df_valid, df_test)
]
# Only a cell's final expression is rendered, so just the test shape is shown.
test_labels.shape

(1000, 35)

In [11]:
# Vocabulary cap passed to the Keras tokenizer, and the embedding width.
# Presumably both mirror the values used when the model was trained -- confirm.
MAX_NB_WORDS = 50000
EMBEDDING_DIM = 300

# NOTE(review): len() of a description string counts characters, so this is the
# longest description in characters, not in tokens. Left unchanged on the
# assumption that the saved model was trained with this input length -- confirm.
description_lengths = df['Description'].map(len)
MAX_SEQUENCE_LENGTH = description_lengths.max()

# The word index is fitted on the descriptions of the full frame `df`.
tokenizer = Tokenizer(lower=True, num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['Description'].values)
word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))
print('Max len:', MAX_SEQUENCE_LENGTH)

Found 96388 unique tokens.
Max len: 3979


In [12]:
def _encode_descriptions(frame):
  """Convert a split's 'Description' column into a padded matrix of word indices.

  Args:
    frame: DataFrame with a 'Description' column of text.

  Returns:
    The result of pad_sequences: one row per description, each padded or
    truncated to MAX_SEQUENCE_LENGTH so every instance has the same length.
  """
  sequences = tokenizer.texts_to_sequences(frame['Description'].values)
  return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Each print names the split it reports on (previously all three said "train").
X_text_train = _encode_descriptions(df_train)
print('Shape of train tensor:', X_text_train.shape)
X_text_test = _encode_descriptions(df_test)
print('Shape of test tensor:', X_text_test.shape)
X_text_val = _encode_descriptions(df_valid)
print('Shape of validation tensor:', X_text_val.shape)

Shape of train tensor: (22623, 3979)
Shape of train tensor: (1000, 3979)
Shape of train tensor: (5656, 3979)


In [13]:
# Re-compile the loaded model with the loss and the two metrics (accuracy and
# top-5 categorical accuracy) that the evaluation below reports.
top5_accuracy = tf.keras.metrics.TopKCategoricalAccuracy(k=5)
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy", top5_accuracy],
)

In [14]:
# Score the model on the held-out test split. The result is indexed later as
# [0] loss, [1] accuracy, [2] top-5 accuracy (the order given to compile()).
history_eval = model.evaluate(x=X_text_test, y=test_labels)



In [None]:
import json

# Destination of the evaluation metrics for this run.
EVAL_LOG_PATH = '/content/drive/MyDrive/Colab_Notebooks/podcast/logs/RNN_eval1.json'

# Label the values returned by model.evaluate with their metric names.
history = {'loss': history_eval[0], 'accuracy': history_eval[1], 'top_5_accuracy': history_eval[2]}
print(history)
# A context manager guarantees the file is flushed and closed; the previous
# bare open() left the handle dangling, so the write could be lost.
with open(EVAL_LOG_PATH, 'w') as log_file:
  json.dump(history, log_file)

{'loss': 2.223956823348999, 'accuracy': 0.42399999499320984, 'top_5_accuracy': 0.6940000057220459}
