This notebook creates and trains a simple RNN network for classifiyng podcast using the textual description.

In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
from keras import models, layers
from keras import preprocessing
from keras_preprocessing.image import ImageDataGenerator
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
from sklearn.preprocessing import MultiLabelBinarizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/podcast/appledataset/podcast_final.csv')

In [None]:
df.head(6)

Unnamed: 0,Name,Artwork,Primary Genre,Description,filepaths
0,Firearms Radio Network (All Shows),https://is1-ssl.mzstatic.com/image/thumb/Podca...,Nature,Master Feed,/content/drive/MyDrive/Colab_Notebooks/podcast...
1,Clear and Present Danger - A history of free s...,https://is4-ssl.mzstatic.com/image/thumb/Podca...,Society & Culture,"Why have kings, emperors, and governments kill...",/content/drive/MyDrive/Colab_Notebooks/podcast...
2,2 Idiots and a List,https://is4-ssl.mzstatic.com/image/thumb/Podca...,Music,"Welcome to “2 Idiots and a List”, the podcast ...",/content/drive/MyDrive/Colab_Notebooks/podcast...
3,Forever35,https://is5-ssl.mzstatic.com/image/thumb/Podca...,Fashion & Beauty,Which night creams will make you look well-res...,/content/drive/MyDrive/Colab_Notebooks/podcast...
4,DarbyCast,https://is2-ssl.mzstatic.com/image/thumb/Podca...,Fiction,"Neither the Darbyshire book series, nor this p...",/content/drive/MyDrive/Colab_Notebooks/podcast...
5,Inside Running Podcast,https://is2-ssl.mzstatic.com/image/thumb/Podca...,Sports,"A weekly podcast by 3 fast runners, for all ru...",/content/drive/MyDrive/Colab_Notebooks/podcast...


In [None]:
from sklearn.utils import shuffle
df = shuffle(df, random_state = 42)
df['Primary Genre'] = [[el]for el in df['Primary Genre'].tolist()]
df_train = df[:28279]
df_test = df[28279:]
df_train, df_valid = train_test_split(df_train, shuffle = True, train_size=0.8)

Textual data was pre-processed in order to normalize it. Each description was tokenized, lowered, removed stopwords and lemmatize. This allow to extract the semantic meaning of each word getting rid of the noise. 

In [None]:
stopw = stopwords.words('english')
tokenizer = nltk.RegexpTokenizer(r"\w+")

lemmatizer = WordNetLemmatizer()
def clean_text(text):
  text = text.lower().strip()
  text = tokenizer.tokenize(text)
  clean_text = [w for w in text if w.lower() not in stopw]
  clean_text = " ".join([lemmatizer.lemmatize(w) for w in clean_text])
  return clean_text

In [None]:
df_train['Description'] = [clean_text(el) for el in df_train['Description'].tolist()]
df_valid['Description'] = [clean_text(el) for el in df_valid['Description'].tolist()]
df_test['Description'] = [clean_text(el) for el in df_test['Description'].tolist()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
mlb = MultiLabelBinarizer()
mlb.fit(df['Primary Genre'].tolist())
mlb.classes_

array(['After Shows', 'Animation & Manga', 'Arts', 'Books', 'Business',
       'Comedy', 'Design', 'Documentary', 'Education', 'Fashion & Beauty',
       'Fiction', 'Food', 'Games', 'Health', 'Hobbies', 'Interviews',
       'Kids & Family', 'Music', 'Nature', 'News', 'Non-Profit',
       'Personal', 'Pets & Animals', 'Places & Travel', 'Politics',
       'Religion', 'Science', 'Sexuality', 'Society & Culture', 'Sports',
       'Stand-Up', 'TV & Film', 'Technology', 'True Crime', 'Vehicles'],
      dtype=object)

In [None]:
genres = df['Primary Genre'].tolist()
transformed_labels = mlb.transform(genres)
transformed_labels.shape

(29279, 35)

In [None]:
train_labels = mlb.transform(df_train['Primary Genre'].tolist())
train_labels.shape

val_labels = mlb.transform(df_valid['Primary Genre'].tolist())
val_labels.shape

test_labels = mlb.transform(df_test['Primary Genre'].tolist())
test_labels.shape

(1000, 35)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras.layers import Embedding, LSTM, Dropout, Dense, Input, Bidirectional, Flatten, Conv2D, MaxPooling2D, concatenate, Conv1D, MaxPooling1D
import keras.backend as K
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping

The corpus was prepared for training using the standard methods of keras. It is necessary to obtain the number of unique tokens, creating a dictionary that can be used to map each token with a numerical ID. Moreover, it is necessary to have the maximum length of a sample.

In [None]:
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = df['Description'].map(len).max()
EMBEDDING_DIM = 300
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(df['Description'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print('Max len:', MAX_SEQUENCE_LENGTH)

Found 96388 unique tokens.
Max len: 3979


In [None]:
X_text_train = tokenizer.texts_to_sequences(df_train['Description'].values)
X_text_train = pad_sequences(X_text_train, maxlen=MAX_SEQUENCE_LENGTH) #include padding to make each instance of the same length
print('Shape of train tensor:', X_text_train.shape)

Shape of train tensor: (22623, 3979)


In [None]:
X_text_test = tokenizer.texts_to_sequences(df_test['Description'].values)
X_text_test = pad_sequences(X_text_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train tensor:', X_text_test.shape)

Shape of train tensor: (1000, 3979)


In [None]:
X_text_val = tokenizer.texts_to_sequences(df_valid['Description'].values)
X_text_val = pad_sequences(X_text_val, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of train tensor:', X_text_val.shape)

Shape of train tensor: (5656, 3979)


A simple RNN network to process text was made using GRU layers instead of regular RNN ones.

In [None]:
model = keras.models.Sequential([
keras.layers.Embedding(len(word_index), EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, input_shape=[None], trainable=False),
keras.layers.GRU(128), return_sequences=True)
keras.layers.Dropout(0.3),
keras.layers.GRU(128),
keras.layers.Dropout(0.5),
keras.layers.Dense(35, activation="softmax")
], name='my_RNN')
model.summary()

In [None]:
model.compile(loss="categorical_crossentropy", optimizer=keras.optimizers.Adam(),
metrics=["accuracy", tf.keras.metrics.TopKCategoricalAccuracy(5)])

In [None]:
CSV_log = tf.keras.callbacks.CSVLogger('/content/drive/MyDrive/Colab_Notebooks/podcast/RNN_log.csv', separator=",", append=True)
early_stopping_cb = keras.callbacks.EarlyStopping(patience=7,
restore_best_weights=True, monitor='val_loss')

model_checkpoint = keras.callbacks.ModelCheckpoint('/content/drive/MyDrive/Colab_Notebooks/podcast/my_RNN_checkpoint.h5', monitor='val_loss', save_best_only = True, verbose = 1)
history = model.fit(X_text_train, train_labels, validation_data=(X_text_val, val_labels), epochs=50, callbacks=[early_stopping_cb, model_checkpoint, CSV_log])

Epoch 1/50
Epoch 1: val_loss improved from inf to 3.21423, saving model to /content/drive/MyDrive/Colab_Notebooks/podcast/my_RNN_1layer:checkpoint.h5
Epoch 2/50
Epoch 2: val_loss improved from 3.21423 to 2.98522, saving model to /content/drive/MyDrive/Colab_Notebooks/podcast/my_RNN_1layer:checkpoint.h5
Epoch 3/50
Epoch 3: val_loss improved from 2.98522 to 2.78358, saving model to /content/drive/MyDrive/Colab_Notebooks/podcast/my_RNN_1layer:checkpoint.h5
Epoch 4/50
Epoch 4: val_loss improved from 2.78358 to 2.61388, saving model to /content/drive/MyDrive/Colab_Notebooks/podcast/my_RNN_1layer:checkpoint.h5
Epoch 5/50
Epoch 5: val_loss improved from 2.61388 to 2.51620, saving model to /content/drive/MyDrive/Colab_Notebooks/podcast/my_RNN_1layer:checkpoint.h5
Epoch 6/50
Epoch 6: val_loss improved from 2.51620 to 2.42910, saving model to /content/drive/MyDrive/Colab_Notebooks/podcast/my_RNN_1layer:checkpoint.h5
Epoch 7/50
Epoch 7: val_loss improved from 2.42910 to 2.37610, saving model to /