In [1]:
import pandas as pd
data_30_sentences = pd.read_csv(r'C:\Users\erich\Desktop\DS_project\data\cleaned_data.csv')

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

lemmatizer = WordNetLemmatizer()

def data_binary(df):
    
    df = df[df['HTML_Content'].isin(['Teen And Up Audiences', 'Mature'])]
    
    df = df.dropna(subset=['TXT_Content'])

    def clean_text(text):

        if not isinstance(text, str):
            text = str(text)

        text = re.sub('[^a-zA-Z]', ' ', text)
        text = text.lower()
        tokenized_text = word_tokenize(text)
        cleaned_text = [lemmatizer.lemmatize(word) for word in tokenized_text if word not in set(stopwords.words('english'))]

        return ' '.join(cleaned_text)

    df['TXT_Content'] = df['TXT_Content'].apply(clean_text)

    return df

data_30_sentences = data_binary(data_30_sentences)
print(data_30_sentences)



               HTML_Content                                        TXT_Content
0     Teen And Up Audiences  huckleberry finn stood front old wooden door m...
1     Teen And Up Audiences  right huck trying figure every sort way could ...
3     Teen And Up Audiences  air hot brow wet mind exhausted st petersburg ...
5     Teen And Up Audiences  rope burned skin rubbing flesh raw shoved push...
6     Teen And Up Audiences  tom sawyer liked consider hopeless romantic ot...
...                     ...                                                ...
5797                 Mature  sorry late robin blurted soon within earshot c...
5803  Teen And Up Audiences  seen news pat asked robin chance hang coat mor...
5804  Teen And Up Audiences  free next friday night murphy smiled phone rob...
5805                 Mature  turned walked away wan na say come baby give w...
5807                 Mature  heaven fact unlike earth pro con pro include t...

[2953 rows x 2 columns]


In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, TimeDistributed, Flatten, Attention
from keras.models import Sequential
from keras.metrics import Precision, Recall
from keras.callbacks import EarlyStopping
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import time


# Pre-processing
max_length = 300
embedding_dim = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data_30_sentences['TXT_Content'])
sequences = tokenizer.texts_to_sequences(data_30_sentences['TXT_Content'])
word_index = tokenizer.word_index

data = pad_sequences(sequences, maxlen=max_length)

labels = to_categorical(np.asarray(LabelEncoder().fit_transform(data_30_sentences['HTML_Content'])))
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# Word Embedding
glove_input_file = 'C:\\Users\\erich\\Desktop\\DS_project\\code\\glove.6B\\glove.6B.100d.txt'
word2vec_output_file = 'C:\\Users\\erich\\Desktop\\DS_project\\code\\glove.6B\\glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

word2vec = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in word2vec:
        embedding_matrix[i] = word2vec[word] 

  glove2word2vec(glove_input_file, word2vec_output_file)


In [4]:
model = Sequential()
model.add(Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix], 
                    input_length=max_length, trainable=False))
model.add(LSTM(256, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(labels[0]), activation='softmax'))

model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy', Precision(), Recall()])

early_stopping = EarlyStopping(monitor='val_loss', patience=10)

history_3 = model.fit(X_train, y_train, 
                    batch_size=128, epochs=3, 
                    validation_split=0.2, 
                    callbacks=[early_stopping])

model.reset_states()

history_10 = model.fit(X_train, y_train, 
                    batch_size=128, epochs=10, 
                    validation_split=0.2, 
                    callbacks=[early_stopping])

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
