In [None]:
import numpy as np
import tensorflow as tf
# Print the TensorFlow version in use so the run can be reproduced.
print(tf.__version__)

import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

2.4.1


In [None]:
import pandas as pd

# Every line of the file is "ID ::: TITLE ::: GENRE ::: DESCRIPTION", so parse on
# that separator directly. `delimiter` is only an alias of `sep`; passing both
# (as this cell used to) is ambiguous and is rejected by newer pandas releases.
train_data = pd.read_csv(
    '/content/train_data.txt',
    sep=' ::: ',
    engine='python',        # multi-character separators require the Python parser
    header=None,
    names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'],
    dtype=str,              # keep every column as text, as the manual split did
    keep_default_na=False,  # never turn a literal field such as "NA" into NaN
)

In [None]:
# Only GENRE and DESCRIPTION are needed from here on.
train_data = train_data.drop(columns=['ID', 'TITLE'])

In [None]:
# Test lines are "ID ::: TITLE ::: DESCRIPTION" (no genre), so parse on that
# separator directly. `delimiter` is only an alias of `sep`; passing both
# (as this cell used to) is ambiguous and is rejected by newer pandas releases.
test_data = pd.read_csv(
    '/content/test_data.txt',
    sep=' ::: ',
    engine='python',        # multi-character separators require the Python parser
    header=None,
    names=['ID', 'TITLE', 'DESCRIPTION'],
    dtype=str,              # keep every column as text, as the manual split did
    keep_default_na=False,  # never turn a literal field such as "NA" into NaN
)

In [None]:
# Remove the identifier column, then show the resulting frame.
test_data = test_data.drop(columns=['ID'])
test_data

Unnamed: 0,TITLE,DESCRIPTION
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,Er nu zhai (1955),Before he was known internationally as a marti...
...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Dar..."
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their ...
54197,Oliver Twink (2007),"A movie 169 years in the making. Oliver Twist,..."
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard ..."


In [None]:
# First preprocessing variant: tokenize the raw descriptions without any cleaning.

from tensorflow.keras.preprocessing.text import Tokenizer
# Take the padding helpers from tf.keras as well, so this cell does not mix the
# standalone `keras` package with `tensorflow.keras`.
from tensorflow.keras.preprocessing import sequence

train2 = train_data['DESCRIPTION'].values

# Keep only the 10000 most frequent words when converting texts to ids.
tokenize = Tokenizer(num_words = 10000)
tokenize.fit_on_texts(train2)

# Convert the same texts the tokenizer was fit on, then pad/truncate to 150 ids.
sequences = tokenize.texts_to_sequences(train2)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=150)

In [None]:
# Second preprocessing variant: regex cleaning plus stop-word removal.

import re
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords



# Raw strings are used so that regex escapes (\[, \], \|, \d) are not parsed as
# (invalid) Python string escapes.

# Punctuation that is replaced by a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, a lowercase ASCII letter, space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Runs of digits. The previous pattern '[\d+]' was a character class, so it also
# deleted literal '+' characters, which BAD_SYMBOLS_RE is written to keep.
REMOVE_NUM = re.compile(r'\d+')
STOPWORDS = set(stopwords.words('english'))

# Whole tokens made only of repeated "x" (masking placeholders such as "xxxx").
_MASK_TOKEN_RE = re.compile(r'\bx{2,}\b')


def clean_text(text):
    """Normalize one description into a space-separated string of kept words.

    Steps: lowercase, replace separator punctuation with spaces, drop
    placeholder tokens made only of "x", strip what REMOVE_NUM matches, strip
    every character BAD_SYMBOLS_RE rejects, then drop English stop words and
    bare punctuation tokens.

    Args:
        text: The raw description. Non-string values (e.g. a missing cell) are
            treated as empty.

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # A '<START>' marker used to be prepended here, but every character of it
    # (uppercase letters and angle brackets) is stripped by BAD_SYMBOLS_RE, so
    # it never reached the output and has been dropped.

    text = REPLACE_BY_SPACE_RE.sub(' ', text)

    # Only remove standalone "xx..." placeholders. Deleting every 'x' character
    # corrupted ordinary words ("taxi" -> "tai", "six" -> "si").
    text = _MASK_TOKEN_RE.sub(' ', text)

    text = REMOVE_NUM.sub('', text)

    text = BAD_SYMBOLS_RE.sub('', text)

    kept_words = [
        word
        for word in text.split()
        if word not in STOPWORDS and word not in string.punctuation
    ]

    return ' '.join(kept_words)

# Clean every training description and keep the result both as a column and
# as a plain array for the tokenizer.
cleaned_descriptions = train_data['DESCRIPTION'].apply(clean_text)
train_data['TOKENS'] = cleaned_descriptions

train = train_data['TOKENS'].values

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Import from tf.keras so the preprocessing utilities come from the same Keras
# implementation as the tf.keras model built below.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary cap passed to the tokenizer as `num_words`.
VOCAB_SIZE = 20000
# Kept for cells that still use the old name; despite the name, this is the
# vocabulary size, not a sequence length.
MAX_LEN = VOCAB_SIZE
# Number of token ids kept per description after padding/truncation.
SEQ_LEN = 150

tokenizer = Tokenizer(num_words=VOCAB_SIZE, lower=True, oov_token='<OOV>')
tokenizer.fit_on_texts(train)
sequences = tokenizer.texts_to_sequences(train)
# Zero-pad at the end of short sequences.
x = pad_sequences(sequences, padding='post', maxlen=SEQ_LEN)

In [None]:
# The tokenizer was fit on clean_text() output, so test descriptions must go
# through the same cleaning first. Tokenizing the raw text maps stop words and
# other removed tokens to the <OOV> index and skews the test inputs.
test_data['sequences'] = tokenizer.texts_to_sequences(
    test_data['DESCRIPTION'].apply(clean_text)
)

In [None]:
# Display the test frame, which now includes the `sequences` column.
test_data

Unnamed: 0,TITLE,DESCRIPTION,sequences
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart...","[4657, 4349, 1, 535, 1, 2, 1, 223, 1, 544, 1, ..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi...","[1780, 1753, 1, 1, 1, 1, 1, 9168, 148, 1, 37, ..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family o...,"[3, 59, 1, 8702, 2, 1, 1, 1, 1, 11, 1, 12897, ..."
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his...","[1, 23, 1, 566, 1, 1, 4127, 1, 1, 112, 1, 1, 1..."
4,Er nu zhai (1955),Before he was known internationally as a marti...,"[1, 1, 1, 182, 4338, 1, 1, 2092, 1040, 3547, 3..."
...,...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Dar...","[3708, 1783, 5843, 1583, 1, 444, 230, 1, 1, 59..."
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their ...,"[1, 1602, 1, 8531, 1, 393, 1, 16, 1, 23, 1, 56..."
54197,Oliver Twink (2007),"A movie 169 years in the making. Oliver Twist,...","[1, 97, 1, 14, 1, 8702, 179, 3241, 2031, 8702,..."
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard ...","[451, 1, 224, 409, 1, 2807, 855, 1, 1, 1, 8799..."


In [None]:
from sklearn import preprocessing

# Map each genre string to an integer class id; `le` keeps the mapping.
le = preprocessing.LabelEncoder()
genre_ids = le.fit(train_data['GENRE']).transform(train_data['GENRE'])

# The GENRE column is overwritten with the integer ids.
train_data['GENRE'] = genre_ids
train_labels = tf.convert_to_tensor(train_data['GENRE'].values)

train_labels

<tf.Tensor: shape=(54214,), dtype=int64, numpy=array([ 8, 24,  1, ...,  7,  5, 12])>

In [None]:
EMB_SIZE = 32

# Bidirectional-LSTM text classifier, assembled one layer at a time.
model = tf.keras.Sequential()

# Token ids (vocabulary of 20000, 150 ids per sample) -> EMB_SIZE-dim vectors.
model.add(tf.keras.layers.Embedding(20000, EMB_SIZE, input_length=150))

model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)))

model.add(tf.keras.layers.Dropout(0.5))

model.add(tf.keras.layers.Dense(64, activation=tf.nn.relu))

model.add(tf.keras.layers.Dropout(0.5))

# 27 output classes. Author's note: switch to sigmoid when using one-hot labels.
model.add(tf.keras.layers.Dense(27, activation=tf.nn.softmax))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 32)           640000    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               164864    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 27)                1755      
Total params: 823,067
Trainable params: 823,067
Non-trainable params: 0
__________________________________________________

In [None]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=1e-07),
    # Labels are integer class ids, hence the sparse loss.
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Hold out the first VAL_SIZE samples for validation; train on the rest.
VAL_SIZE = 8000

x_val = x[:VAL_SIZE]
partial_x_train = x[VAL_SIZE:]
y_val = train_labels[:VAL_SIZE]
partial_y_train = train_labels[VAL_SIZE:]

In [None]:
BATCH_SIZE = 512
NUM_EPOCHS = 20


# Train on the partial training split and evaluate on the held-out slice after
# every epoch. A TensorBoard callback was previously passed here and is disabled.
history = model.fit(
    x=partial_x_train,
    y=partial_y_train,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    verbose=1,
    validation_data=(x_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
