In [4]:
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [5]:
df_pos = pd.read_csv(open('clean_positive_train.csv','r'), encoding='utf-8', engine='c')
df_neg = pd.read_csv(open('clean_negative_train.csv','r'), encoding='utf-8', engine='c')

df_pos['text'] = df_pos['text'].astype(str)
df_pos['parent_text'] = df_pos['parent_text'].astype(str)

df_neg['text'] = df_neg['text'].astype(str)
df_neg['parent_text'] = df_neg['parent_text'].astype(str)

In [6]:
df_pos.describe()

Unnamed: 0,score,ups,controversiality,parent_score,parent_ups,parent_controversiality
count,99999.0,99999.0,99999.0,99999.0,99999.0,99999.0
mean,1.0,198.155082,2e-05,0.99077,368.97307,0.00064
std,0.0,256.334734,0.004472,0.095629,535.679712,0.02529
min,1.0,66.0,0.0,0.0,-8907.0,0.0
25%,1.0,83.0,0.0,1.0,84.0,0.0
50%,1.0,116.0,0.0,1.0,184.0,0.0
75%,1.0,200.0,0.0,1.0,417.0,0.0
max,1.0,5488.0,1.0,1.0,9531.0,1.0


In [7]:
df_neg.describe()

Unnamed: 0,score,ups,controversiality,parent_score,parent_ups,parent_controversiality
count,99998.0,99998.0,99998.0,99998.0,99998.0,99998.0
mean,0.0,-14.583632,0.00149,0.912518,67.440239,0.00272
std,0.0,15.649932,0.038572,0.282541,219.047635,0.052083
min,0.0,-1077.0,0.0,0.0,-1622.0,0.0
25%,0.0,-15.0,0.0,1.0,6.0,0.0
50%,0.0,-10.0,0.0,1.0,15.0,0.0
75%,0.0,-8.0,0.0,1.0,44.0,0.0
max,0.0,-6.0,1.0,1.0,14776.0,1.0


In [8]:
df = pd.concat([df_pos, df_neg])
df = df.sample(frac=1).reset_index(drop=True)
df.dropna(axis=0, inplace=True)
df['combined'] = df[['text', 'parent_text']].apply(lambda x: ' '.join(x), axis=1)

text_data = df['combined']
text_score = df['score']
X_train, X_test, y_train, y_test = train_test_split(text_data,text_score, test_size = 0.20, random_state = 42)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

X_train: (159997,)
X_test: (40000,)
y_train: (159997,)
y_test: (40000,)


In [9]:
# Токенизируем текст
tokenizer = Tokenizer(num_words=10000, lower=True, split=' ', document_count=0)
tokenizer.fit_on_texts(text_data)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [10]:
#каждое предложение имеет разную длину, и мы хотим каждый раз передавать один и тот же вектор длины в нашу нейронную сеть,
#мы дополняем их, добавляя нули в конце каждой последовательности, поэтому каждое из них имеет длину 128 целых чисел.
train_data = keras.preprocessing.sequence.pad_sequences(X_train_seq,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=128)

test_data = keras.preprocessing.sequence.pad_sequences(X_test_seq,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=128)

In [11]:
print(len(train_data[0]), len(train_data[1]))

128 128


In [12]:
#делаем нейросеть
vocab_size = len(tokenizer.word_index) + 1

model = tf.keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_size, 4))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(2, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

W0404 21:50:08.030283 139873078146880 deprecation.py:506] From /home/evgenia/CourseraEnv/env/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0404 21:50:08.056196 139873078146880 deprecation.py:506] From /home/evgenia/CourseraEnv/env/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 4)           868984    
_________________________________________________________________
global_average_pooling1d (Gl (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 2)                 10        
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 3         
Total params: 868,997
Trainable params: 868,997
Non-trainable params: 0
_________________________________________________________________


In [13]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [14]:
history = model.fit(train_data,
                    y_train,
                    epochs=40,
                    batch_size=512,
                    validation_data=(test_data, y_test),
                    verbose=1)

Train on 159997 samples, validate on 40000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [16]:
from sklearn.metrics import mean_squared_error
y_pred = np.full(y_test.shape, y_train.mean())
mean_squared_error(y_test, y_pred)

0.2500020158329875