In [30]:
import re
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from nltk.corpus import stopwords

epochs = 10
batch_size = 32
embedding_dim = 50
prediction_file = 'prediction.csv'

In [18]:
# Загрузка данных
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data.drop(['id', 'keyword', 'location'], axis= 1)
test_data.drop(['id', 'keyword', 'location'], axis= 1)

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan
...,...
3258,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,Storm in RI worse than last hurricane. My city...
3260,Green Line derailment in Chicago http://t.co/U...
3261,MEG issues Hazardous Weather Outlook (HWO) htt...


In [19]:
# Предобработка данных
def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Удаление ссылок
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Удаление упоминаний
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)  # Удаление специальных символов
    return text

train_data['text'] = train_data['text'].apply(lambda x: clean_text(x))
test_data['text'] = test_data['text'].apply(lambda x: clean_text(x))

In [20]:
# Удаление стоп-слов
stop_words = set(stopwords.words('english'))
for i, row in train_data.iterrows():
    text = row['text']
    text = ' '.join([word for word in text.split() if word not in stop_words])
    train_data.at[i, 'text'] = text

for i, row in test_data.iterrows():
    text = row['text']
    text = ' '.join([word for word in text.split() if word not in stop_words])
    test_data.at[i, 'text'] = text

In [21]:
# Создание токенизатора
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
vocab_size = len(tokenizer.word_index) + 1

In [22]:
# Преобразование текста в последовательности чисел
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])
target = train_data['target'].values
test_ids = test_data['id'].values

In [23]:
# Выравнивание полседовательностей к одной длине
max_length = 50  # Максимальная длина последовательности
train_data = pad_sequences(train_sequences, maxlen=max_length)
test_data = pad_sequences(test_sequences, maxlen=max_length)

In [26]:
# Создание модели
model = Sequential() 
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
# Обучение модели
model.fit(train_data, target, batch_size=batch_size, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x152789a2710>

In [28]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 50)            746450    
                                                                 
 bidirectional_12 (Bidirecti  (None, 50, 128)          58880     
 onal)                                                           
                                                                 
 bidirectional_13 (Bidirecti  (None, 128)              98816     
 onal)                                                           
                                                                 
 dense_5 (Dense)             (None, 1)                 129       
                                                                 
Total params: 904,275
Trainable params: 904,275
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Предсказание на тестовых данных и сохранение результатов
predictions = model.predict(test_data)
submission = pd.DataFrame()
submission['id'] = test_ids
submission['target'] = predictions.round().astype(int).reshape(3263)
submission.to_csv(prediction_file, index=False)

