<a href="https://colab.research.google.com/github/Briber162/MedAppUI/blob/master/fake_news_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:

# Load the training set; the relative path is resolved against the working directory.
train_data = pd.read_csv("train.csv")

In [None]:
# Discard every row that has a missing value in any column.
train_data = train_data.dropna()


In [None]:
# Hyper-parameters shared by the preprocessing step and the model definition.
sentence_length = 5000  # size of the one_hot() hashing space; also the Embedding input dimension
padding_length = 1000   # every encoded title is padded/truncated to this many positions
dimension = 100         # width of each embedding vector

# Stemmer instance used by pre_process_data.
ps = PorterStemmer()

In [None]:
def pre_process_data(data):
    """Turn the ``title`` column of ``data`` into padded integer sequences.

    Each title is lower-cased, every non-letter character is replaced by a
    space, the text is split into words, English stop words are removed and
    the remaining words are stemmed with ``ps``.  The cleaned titles are then
    hashed word by word with ``one_hot`` (hash space ``sentence_length``) and
    padded at the front to ``padding_length`` positions.

    Args:
        data: DataFrame with a ``title`` column.

    Returns:
        The padded sequences produced by ``pad_sequences``, one row per title.
        A title that is not a string (e.g. NaN) is treated as empty text.
    """
    # stopwords.words() returns a list; build a set once for fast membership tests
    # instead of re-evaluating it for every token.
    stop_words = set(stopwords.words("english"))

    def clean_title(title):
        if not isinstance(title, str):
            return ""
        # Split into words first: iterating the cleaned string directly would walk
        # it character by character, so stemming and stop-word removal would be
        # applied to single letters instead of words.
        words = re.sub("[^A-Za-z]", " ", title.lower()).split()
        return " ".join(ps.stem(word) for word in words if word not in stop_words)

    sentence_list = [clean_title(title) for title in data['title'].values]
    encoded_sentences = [one_hot(sentence, sentence_length) for sentence in sentence_list]
    padded_sentences = pad_sequences(encoded_sentences, maxlen=padding_length, padding='pre')
    return padded_sentences

In [None]:
# Encode the training titles into padded integer sequences.
sen_list = pre_process_data(train_data)

In [None]:
import tensorflow as tf
# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# Open a session through the TF1 compatibility API with device-placement logging on.
# (TF1 spelling: tf.Session(config=tf.ConfigProto(log_device_placement=True)))
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

Num GPUs Available:  1
Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1



In [None]:
# Binary classifier: embedding -> single LSTM layer -> one sigmoid output unit.
model = Sequential()
model.add(Embedding(sentence_length, dimension, input_length=padding_length))
model.add(LSTM(500))
model.add(Dense(1, activation='sigmoid'))
# `metrics` is documented as a list of metrics; newer Keras releases reject a bare string.
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 100)         500000    
                                                                 
 lstm (LSTM)                 (None, 500)               1202000   
                                                                 
 dense (Dense)               (None, 1)                 501       
                                                                 
Total params: 1,702,501
Trainable params: 1,702,501
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Features: the padded title sequences. Targets: the `label` column of the training frame.
x_input = np.array(sen_list, copy=True)
y_input = train_data['label'].to_numpy(copy=True)

In [None]:
# Hold out 20% of the rows (fixed seed) and use them as validation data while training.
x_train, x_test, y_train, y_test = train_test_split(
    x_input,
    y_input,
    test_size=0.2,
    random_state=1,
)
model.fit(
    x=x_train,
    y=y_train,
    batch_size=10,
    epochs=1,
    validation_data=(x_test, y_test),
)



<keras.callbacks.History at 0x1b97576d9c0>

In [None]:
# NOTE(review): this reads "train.csv", so the predictions made from `test_data` are
# computed on training rows — confirm whether a separate test file was intended.
test_data = pd.read_csv("train.csv")
# Keep only the first 100 rows (everything from position 100 onwards is dropped),
# then forward-fill missing values. DataFrame.ffill() replaces the deprecated
# fillna(method='pad') spelling.
test_data = test_data.iloc[:100].ffill()

In [None]:
# Encode the titles the same way as the training data, then score them with the model.
test_sentence_list = pre_process_data(test_data)
test_input = np.array(test_sentence_list)
test_output = model.predict(x=test_input)

In [None]:
# Show only the first few predicted probabilities instead of dumping the whole array.
print(test_output[:10])

[[8.46832395e-01]
 [1.08811795e-03]
 [9.45790470e-01]
 [9.52041149e-01]
 [6.79750502e-01]
 [9.27105611e-06]
 [7.69384205e-01]
 [5.33690560e-04]
 [6.02597793e-05]
 [5.02163675e-05]
 [1.83746014e-02]
 [6.06403887e-01]
 [9.42068219e-01]
 [8.57354999e-01]
 [9.24624920e-01]
 [1.15157291e-03]
 [1.30058601e-04]
 [9.64716554e-01]
 [9.79840875e-01]
 [5.61037916e-04]
 [6.02863133e-01]
 [8.38382065e-01]
 [4.36508300e-04]
 [8.70480776e-01]
 [1.59181247e-03]
 [9.06828523e-01]
 [4.15521208e-06]
 [9.48797822e-01]
 [8.39914719e-05]
 [3.52835259e-03]
 [8.50486613e-05]
 [9.23268199e-01]
 [1.28578016e-04]
 [8.94596917e-04]
 [1.91844272e-04]
 [5.99638770e-05]
 [9.52185750e-01]
 [9.63452101e-01]
 [1.35075548e-04]
 [6.64649648e-04]
 [8.51383782e-04]
 [9.68668938e-01]
 [3.93202342e-03]
 [9.53582525e-01]
 [3.46049492e-04]
 [6.96874165e-04]
 [9.57347572e-01]
 [9.76482630e-01]
 [7.91447601e-05]
 [1.46234961e-04]
 [9.56421554e-01]
 [9.35208380e-01]
 [1.13243135e-04]
 [1.13243135e-04]
 [9.72506285e-01]
 [9.712680

In [None]:
# Threshold the sigmoid scores at 0.5 to obtain hard 0/1 labels, one per id.
predicted_labels = (test_output.ravel() > 0.5).astype('int')
submission_df = pd.DataFrame({'id': test_data['id'], 'label': predicted_labels})

In [None]:
# Write the predictions to submission.csv without the DataFrame index column.
submission_df.to_csv("submission.csv", index=False)

In [None]:
# Display the submission table as the cell output.
submission_df

Unnamed: 0,id,label
0,0,1
1,1,0
2,2,1
3,3,1
4,4,1
...,...,...
95,95,1
96,96,1
97,97,0
98,98,0


In [None]:
import os
import pickle

# Save the trained model with pickle.
# NOTE(review): a pickle file can execute arbitrary code when loaded, so only load
# this file from a trusted source; consider Keras' own model.save() as an alternative.
model_path = 'trainedModel/mmodel.sav'
# Create the target folder if needed; open() fails when the directory does not exist.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# The context manager closes the file handle even if pickling raises.
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)



INFO:tensorflow:Assets written to: ram://2b5959db-d259-40f4-963c-4dea153068d5/assets


INFO:tensorflow:Assets written to: ram://2b5959db-d259-40f4-963c-4dea153068d5/assets
