In [1]:
# RUN

import numpy as np
import pandas as pd
df = pd.read_csv('news.csv')
df = df.drop(df.columns[0],axis=1)
df['news'] = df['title'] + ' ' + df['text']
convert_to_binary = {'REAL':1,'FAKE':0}
df['label'] = df['label'].map(convert_to_binary)
df = df.drop([df.columns[0],df.columns[1]],axis=1)
df = df.reindex(columns=['news','label'])

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

# Creating a list of stopwords (common words which are not important to the overall meaning of the article).
stop_words = stopwords.words('english')
stop_words.extend(['the','it','in'])

# Lemmatization is the process of reducing words to their basic form, e.g. dogs->dog, running->run.
WNL = WordNetLemmatizer()

# We now iterate through each article in the dataset.
for index, row in df.iterrows():
    filtered_article = ''
    article = row['news']
    # Removing all punctuation using RegEx.
    article = re.sub(r'[^\w\s]', '', article)
    # Tokenization: changing the article from one continuous string to a list of tokens (words).
    words = [word.lower() for word in nltk.word_tokenize(article)]
    # Removing stopwords.
    words = [word for word in words if not word in stop_words]
    # Lemmatization and concatenating the words back into a single string.
    words_lemmatized = []
    for word in words:
        if word == 'us':
            words_lemmatized.append(word)
        else:
            words_lemmatized.append(WNL.lemmatize(word))
    filtered_article = " ".join([word for word in words_lemmatized])
    
    # Replace the article with the new filtered version.
    df.loc[index, 'news'] = filtered_article
    
df_input = df['news']
df_output = df['label']

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(df_input)

x = tf_idf_matrix.toarray()
labels = df_output.values

In [2]:
#RUN

# installing and importanting neccessary modules (mainly Keras)

!pip install keras
!pip install tensorflow
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

from keras import Model



In [3]:
# RUN

# setting random seed (for consistent results & choosing operating system)

from numpy.random import seed
from tensorflow.random import set_seed
seed(0)
set_seed(0)
import os
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [4]:
# RUN

# 'hot encoding' the data (go from binary 0 or 1 to [1,0] and [0,1] respectively)

from keras.utils import to_categorical
print("example point's binary outcome:", labels[100])
labels = to_categorical(labels)
print("same example point's one hot encoded vector:", labels[100])

example point's binary outcome: 1
same example point's one hot encoded vector: [0. 1.]


In [5]:
# RUN

# splitting the data into a training data set (60%), a validation data set (20%) and a test data set (20%) 

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, labels, test_size=0.2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)

In [6]:
# RUN

# scaling the training and validation data features. Removes mean and makes the datas variance = 1. 
# learns the correct transformation on the training data and then applies to the test data.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)

x_val = scaler.transform(x_val)
x_test = scaler.transform(x_test)

In [7]:
# RUN

# importing things we need to build a neural network
from keras.models import Sequential # going to use a sequential neural network
from keras.layers import Dense # type of layer
from keras.optimizers import SGD # optimisation technique (stochastic gradient descent) used to update the weights

In [8]:
# RUN

import numpy as np

from hyperopt import Trials, STATUS_OK, tpe
from keras.datasets import mnist
from keras.layers.core import Dense, Dropout, Activation
from keras.models import Sequential
from keras.utils import np_utils

from hyperas import optim
from hyperas.distributions import choice, uniform

In [9]:
# RUN

def create_model(x_train, y_train, x_test, y_test):
    """
    Model providing function:
    
    Create Keras model with double curly brackets dropped-in as needed.
    Return value has to be a valid python dictionary with two customary keys:
        - loss: Specify a numeric evaluation metric to be minimized
        - status: Just use STATUS_OK and see hyperopt documentation if not feasible
    The last one is optional, though recommended, namely:
        - model: specify the model just created so that we can later use it again.
    """
    model = Sequential()
    model.add(Dense({{choice([32, 64, 128,256])}}, input_dim=80967, activation='relu'))
    model.add(Dense({{choice([32, 64, 128,256])}}, activation='relu'))
    model.add(Dense({{choice([16, 32, 64,128])}}, activation='relu'))#,name='encoded')
    model.add(Dense({{choice([8, 16, 32,64])}}, activation='relu'))
    model.add(Dense({{choice([16, 32, 8])}}, activation='relu'))
    model.add(Dense({{choice([32, 64, 16,8])}}, activation='relu'))
    if {{choice(['five', 'seven'])}} == 'five':
        model.add(Dense(80967))
    else:
        model.add(Dense({{choice([32,64])}}, activation='relu'))
        model.add(Dense({{choice([32,64])}}, activation='relu'))
        model.add(Dense(80967))
        # We can also choose between complete sets of layers

    model.compile(loss='mse', metrics=['mse'],
                  optimizer={{choice(['adam', 'sgd'])}}(learning_rate={{uniform(0, 1)}}))

    result = model.fit(x_train, y_train,
              batch_size={{choice([3000, 4500, 4751])}},
              epochs=50,
              validation_split=0.2)
    #get the highest validation accuracy of the training epochs
    validation_mse = np.amax(result.history['val_mse']) 
    print('Best validation acc of epoch:', validation_acc)
    return {'loss': validation_mse, 'status': STATUS_OK, 'model': model}

In [13]:
def data2():
    return x_train, y_train, x_val, y_val

best_run, best_model = optim.minimize(model=create_model,
                                          data=data2,
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials(),
                                     notebook_name ='Autoencoder Notebook')

>>> Imports:
#coding=utf-8

try:
    import numpy as np
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    import nltk
except:
    pass

try:
    from nltk.tokenize import word_tokenize
except:
    pass

try:
    from nltk.corpus import stopwords
except:
    pass

try:
    from nltk.stem import WordNetLemmatizer
except:
    pass

try:
    import re
except:
    pass

try:
    from sklearn.feature_extraction.text import TfidfVectorizer
except:
    pass

try:
    import tensorflow as tf
except:
    pass

try:
    import numpy as np
except:
    pass

try:
    import matplotlib.pyplot as plt
except:
    pass

try:
    from keras import Model
except:
    pass

try:
    from numpy.random import seed
except:
    pass

try:
    from tensorflow.random import set_seed
except:
    pass

try:
    import os
except:
    pass

try:
    from keras.utils import to_categorical
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
   

job exception: 'str' object is not callable



  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]


TypeError: 'str' object is not callable