# 1) Import Packages

As usual we start loading the packages that we will use in our notebook

In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder 


In [2]:
#PRINT VERSION!!
tf.__version__

'2.2.0'

## 2) Import Dataset

In [3]:
!wget -O train.csv https://github.com/PacktPublishing/Advanced-NLP-Projects-with-TensorFlow-2.0/blob/master/section_4_notebooks/train.csv?raw=true

--2020-06-01 11:53:42--  https://github.com/PacktPublishing/Advanced-NLP-Projects-with-TensorFlow-2.0/blob/master/section_4_notebooks/train.csv?raw=true
Resolving github.com (github.com)... 140.82.118.3
Connecting to github.com (github.com)|140.82.118.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/PacktPublishing/Advanced-NLP-Projects-with-TensorFlow-2.0/raw/master/section_4_notebooks/train.csv [following]
--2020-06-01 11:53:42--  https://github.com/PacktPublishing/Advanced-NLP-Projects-with-TensorFlow-2.0/raw/master/section_4_notebooks/train.csv
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://media.githubusercontent.com/media/PacktPublishing/Advanced-NLP-Projects-with-TensorFlow-2.0/master/section_4_notebooks/train.csv [following]
--2020-06-01 11:53:43--  https://media.githubusercontent.com/media/PacktPublishing/Advanced-NLP-Projects-with-TensorFlow-2.0/master/sectio

## 3) Explore Dataset

In [4]:
train_df = pd.read_csv("train.csv")#here we have the dataset we extracted
print("Dataset length: ", len(train_df))
train_df.head(n=10)

Dataset length:  20800


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


In [0]:
Y = train_df['label']

# 4) Preprocessing Dataset

In [6]:
len(train_df['author'].unique()) #we print the length, not a big one but sufficient

4202

In [0]:
train_df['title_lower'] = train_df["title"].str.lower()
train_df['title_no_punctuation'] = train_df['title_lower'].str.replace('[^\w\s]','')
train_df['title_no_punctuation'] = train_df["title_no_punctuation"].fillna("fillna")

In [0]:
train_df['text_lower'] = train_df["text"].str.lower()
train_df['text_no_punctuation'] = train_df['text_lower'].str.replace('[^\w\s]','')
train_df['text_no_punctuation'] = train_df["text_no_punctuation"].fillna("fillna")

In [0]:
train_df['author_lower'] = train_df["author"].str.lower()
train_df['author_no_spaces'] = train_df['author_lower'].str.replace(' ','_')


In [10]:
train_df['author_no_spaces'].head() #in this way we can treat each author as a word.

0         darrell_lucus
1       daniel_j._flynn
2    consortiumnews.com
3       jessica_purkiss
4        howard_portnoy
Name: author_no_spaces, dtype: object

In [0]:
max_features=5000 #we set maximum number of words to 5000
maxlen=400 #we set maximum sequence length to 400

In [0]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features) #again tokenizer step

In [0]:
tok.fit_on_texts(list(train_df['text_no_punctuation'])+list(train_df['title_no_punctuation'])+list(train_df['author_no_spaces'].astype(str))) #fit to cleaned text


In [14]:
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1 
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

216068


# 5) Prepeare Sequences

In [0]:
text_df = tok.texts_to_sequences(list(train_df['text_no_punctuation'])) #this is how we create sequences
text_df = tf.keras.preprocessing.sequence.pad_sequences(text_df, maxlen=maxlen) #let's execute pad step

In [0]:
title_df = tok.texts_to_sequences(list(train_df['title_no_punctuation'])) #this is how we create sequences
title_df = tf.keras.preprocessing.sequence.pad_sequences(title_df, maxlen=maxlen)

In [0]:
author_df = tok.texts_to_sequences(list(train_df['author_no_spaces'].astype(str))) #this is how we create sequences
author_df = tf.keras.preprocessing.sequence.pad_sequences(author_df, maxlen=maxlen)

In [0]:
train_df = author_df #title_df + text_df  

In [0]:
from sklearn.model_selection import train_test_split #divide into train and test set

In [0]:
X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=42)

In [0]:
embedding_dim = 50 #this is the final dimension of the embedding space.


# 6) Model Training

In [0]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size, #embedding input
                           output_dim=embedding_dim,#embedding output
                           input_length=maxlen), #maximum length of an input sequence
  tf.keras.layers.Flatten(), #flatten layer

  tf.keras.layers.Dense(1, activation=tf.nn.sigmoid) #no more softmax

])

In [0]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',#no more categorical_crossentropy
              metrics=['accuracy'])



In [24]:
model.summary() #here we show the architecture 

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 50)           10803450  
_________________________________________________________________
flatten (Flatten)            (None, 20000)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 20001     
Total params: 10,823,451
Trainable params: 10,823,451
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.fit(np.array(X_train), np.array(y_train), epochs=1) #let's fit the model



<tensorflow.python.keras.callbacks.History at 0x7ff2f69e8a90>

# 7) Model evaluation

In [26]:
#results text, title and author [0.1477214220767984, 0.9447115384615384]
#results text, title [0.13193302869510193, 0.9461538461538461]
#results author [0.3442320129046073, 0.8211538461538461]
model.evaluate(np.array(X_test), np.array(y_test)) 



[0.3674405813217163, 0.8125]