***
<h1 id="heading"><center><span style="background-color:#5642C5; color:white ; display:fill;border-radius:5px; font-family:cursive"> 4. Deep Learning modeling ⚙️🛠️ </span></center><a class="anchor-link"></a></h1>
<p><center style="color:#159364; font-family:cursive;">The best way to make yourself extremely valuable in a team is to understand everything, but being a master of something.</center></p>

***


In [1]:
TRAIN_PATH = "../data/processed/1_train_preprocessed_df.pkl"
TEST_PATH = "../data/processed/1_test_preprocessed_df.pkl"


MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "fake_news_predection"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

***
<h3 id="heading"><span style="background-color:#cefffb; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 📥 Import packages & data </span></h3>

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
from pathlib import Path
# Load Mlflow experiment tracker
# if MlFlow not working try 
# pip install pydantic==1.10.9
import mlflow
from mlflow.tracking import MlflowClient

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Load metrics
from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report

# Load ml models
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Dense,LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot


from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings("ignore")

2023-09-05 11:35:22.790807: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-05 11:35:23.407549: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-05 11:35:23.410135: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def calculate_quality(ground_truth, predictions, metric_function):
    quality_score = round(metric_function(ground_truth, predictions) * 100, 2)
    return quality_score

***
<h3 id="heading"><span style="background-color:#cefffb; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🗄️ Initialize </span></h3>

<h4 id="heading"><span font-family:Georgia"><Strong>📑 Create directories </Strong></span></h4>


In [4]:
# Create directories if they don't exist
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

<h4 id="heading"><span font-family:Georgia"><Strong>📑 Read data </Strong></span></h4>


In [5]:
# Read Data
train_df = pd.read_pickle(TRAIN_PATH)
test_df = pd.read_pickle(TEST_PATH)

In [10]:
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train , train_size=0.8, random_state=0)

***
<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🏋️‍♂️ Train models </span></h2>

In [11]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = mlflow.MlflowClient()
mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
exp = mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

In [12]:
client.get_experiment_by_name

<bound method MlflowClient.get_experiment_by_name of <mlflow.tracking.client.MlflowClient object at 0x7f229474d650>>

***
<h3 id="heading"><span style="background-color:#FCF3AD; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🧠 1. LSTM </span></h3>

***
<h4 id="heading"><span style="background-color:#F8D7FB; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🗃️ training </span></h4>

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

# Prepare the data
tokenizer = Tokenizer(num_words=5000)  # You can adjust the vocabulary size
tokenizer.fit_on_texts(train_df['content'])
X = tokenizer.texts_to_sequences(train_df['content'])
X = pad_sequences(X, maxlen=100)  # Adjust maxlen as needed

Y = train_df['label'].values

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8, random_state=0)

# Create the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=5, batch_size=64)

# Test the model on the test set
test_sequences = tokenizer.texts_to_sequences(test_df['content'])
X_test = pad_sequences(test_sequences, maxlen=100)

predictions = model.predict(X_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
class_labels = (predictions > 0.5).astype(int)

In [13]:
# Evaluate on validation set
validation_scores = {score.__name__: calculate_quality(test_df['label'], class_labels, score) 
                     for score in [accuracy_score, precision_score, recall_score, f1_score]}

# Create a DataFrame from the dictionary
test_scores_df = pd.DataFrame.from_dict(validation_scores, orient='index', columns=['Validation Score']).T
print('The confusion matrix is:\n ',confusion_matrix(test_df['label'], class_labels))
# Display the DataFrame
test_scores_df

The confusion matrix is:
  [[1414  925]
 [1086 1775]]


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Validation Score,61.33,65.74,62.04,63.84


***
<h4 id="heading"><span style="background-color:#F8D7FB; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🗃️ Log run </span></h4>

<h5 id="heading"><span font-family:Georgia"><Strong>📑 1. Prepare </Strong></span></h5>


In [14]:
# Data details
data_details = {"data_train_path": TRAIN_PATH,
                "data_test_path": TEST_PATH,
                "training_victorize": X_train,
                "test_victorize":     X_test
               }

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [15]:
# Model
model = {"model_description": "tensorFlow LSTM",
         "model_details": str(model),
         "model_object": model} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [16]:
# Hyperparametres
param = {"num_words": 5000,
         "Tokenizer" :'tf.Tokenizer()',
         "loss": 'binary_crossentropy',
         "optimizer": 'adam'
               }

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(param, output_file)

In [17]:
# Performance details
classes_metrics = {"test_scores":  test_scores_df}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

<h5 id="heading"><span font-family:Georgia"><Strong>📑 2. Log </Strong></span></h5>


In [21]:
# Start a new run and track 
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Log pickles 
    mlflow.log_artifacts(LOG_PATH)
    
    # Log parametres 
    mlflow.log_param(LOG_PATH)
    
    # Track metrics 
    for metric, score in test_scores_df.items():
        mlflow.log_metric(metric, score) 
    

***
<h3 id="heading"><span style="background-color:#FCF3AD; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🧠 2. expRNN </span></h3>

***
<h4 id="heading"><span style="background-color:#F8D7FB; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🗃️ training </span></h4>

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

# Prepare the data
tokenizer = Tokenizer(num_words=5000)  # You can adjust the vocabulary size
tokenizer.fit_on_texts(train_df['content'])
X = tokenizer.texts_to_sequences(train_df['content'])
X = pad_sequences(X, maxlen=100)  # Adjust maxlen as needed

Y = train_df['label'].values

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8, random_state=0)

# Create the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=5, batch_size=64)

# Test the model on the test set
test_sequences = tokenizer.texts_to_sequences(test_df['content'])
X_test = pad_sequences(test_sequences, maxlen=100)

predictions = model.predict(X_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
class_labels = (predictions > 0.5).astype(int)

In [13]:
# Evaluate on validation set
validation_scores = {score.__name__: calculate_quality(test_df['label'], class_labels, score) 
                     for score in [accuracy_score, precision_score, recall_score, f1_score]}

# Create a DataFrame from the dictionary
test_scores_df = pd.DataFrame.from_dict(validation_scores, orient='index', columns=['Validation Score']).T
print('The confusion matrix is:\n ',confusion_matrix(test_df['label'], class_labels))
# Display the DataFrame
test_scores_df

The confusion matrix is:
  [[1414  925]
 [1086 1775]]


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Validation Score,61.33,65.74,62.04,63.84


***
<h4 id="heading"><span style="background-color:#F8D7FB; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🗃️ Log run </span></h4>

<h5 id="heading"><span font-family:Georgia"><Strong>📑 1. Prepare </Strong></span></h5>


In [14]:
# Data details
data_details = {"data_train_path": TRAIN_PATH,
                "data_test_path": TEST_PATH,
                "training_victorize": X_train,
                "test_victorize":     X_test
               }

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [15]:
# Model
model = {"model_description": "tensorFlow LSTM",
         "model_details": str(model),
         "model_object": model} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [16]:
# Hyperparametres
param = {"num_words": 5000,
         "Tokenizer" :'tf.Tokenizer()',
         "loss": 'binary_crossentropy',
         "optimizer": 'adam'
               }

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(param, output_file)

In [17]:
# Performance details
classes_metrics = {"test_scores":  test_scores_df}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

<h5 id="heading"><span font-family:Georgia"><Strong>📑 2. Log </Strong></span></h5>


In [21]:
# Start a new run and track 
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Log pickles 
    mlflow.log_artifacts(LOG_PATH)
    
    # Log parametres 
    mlflow.log_param(LOG_PATH)
    
    # Track metrics 
    for metric, score in test_scores_df.items():
        mlflow.log_metric(metric, score) 
    

***
<h3 id="heading"><span style="background-color:#FCF3AD; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🧠 3. Transformers </span></h3>

***
<h4 id="heading"><span style="background-color:#F8D7FB; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🗃️ training </span></h4>

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split

# Prepare the data
tokenizer = Tokenizer(num_words=5000)  # You can adjust the vocabulary size
tokenizer.fit_on_texts(train_df['content'])
X = tokenizer.texts_to_sequences(train_df['content'])
X = pad_sequences(X, maxlen=100)  # Adjust maxlen as needed

Y = train_df['label'].values

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.8, random_state=0)

# Create the model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=5, batch_size=64)

# Test the model on the test set
test_sequences = tokenizer.texts_to_sequences(test_df['content'])
X_test = pad_sequences(test_sequences, maxlen=100)

predictions = model.predict(X_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
class_labels = (predictions > 0.5).astype(int)

In [13]:
# Evaluate on validation set
validation_scores = {score.__name__: calculate_quality(test_df['label'], class_labels, score) 
                     for score in [accuracy_score, precision_score, recall_score, f1_score]}

# Create a DataFrame from the dictionary
test_scores_df = pd.DataFrame.from_dict(validation_scores, orient='index', columns=['Validation Score']).T
print('The confusion matrix is:\n ',confusion_matrix(test_df['label'], class_labels))
# Display the DataFrame
test_scores_df

The confusion matrix is:
  [[1414  925]
 [1086 1775]]


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Validation Score,61.33,65.74,62.04,63.84


***
<h4 id="heading"><span style="background-color:#F8D7FB; color:#1b4946 ; display:fill;border-radius:5px; font-family:cursive"> 🗃️ Log run </span></h4>

<h5 id="heading"><span font-family:Georgia"><Strong>📑 1. Prepare </Strong></span></h5>


In [14]:
# Data details
data_details = {"data_train_path": TRAIN_PATH,
                "data_test_path": TEST_PATH,
                "training_victorize": X_train,
                "test_victorize":     X_test
               }

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [15]:
# Model
model = {"model_description": "tensorFlow LSTM",
         "model_details": str(model),
         "model_object": model} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [16]:
# Hyperparametres
param = {"num_words": 5000,
         "Tokenizer" :'tf.Tokenizer()',
         "loss": 'binary_crossentropy',
         "optimizer": 'adam'
               }

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(param, output_file)

In [17]:
# Performance details
classes_metrics = {"test_scores":  test_scores_df}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

<h5 id="heading"><span font-family:Georgia"><Strong>📑 2. Log </Strong></span></h5>


In [21]:
# Start a new run and track 
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Log pickles 
    mlflow.log_artifacts(LOG_PATH)
    
    # Log parametres 
    mlflow.log_param(LOG_PATH)
    
    # Track metrics 
    for metric, score in test_scores_df.items():
        mlflow.log_metric(metric, score) 
    