In [1]:
import os
import tempfile
# For N-dimensional array manipulation
import numpy as np
# Plotting library
import matplotlib.pyplot as plt
# For data analysis and data structures in DataFrames
import pandas as pd
# For data visualization
import seaborn as sns

# For machine learning algorithms and evaluation metrics
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics

#import tensorflow
import tensorflow as tf
from tensorflow import keras
from keras import layers
# import TextVectorization from keras
from keras.layers import TextVectorization

In [2]:

# Load dataset
df = pd.read_csv('../../dataset/media_bias.csv')

# Clean dataset
df = df[df.Label_bias != 'No agreement']
df = df[df.article != 'NaN']
df = df[df.sentence != 'NaN']

# Replace label with 0, 1
df['Label_bias'] = df['Label_bias'].replace('Biased', 1)
df['Label_bias'] = df['Label_bias'].replace('Non-biased', 0)

# Only use sentence column and bias column
df = df[['sentence', 'Label_bias']]
df = df.rename(columns={'sentence': 'text', 'Label_bias': 'label'})

# Split data into X and y
X = df['text']
y = df['label']

# Remove numbers from all strings in X
X = X.str.replace('\d+', '', regex=True)

# Remove punctuation from all strings in X
X = X.str.replace('[^\w\s]','',regex=True)

In [3]:
# Split data into train, validation and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Print shape of train, validation and test
print("X_train shape: ", X_train.shape)
# print("X_val shape: ", X_val.shape)
print("X_test shape: ", X_test.shape)


# Flatten X_train for training and X_test for testing
X_train = np.array(X_train).flatten()
X_test = np.array(X_test).flatten()
    
# X_train = X_train.to_numpy()
# X_train = np.array(X_train).flatten()
# print x shape
print("X_train shape: ", X_train.shape)
# print test shape
print("X_test shape: ", X_test.shape)


X_train shape:  (992,)
X_test shape:  (311,)
X_train shape:  (992,)
X_test shape:  (311,)


In [18]:
print(X_train.shape[0])
# Create DNN using tensorflow
vectorize_layer = TextVectorization(max_tokens=512, output_mode='int', output_sequence_length=128)
vectorize_layer.adapt(X_train)
model = keras.Sequential([
    vectorize_layer,
    layers.Embedding(input_dim=X_train.shape[0] , output_dim=128, mask_zero=True),
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.Flatten(),
    layers.Dropout(0.4),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
model.compile(loss='sparese_catageorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train model
history = model.fit(X_train, y_train, epochs=4, batch_size=16, validation_data=(X_val, y_val))

992
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_9 (TextV  (None, 128)              0         
 ectorization)                                                   
                                                                 
 embedding_7 (Embedding)     (None, 128, 128)          126976    
                                                                 
 bidirectional_8 (Bidirectio  (None, 128, 256)         263168    
 nal)                                                            
                                                                 
 flatten_7 (Flatten)         (None, 32768)             0         
                                                                 
 dropout_9 (Dropout)         (None, 32768)             0         
                                                                 
 dense_16 (Dense)            (None, 128)          

In [20]:

# model.save('model/saved_model')
# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

prediction = model.predict(["YouTube is making clear there will be no “birtherism” on its platform during this year’s U.S. presidential election – a belated response to a type of conspiracy theory more prevalent in the 2012 race.", "The increasingly bitter dispute between American women’s national soccer team and the U.S. Soccer Federation spilled onto the field Wednesday night when players wore their warm-up jerseys inside outin a protest before their 3-1 victory over Japan."])
print(prediction, "1 is bias, 0 is non-bias")


Loss:  1.0383291244506836
Accuracy:  0.6366559267044067
[[0.1754145 ]
 [0.00295903]] 1 is bias, 0 is non-bias


In [14]:
parent_dir = os.path.split(os.getcwd())[0] + "\\" + os.path.split(os.getcwd())[1]
save_path = parent_dir + "/model/1/"
# tf.saved_model.save(model, save_path) - DOESN'T SAVE THE LAYERS

model.save(save_path, save_format='tf') # ERROR states layers aren't saved, but keras_metadata.pb is saved



INFO:tensorflow:Assets written to: c:\Users\verni\Desktop\AI course\group-07\notebooks\toy_model/model/1/assets


INFO:tensorflow:Assets written to: c:\Users\verni\Desktop\AI course\group-07\notebooks\toy_model/model/1/assets


In [15]:

# def _serving_input_receiver_fn():
#     serialized_tf_example = tf.placeholder(dtype=tf.string, shape=None, 
#                                            name='input_example_tensor')
#     # key (e.g. 'examples') should be same with the inputKey when you 
#     # buid the request for prediction
#     receiver_tensors = {'examples': serialized_tf_example}
#     inputs = {'text': tf.placeholder(tf.string, [None])}
#     return tf.estimator.export.ServingInputReceiver(inputs, receiver_tensors)

In [16]:
from google.cloud import aiplatform
from google.cloud import storage
import os
project_id = 'dit825'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
storage_client = storage.Client(project=project_id)
buckets = storage_client.list_buckets()
print("Buckets:")
for bucket in buckets:
    print(bucket.name) 
print("Listed all storage buckets.")
# List all models in the project from aiplatform
aiplatform.init(project=project_id, location='europe-west4')
models = aiplatform.Model.list()
print("Models:")
for model in models:
    print(model)
print("Listed all models.")


Buckets:
example_bucket_v1
example_bucket_v2-aiproject-dit825
Listed all storage buckets.
Models:
Listed all models.


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(X_train)

train = vectorizer.transform(X_train)
test  = vectorizer.transform(X_test)

classifier = LogisticRegression()
classifier.fit(train, y_train)
score = classifier.score(test, y_test)

In [10]:
print("Accuracy:", score)

Accuracy: 0.7106109324758842
