In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline

import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [None]:
import random
import os
from tensorflow.python.eager import context
from tensorflow.python.client import device_lib


# Report GPU availability. The physical GPU list is queried once and reused
# for every message below.
gpu_devices = tf.config.list_physical_devices('GPU')

if not gpu_devices:
    print("GPU is not available\n")
else:
    print("GPU is available\n")
    # NOTE(review): this line is printed whenever a GPU is visible; nothing
    # here actually probes cuDNN.
    print("CuDNN is enabled: True\n")


print(gpu_devices, "\n")
print("Num GPUs Available: ", len(gpu_devices), "\n")
print("GPU Name: ", tf.test.gpu_device_name(), "\n")


def get_available_devices():
    """Return the ``name`` of every entry from ``device_lib.list_local_devices()``.

    Returns:
        list: one name per local device, in the order TensorFlow reports them.
    """
    device_names = []
    for device_proto in device_lib.list_local_devices():
        device_names.append(device_proto.name)
    return device_names

# Enumerate the local devices once and reuse the list for both printouts.
devices_available = get_available_devices()
print("Available devices: ", devices_available, "\n")

# Index 1 only exists when a second device is listed after the first one.
# Fall back to the first entry so a single-device (e.g. CPU-only) machine
# does not raise IndexError.
print(devices_available[1] if len(devices_available) > 1 else devices_available[0])

In [None]:
# Detect hardware and pick the matching distribution strategy.
# devices_available[1] only exists when a second device is listed after the
# first one; fall back to the first entry so a single-device (e.g. CPU-only)
# machine does not raise IndexError here.
if len(devices_available) > 1:
    target_device = devices_available[1]
else:
    target_device = devices_available[0]

with tf.device(target_device):
    tf.debugging.set_log_device_placement(True)
    try:
        # TPU detection. No parameters necessary if TPU_NAME environment variable is
        # set: this is always the case on Kaggle.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        # Treated as "no TPU": the default strategy is used below.
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # Prefer the stable tf.distribute.TPUStrategy; fall back to the
        # experimental alias on TensorFlow releases that do not have it.
        tpu_strategy_cls = (
            getattr(tf.distribute, "TPUStrategy", None)
            or tf.distribute.experimental.TPUStrategy
        )
        strategy = tpu_strategy_cls(tpu)
    else:
        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
        strategy = tf.distribute.get_strategy()

    print("Num of replicas: ", strategy.num_replicas_in_sync)
# When strategy.num_replicas_in_sync equals 1, training is not distributed:
# it is being performed on a single device.

In [None]:
# Directory holding the Jigsaw CSV files. Defaults to the original local path;
# set the JIGSAW_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = os.environ.get(
    "JIGSAW_DATA_DIR",
    "A:/Programming/Mini Project/College projects/S6 mini project/Transformers_study/jigsaw-multilingual-toxic-comment-classification",
)

train      = pd.read_csv(f"{DATA_DIR}/jigsaw-toxic-comment-train.csv")
validation = pd.read_csv(f"{DATA_DIR}/validation.csv")
test       = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# For each dataset, print its column count, its column names and its row count.
for frame in (train, validation, test):
    nrows, ncols = frame.shape
    print("Number of columns: ", ncols)
    # Every column name is followed by ", " and all of them share one line.
    print("".join(f"{column}, " for column in frame.columns), end="")
    print("\nNumber of rows: ", nrows)

In [None]:
# Drop the listed columns. Assigning the result (instead of inplace=True)
# together with errors="ignore" makes this cell safe to re-run: a second run
# no longer raises KeyError because the columns are already gone.
train = train.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors="ignore",
)

In [None]:
# Keep the rows whose index label is at most LAST_ROW_LABEL. `.loc` slices by
# label and includes the end label itself.
LAST_ROW_LABEL = 12000
train = train.loc[:LAST_ROW_LABEL, :]
train.shape

In [None]:
# Largest whitespace-delimited word count among the values of `comment_text`.
def _word_count(text):
    """Return how many whitespace-separated tokens ``str(text)`` contains."""
    return len(str(text).split())

train['comment_text'].map(_word_count).max()

In [None]:
def roc_auc(predictions, target):
    """Return the area under the ROC curve for ``predictions`` against ``target``.

    Args:
        predictions: scores passed to ``metrics.roc_curve`` as its second argument.
        target: labels passed to ``metrics.roc_curve`` as its first argument.

    Returns:
        The value of ``metrics.auc`` over the false-positive / true-positive
        rates produced by ``metrics.roc_curve``.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [None]:
# Stratified 80/20 split of the comments and their `toxic` labels.
comments = train["comment_text"].values
labels = train["toxic"].values

xtrain, xvalid, ytrain, yvalid = train_test_split(
    comments,
    labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
    random_state=42,
)