In [None]:
!pip install transformers==4.3.3
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from transformers import AutoModel, AutoTokenizer 
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval

In [None]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
!pip install sentencepiece

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import string
import tqdm
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from wordcloud import WordCloud
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tokenization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw labelled dataset. The path is relative, so the CSV must sit in
# the notebook's working directory.
raw_df = pd.read_csv('train_unbalanced_final.csv')
raw_df.head()

In [None]:
# Keep only the text and the three stance columns, then discard every row that
# is missing a value in any of them. Shapes are printed after each stage so the
# number of dropped rows is visible.
print(raw_df.shape)
df = raw_df[['content', 'Pro Trump', 'Pro Biden', 'Neutral']].dropna()
print(df.shape)

# Stance flags are cast to int; when the same text occurs more than once only
# its first occurrence is kept.
df = (
    df.astype({"Pro Trump": int, "Pro Biden": int, "Neutral": int})
      .drop_duplicates(subset='content', keep='first')
)
print(df.shape)
df.head()

In [None]:
# Shuffle the rows and renumber the index from 0. The fixed random_state keeps
# the resulting row order identical across Restart & Run All, so the CSVs
# written later in the notebook are reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Stance columns, in the same order they were selected when `df` was built.
# Defined here because nothing earlier in the notebook provides `label_cols`,
# so a fresh-kernel run would otherwise stop with a NameError.
label_cols = ['Pro Trump', 'Pro Biden', 'Neutral']

# Store each row's stance values as a single label vector (one numpy array per row).
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

In [None]:
# Plain Python lists of the per-row label vectors and of the raw texts,
# in the same row order as `df`.
labels = df['one_hot_labels'].tolist()
comments = df['content'].tolist()

In [None]:
# Mount Google Drive at /content/gdrive so files stored there can be opened by path.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime —
# this cell will need replacing when the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
from tensorflow import keras
import tensorflow_hub as hub

# Load the saved sarcasm model from the mounted Drive folder. hub.KerasLayer is
# supplied through custom_objects so Keras can resolve the 'KerasLayer' entries
# stored in the .h5 file.
sarcasm_model = keras.models.load_model(
    '/content/gdrive/MyDrive/Info Retrieval/Sarcasm Detection/model.h5',
    custom_objects={'KerasLayer': hub.KerasLayer},
)

# Print a layer-by-layer overview of the loaded architecture.
sarcasm_model.summary()

In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenized, cut to at most ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]``, converted to ids and right-padded with 0 up to
    ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: length of every encoded row, special tokens included.

    Returns:
        Tuple ``(token_ids, pad_masks, segment_ids)`` of numpy arrays with one
        row per text. Masks are 1 on real tokens and 0 on padding; segment ids
        are all 0.
    """
    id_rows, mask_rows, segment_rows = [], [], []
    body_budget = max_len - 2  # leave room for [CLS] and [SEP]

    for raw_text in tqdm(texts):
        pieces = ["[CLS]"] + tokenizer.tokenize(raw_text)[:body_budget] + ["[SEP]"]
        n_real = len(pieces)
        n_pad = max_len - n_real

        ids = tokenizer.convert_tokens_to_ids(pieces)
        ids += [0] * n_pad  # id 0 is used as the padding value

        id_rows.append(ids)
        mask_rows.append([1] * n_real + [0] * n_pad)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
%%time
# TF-Hub handle of the BERT encoder (per the handle name: uncased English,
# L-24 / H-1024 / A-16).
url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# NOTE(review): in the cells visible here the layer is only used to obtain the
# tokenizer's vocab file and casing flag — confirm whether trainable=True is needed.
bert_layer = hub.KerasLayer(url, trainable=True)

In [None]:
# Build the tokenizer from the vocab file and lower-casing flag exposed by the
# TF-Hub BERT layer, so tokenization uses the assets shipped with that layer.
vocab_fl = bert_layer.resolved_object.vocab_file.asset_path.numpy()
lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_fl, lower_case)

In [None]:
# Encode the `content` column into a (token ids, pad masks, segment ids) tuple,
# with every row truncated/padded to 160 positions.
# NOTE(review): 160 presumably matches the input length the saved sarcasm model
# was built with — confirm against the model summary.
train_input = bert_encode(df['content'].values, tokenizer, max_len=160)

In [None]:
# Quick look at the frame whose `content` column was just encoded.
df.head()

In [None]:
# Run the loaded sarcasm model over the encoded inputs.
# assumes the result is a 2-D array of per-class scores, one row per input — TODO confirm
predictions = sarcasm_model.predict(train_input)
print(predictions)

In [None]:
# For each row of `predictions`, take the column index holding the largest
# value as the predicted class.
classes = np.argmax(predictions, axis = 1)
print(classes)

In [None]:
# Attach the predicted class index to each row. This relies on `classes` being
# in the same row order as `df`, which holds because the model inputs were
# encoded from df['content'].
df['sarcasm_labels'] = classes

In [None]:
def encode_target(t_class):
    """Translate a predicted class index into its sarcasm label string.

    The value is stringified before the lookup, so ints and digit strings are
    both accepted: '2' maps to 'regular', while '0', '1' and '3' map to
    'sarcasm'. Any other value raises KeyError.
    """
    label_by_class = {
        '0': 'sarcasm',
        '1': 'sarcasm',
        '2': 'regular',
        '3': 'sarcasm',
    }
    return label_by_class[str(t_class)]

In [None]:
# Replace the numeric class index in every row with its text label.
# Not idempotent: once the column holds strings, running this cell again raises
# KeyError inside encode_target.
df["sarcasm_labels"] = df['sarcasm_labels'].apply(encode_target)

In [None]:
# Check that `sarcasm_labels` now shows text labels instead of class indices.
df.head()

In [None]:
# Number of rows per sarcasm label.
count = df['sarcasm_labels'].value_counts()
print(count)

In [None]:
# Persist the full frame, including the sarcasm labels, to the working directory.
# to_csv is called without index=False, so the row index is written as an
# unnamed first column.
df.to_csv('train_with_sarcasm.csv')

In [None]:
# Keep only the rows whose predicted label is 'sarcasm'.
df_sarcasm = df.loc[df['sarcasm_labels'] == 'sarcasm']
df_sarcasm.head()

In [None]:
# Persist the sarcasm-only subset to the working directory. As with the full
# export, the row index is written as an unnamed first column.
df_sarcasm.to_csv('sarcasm_only.csv')