<a href="https://colab.research.google.com/github/Annie-Yeeun-Jang/Yeeun-J/blob/master/text_conf/make_data_by_discriminator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

# Train

In [None]:
import pandas as pd
import torch
import os
import matplotlib.pyplot as plt

# Preliminaries

from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
#source_folder = '/content/drive/My Drive/transformers/Data'


In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
destination_folder = '/content/drive/Shareddrives/text_conf/discriminator/check_point/'

In [None]:
MAX_SEQ_LEN = 128
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode,is_target = True, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)


In [None]:
class BERT(nn.Module):

    def __init__(self):
        super(BERT, self).__init__()

        options_name = "bert-base-uncased"
        self.encoder = BertForSequenceClassification.from_pretrained(options_name)

    def forward(self, text, label):
        loss, text_fea = self.encoder(text, labels=label)[:2]

        return loss, text_fea

In [None]:
model = BERT().to(device)
state_dict = torch.load(destination_folder + 'model.pt', map_location=device)
model.load_state_dict(state_dict['model_state_dict'])

In [None]:
tabular = TabularDataset(path="/content/drive/Shareddrives/text_conf/dataset/preprocessed/discriminator/with_academic_len32.csv", format='CSV', fields=[('text', text_field)], skip_header=True)
# Iterators
data_iter = Iterator(tabular, batch_size=16, device=device, train=False, shuffle=False, sort=False)

In [None]:
tokenizer.decode(next(iter(data_iter)).text[0])

In [None]:
original = pd.read_csv("/content/drive/Shareddrives/text_conf/dataset/preprocessed/discriminator/with_academic_len32.csv").text.tolist()
original_batch = []
for batch in range(len(original)//16 + 1):
  original_batch.append(original[batch:(1+batch)*16])

In [None]:
threshold = 0.9
softmax = nn.Softmax(dim = 1)

In [None]:
with torch.no_grad():
  model.eval()
  native = []
  nonnative = []
  for (i, (iter)), original_text in zip(enumerate(data_iter), original_batch):
    text = iter.text.type(torch.LongTensor).to(device)
    label = torch.tensor([0]*len(text), dtype = torch.long).to(device)
    _, tmp = model(text, label)

    result = tmp.to('cpu')

    softmax_dist = softmax(result)
    argmax_idx = torch.argmax(softmax_dist, dim = 1)
    confidence_list = [dist[idx] for dist, idx in zip(softmax_dist, argmax_idx)]
    
    native_text = [line for line, idx, confidence in zip(original_text, argmax_idx, confidence_list) if idx == 0 and confidence > threshold]
    nonnative_text = [line for line, idx, confidence in zip(original_text, argmax_idx, confidence_list) if idx == 1 and confidence > threshold]

    native.extend(native_text)
    nonnative.extend(nonnative_text)

    if i % 300 == 0:
      print(f"{i}/{len(data_iter)} 배치 완료")

In [None]:
print(len(native), len(nonnative))

In [None]:
nonnative[-10:]

# train test val 나눠서 저장하기

In [None]:
native_idx = round(len(native)*0.9)
nonnative_idx = round(len(nonnative)*0.9)

In [None]:
import random
random.shuffle(native)
random.shuffle(nonnative)

native_train = native[:native_idx]
native_test = native[native_idx:]

nonnative_train = nonnative[:nonnative_idx]
nonnative_test = nonnative[nonnative_idx:]

In [None]:
native_train_pd = pd.DataFrame(native_train)
native_test_pd = pd.DataFrame(native_test)

nonnative_train_pd = pd.DataFrame(nonnative_train)
nonnative_test_pd = pd.DataFrame(nonnative_test)

native_train_pd.to_csv("/content/drive/Shareddrives/text_conf/dataset/styletransformer/native_train2.csv", header=None, index = None)
native_test_pd.to_csv("/content/drive/Shareddrives/text_conf/dataset/styletransformer/native_test2.csv", header=None, index = None)

nonnative_train_pd.to_csv("/content/drive/Shareddrives/text_conf/dataset/styletransformer/nonnative_train2.csv", header=None, index = None)
nonnative_test_pd.to_csv("/content/drive/Shareddrives/text_conf/dataset/styletransformer/nonnative_test2.csv", header=None, index = None)

# 문장 길이 짧은거 없애기

In [None]:
pd.read_csv("/content/drive/Shareddrives/text_conf/dataset/styletransformer/native_train2.csv").text.to_list()

In [9]:
import pandas as pd
native_train = pd.read_csv("/content/drive/Shareddrives/text_conf/dataset/styletransformer/native_train2.csv", header= None)
native_test = pd.read_csv("/content/drive/Shareddrives/text_conf/dataset/styletransformer/native_test2.csv", header= None)

nonnative_train = pd.read_csv("/content/drive/Shareddrives/text_conf/dataset/styletransformer/nonnative_train2.csv", header= None)
nonnative_test = pd.read_csv("/content/drive/Shareddrives/text_conf/dataset/styletransformer/nonnative_test2.csv", header= None)

In [41]:
with open("/content/drive/Shareddrives/text_conf/dataset/styletransformer/native_train2.csv", "r") as f:
    native_train= f.readlines()
with open("/content/drive/Shareddrives/text_conf/dataset/styletransformer/native_test2.csv", "r") as f:
    native_test= f.readlines()

with open("/content/drive/Shareddrives/text_conf/dataset/styletransformer/nonnative_train2.csv", "r") as f:
    nonnative_train= f.readlines()
with open("/content/drive/Shareddrives/text_conf/dataset/styletransformer/nonnative_test2.csv", "r") as f:
    nonnative_test= f.readlines()
    

In [42]:
native_train = list(filter(lambda x: len(x)>20, native_train))
native_test = list(filter(lambda x: len(x)<20, native_test))
nonnative_train = list(filter(lambda x: len(x)<20, nonnative_train))
nonnative_test = list(filter(lambda x: len(x)<30, nonnative_test))

In [43]:
native_test

["One of Hank Jr . '\n"]

In [44]:
nonnative_test

['They will not be stifled .\n',
 '"Again , as Boggess et al."\n',
 'No one had a glib answer .\n',
 'It was painful for her .\n',
 '"Again , as Boggess et al."\n',
 'She was far away from home .\n',
 'The very presence of U.N .\n',
 'The answer is probably yes .\n',
 'NUM  million out of  NUM .\n',
 'I told him it was too late .\n',
 'The new prize is Asia .\n',
 'NUM  million out of  NUM .\n',
 'NUM  billion for the year .\n',
 '"Who will want her ? """\n',
 'NUM  billion for the year .\n',
 'No one had a glib answer .\n',
 'Birth then was a ceremony .\n',
 "do n't over speak ; .\n",
 'NUM  million out of  NUM .\n',
 '", Stony Point , NY  NUM  ."\n',
 '"NUM   NUM  , p = ."\n',
 'NUM  billion for the year .\n',
 'That is the first problem .\n',
 'That is the first problem .\n',
 '"Again , as Boggess et al."\n',
 'That is the first problem .\n',
 'The new prize is Asia .\n',
 'And now they are thriving .\n',
 'I told him it was too late .\n',
 'Where does it come from ?\n',
 'NUM  bill

In [None]:
native_train