#### Library

In [1]:
import re
import numpy as np
import pandas as pd
import torch
import torchtext
import matplotlib.pyplot as plt
import random
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from torchtext.data import get_tokenizer

In [2]:
# Fix every random-number source so repeated runs are reproducible.
def random_seed(seed_num):
    """Seed Python, NumPy and PyTorch and switch cuDNN to deterministic mode.

    Args:
        seed_num: Seed value handed unchanged to every seeding call below.
    """
    # Global generators of the standard library and NumPy.
    random.seed(seed_num)
    np.random.seed(seed_num)

    # PyTorch seeding entry points (CPU and CUDA variants).
    for seed_fn in (torch.manual_seed,
                    torch.cuda.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(seed_num)

    # cuDNN flags: request deterministic behaviour and turn benchmark mode off.
    cudnn.deterministic = True
    cudnn.benchmark = False


random_seed(42)

#### Data Load

In [3]:
# Read medium_data.csv and keep only the values of its 'title' column.
df = pd.read_csv("../data/Medium/medium_data.csv")['title'].values
df

array(['A Beginner’s Guide to Word Embedding with Gensim Word2Vec\xa0Model',
       'Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric',
       'How to Use ggplot2 in\xa0Python', ...,
       'Content and Marketing Beyond Mass Consumption',
       '5 Questions All Copywriters Should Ask Clients Before Their Pen Hits the\xa0Paper',
       'How To Write a Good Business Blog\xa0Post'], dtype=object)

In [4]:
def cleaning_text(text):
    """Strip unwanted characters from a title and normalise two special spaces.

    Args:
        text: Raw title string.

    Returns:
        The title with every character removed that is not an ASCII letter,
        a digit, whitespace, or one of ``. , @ # ! '``, and with no-break
        spaces (U+00A0) and hair spaces (U+200A) replaced by a plain space.
    """
    # The character class keeps whitespace, so U+00A0 and U+200A survive this
    # pass and are still present for the translation step.
    kept = re.sub(r"[^a-zA-Z0-9.,@#!\s']+", "", text)
    # Map both special spaces to an ordinary space in a single pass.
    return kept.translate({0x00A0: ' ', 0x200A: ' '})

# Clean every title: special characters are stripped and the no-break / hair
# spaces become plain spaces (ordinary whitespace is kept).
cleaned_data = [cleaning_text(title) for title in df]

# Preview the first five titles before and after cleaning.
print('Before preprocessing')
print(df[:5])
print('After preprocessing')
print(cleaned_data[:5])

Before preprocessing
['A Beginner’s Guide to Word Embedding with Gensim Word2Vec\xa0Model'
 'Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric'
 'How to Use ggplot2 in\xa0Python'
 'Databricks: How to Save Files in CSV on Your Local\xa0Computer'
 'A Step-by-Step Implementation of Gradient Descent and Backpropagation']
After preprocessing
['A Beginners Guide to Word Embedding with Gensim Word2Vec Model', 'Handson Graph Neural Networks with PyTorch  PyTorch Geometric', 'How to Use ggplot2 in Python', 'Databricks How to Save Files in CSV on Your Local Computer', 'A StepbyStep Implementation of Gradient Descent and Backpropagation']


#### Tokenizer

In [None]:
# Build torchtext's "basic_english" tokenizer and apply it to the first cleaned title.
# In the recorded output for this cell the tokens come back lowercased and
# split on spaces.
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(cleaned_data[0])
print(f"Original text: {cleaned_data[0]}") # the cleaned title before tokenization
print(f"Token: {tokens}") # the token list produced by the tokenizer

Original text: A Beginners Guide to Word Embedding with Gensim Word2Vec Model
Token: ['a', 'beginners', 'guide', 'to', 'word', 'embedding', 'with', 'gensim', 'word2vec', 'model']
