In [1]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

NLTK, re를 이용한 데이터 전처리

In [2]:
# Lazily-built NLTK resources shared by every preprocess_text() call, so the
# stop-word list and the lemmatizer are constructed once instead of per row.
_NLTK_RESOURCES = {}


def _get_nltk_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _NLTK_RESOURCES:
        _NLTK_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean a piece of raw text for downstream NLP use.

    Steps, in order: strip HTML tags, tokenize with NLTK, lowercase,
    drop English stop words and any token that is not purely alphabetic,
    lemmatize, and join the surviving tokens with single spaces.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for empty CSV cells)
            is treated as missing.

    Returns:
        The cleaned text as a single space-separated string; ``''`` when
        the input is missing or contains nothing but markup/whitespace.

    Raises:
        LookupError: Raised by NLTK when a required data package
            (tokenizer models, stop-word corpus, WordNet) is not installed.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if not isinstance(text, str):
        return ''

    # Replace tags with a space (not '') so that words separated only by
    # markup, e.g. "Hello<br>World", are not fused into one token.
    text = re.sub(r'<.*?>', ' ', text)

    # Nothing left to tokenize; same result as running the full pipeline.
    if not text.strip():
        return ''

    stop_words, lemmatizer = _get_nltk_resources()

    # Lowercase before filtering so the stop-word lookup is case-insensitive.
    lowered = (token.lower() for token in word_tokenize(text))
    kept = [
        token for token in lowered
        if token.isalpha() and token not in stop_words
    ]

    return ' '.join(lemmatizer.lemmatize(token) for token in kept)


In [3]:
import pandas as pd

# Define the path to the input CSV file and the output CSV file
input_file = 'bing_news.csv'
output_file = 'bing_news_preprocessed.csv'

# Read the CSV file into a DataFrame
data = pd.read_csv(input_file)

# Preprocess the free-text columns with preprocess_text().
# Empty CSV cells are loaded by pandas as float NaN; normalise them to ''
# (and coerce everything to str) so the text cleaner always receives a string.
for text_column in ('name', 'description'):
    data[text_column] = (
        data[text_column]
        .fillna('')
        .astype(str)
        .apply(preprocess_text)
    )

# Save the preprocessed data to a new CSV file (without the index column)
data.to_csv(output_file, index=False)
