In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer

# NLTK 데이터 다운로드 (처음 실행할 때만 필요)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('vader_lexicon')

# 샘플 텍스트
text = "Natural language processing is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language."

# 1. 토큰화
tokens = word_tokenize(text)
print("토큰화:", tokens[:10])

# 2. 불용어 제거
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("불용어 제거 후:", filtered_tokens[:10])

# 3. 어간 추출
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
print("어간 추출 후:", stemmed_tokens[:10])

# 4. 감성 분석
sid = SentimentIntensityAnalyzer()
sentiment_scores = sid.polarity_scores(text)
print("감성 분석 결과:", sentiment_scores)

# neg - Negative(부정적) / neu - Neutral(중립적) / pos - Positive(긍정적) / compound - Compound Score(종합점수)
# compound : -1(매우 부정적) ~ 1(매우 긍정적)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


토큰화: ['Natural', 'language', 'processing', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer']
불용어 제거 후: ['Natural', 'language', 'processing', 'subfield', 'linguistics', ',', 'computer', 'science', ',', 'artificial']
어간 추출 후: ['natur', 'languag', 'process', 'subfield', 'linguist', ',', 'comput', 'scienc', ',', 'artifici']
감성 분석 결과: {'neg': 0.0, 'neu': 0.772, 'pos': 0.228, 'compound': 0.6808}


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# NLTK 데이터 다운로드
# nltk.download('punkt')
# nltk.download('stopwords')

# 샘플 데이터 생성 (실제로는 더 많은 데이터가 필요합니다)
data = {
    'review': [
        "This movie was fantastic! I loved every minute of it.",
        "Worst film I've ever seen. Complete waste of time.",
        "Average movie, nothing special but not terrible.",
        "The acting was great, but the plot was confusing.",
        "I fell asleep halfway through, so boring.",
        "Absolutely brilliant! A must-watch for everyone.",
        "The special effects were amazing, but the story was weak.",
        "I can't believe I paid money to see this garbage.",
        "It was okay, I guess. Wouldn't watch it again though.",
        "A cinematic masterpiece! Truly unforgettable."
    ],
    'sentiment': ['positive', 'negative', 'neutral', 'neutral', 'negative',
                  'positive', 'neutral', 'negative', 'neutral', 'positive']
}

df = pd.DataFrame(data)

# 텍스트 전처리 함수
def preprocess_text(text):
    # 소문자 변환 및 특수문자 제거
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # 토큰화
    tokens = word_tokenize(text)
    # 불용어 제거
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    return ' '.join(tokens)

# 텍스트 전처리 적용
df['processed_review'] = df['review'].apply(preprocess_text)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(df['processed_review'], df['sentiment'], test_size=0.2, random_state=42)

# TF-IDF 벡터화
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# 모델 학습
model = MultinomialNB()
model.fit(X_train_vectorized, y_train)

# 예측 및 평가
y_pred = model.predict(X_test_vectorized)
print(classification_report(y_test, y_pred, zero_division=1))

# 새로운 리뷰에 대한 예측
new_reviews = [
    "I really enjoyed this movie, it was entertaining from start to finish.",
    "This film was a complete disappointment. I wouldn't recommend it to anyone.",
    "It was an okay movie, had its moments but nothing extraordinary."
]

new_reviews_processed = [preprocess_text(review) for review in new_reviews]
new_reviews_vectorized = vectorizer.transform(new_reviews_processed)
predictions = model.predict(new_reviews_vectorized)

for review, prediction in zip(new_reviews, predictions):
    print(f"Review: {review}\nPredicted sentiment: {prediction}\n")

              precision    recall  f1-score   support

    negative       1.00      0.00      0.00         1
     neutral       0.50      1.00      0.67         1

    accuracy                           0.50         2
   macro avg       0.75      0.50      0.33         2
weighted avg       0.75      0.50      0.33         2

Review: I really enjoyed this movie, it was entertaining from start to finish.
Predicted sentiment: neutral

Review: This film was a complete disappointment. I wouldn't recommend it to anyone.
Predicted sentiment: neutral

Review: It was an okay movie, had its moments but nothing extraordinary.
Predicted sentiment: neutral

