# 라이브러리 import

In [1]:
import tensorflow
import matplotlib
import seaborn 
import numpy 
import pandas
import sklearn

print(tensorflow.__version__)
print(matplotlib.__version__)
print(seaborn.__version__)
print(numpy.__version__)
print(pandas.__version__)
print(sklearn.__version__)

2024-12-31 15:38:35.376205: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-31 15:38:35.453980: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735627115.491765   24379 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735627115.507215   24379 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-31 15:38:35.588111: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

2.18.0
3.9.2
0.13.2
1.26.4
2.2.3
1.5.2


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# 로이터 뉴스 데이터 확인

In [3]:
from tensorflow.keras.datasets import reuters

## 데이터 전처리 함수 

In [4]:
def data_preprocessing(num_words):
    # 데이터 로드
    (x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=num_words, test_split=0.2)

    # 단어장 생성 및 단어 인덱스 생성
    word_index = reuters.get_word_index()
    index_to_word = {index + 3: word for word, index in word_index.items()}
    for index, token in enumerate(("<pad>", "<sos>", "<unk>")):
        index_to_word[index]=token

    # 디코딩 (텍스트로 변환) - train
    decoded_train = []
    for seq in x_train:
        t = ' '.join([index_to_word[index] for index in seq]) # index는 seq의 각 요소(정수 인덱스), index_to_word 정수 인덱스와 단어를 매핑한 딕셔너리
        decoded_train.append(t)
    x_train = decoded_train

    # 디코딩 (텍스트로 변환) - test (리스트로 한 번 나타내보기)
    decoded_test = [' '.join([index_to_word[index] for index in seq]) for seq in x_test]
    x_test = decoded_test
    
    # DTM 생성
    vectorizer = CountVectorizer()
    x_train_dtm = vectorizer.fit_transform(x_train)
    x_test_dtm = vectorizer.transform(x_test)

    # TF-IDF 변환
    tfidf_transformer = TfidfTransformer()
    x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)
    x_test_tfidf = tfidf_transformer.transform(x_test_dtm)

    return x_train_tfidf, x_test_tfidf, y_train, y_test


## 모델 및 평가 함수

In [5]:
def train_and_evaluate_models(x_train_tfidf, x_test_tfidf, y_train, y_test):
    
    models = {
        "MultinomialNB": MultinomialNB(),
        "ComplementNB": ComplementNB(),
        "LogisticRegression": LogisticRegression(C=10000, penalty='l2', max_iter=3000),
        "LinearSVC": LinearSVC(C=1000, penalty='l2', max_iter=3000),
        "DecisionTree": DecisionTreeClassifier(max_depth=10, random_state=0),
        "RandomForest": RandomForestClassifier(n_estimators=5, random_state=0),
        "GradientBoosting": GradientBoostingClassifier(random_state=0),
        "VotingClassifier": VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(penalty='l2', random_state=0)),
                ("cnb", ComplementNB()),
                ("gbc", GradientBoostingClassifier(random_state=0))
            ],
            voting='soft'
        )
    }

    results = []
    for model_name, model in models.items():
        model.fit(x_train_tfidf, y_train)
        predicted = model.predict(x_test_tfidf)
        accuracy = accuracy_score(y_test, predicted)
        f1 = f1_score(y_test, predicted, average='weighted')
        # conf_matrix = confusion_matrix(y_test, predicted)
        # class_report = classification_report(y_test, predicted,zero_division=0)
        results.append({
            "Model": model_name,
            "Accuracy": accuracy,
            "F1 Score": f1,
            # "Confusion Matrix": conf_matrix,
            # "Classification Report": class_report
        })
        
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        # print(f"Confusion Matrix:\n{conf_matrix}")
        # print(f"Classification Report:\n{class_report}\n")

    return results

- 보기 쉽게 다시 모델 및 평가 선언

In [8]:
def train_and_evaluate_models(x_train_tfidf, x_test_tfidf, y_train, y_test):
    
    models = {
        "MultinomialNB": MultinomialNB(),
        "ComplementNB": ComplementNB(),
        "LogisticRegression": LogisticRegression(C=10000, penalty='l2', max_iter=3000),
        "LinearSVC": LinearSVC(C=1000, penalty='l2', max_iter=3000),
        "DecisionTree": DecisionTreeClassifier(max_depth=10, random_state=0),
        "RandomForest": RandomForestClassifier(n_estimators=5, random_state=0),
        "GradientBoosting": GradientBoostingClassifier(random_state=0),
        "VotingClassifier": VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(penalty='l2', random_state=0)),
                ("cnb", ComplementNB()),
                ("gbc", GradientBoostingClassifier(random_state=0))
            ],
            voting='soft'
        )
    }

    results = []
    for model_name, model in models.items():
        model.fit(x_train_tfidf, y_train)
        predicted = model.predict(x_test_tfidf)
        accuracy = accuracy_score(y_test, predicted)
        f1 = f1_score(y_test, predicted, average='weighted')
        # conf_matrix = confusion_matrix(y_test, predicted)
        # class_report = classification_report(y_test, predicted,zero_division=0)
        results.append({
            "Model": model_name,
            "Accuracy": accuracy,
            "F1 Score": f1,
            # "Confusion Matrix": conf_matrix,
            # "Classification Report": class_report
        })
        
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        # print(f"Confusion Matrix:\n{conf_matrix}")
        # print(f"Classification Report:\n{class_report}\n")

    return results

## (1) 모든 단어 사용

In [9]:
# # None

# x_train_tfidf, x_test_tfidf, y_train, y_test = data_preprocessing(num_words=None)
# results = train_and_evaluate_models(x_train_tfidf, x_test_tfidf, y_train, y_test)

In [10]:
# None

x_train_tfidf, x_test_tfidf, y_train, y_test = data_preprocessing(num_words=None)
results_none = train_and_evaluate_models(x_train_tfidf, x_test_tfidf, y_train, y_test)

Model: MultinomialNB
Accuracy: 0.5997
F1 Score: 0.5046
Model: ComplementNB
Accuracy: 0.7649
F1 Score: 0.7347
Model: LogisticRegression
Accuracy: 0.8112
F1 Score: 0.8055




Model: LinearSVC
Accuracy: 0.8028
F1 Score: 0.7990
Model: DecisionTree
Accuracy: 0.6211
F1 Score: 0.5769
Model: RandomForest
Accuracy: 0.6545
F1 Score: 0.6226
Model: GradientBoosting
Accuracy: 0.7680
F1 Score: 0.7627
Model: VotingClassifier
Accuracy: 0.7996
F1 Score: 0.7943


## (2) 5000개 사용

In [11]:
# 5000개

x_train_tfidf, x_test_tfidf, y_train, y_test = data_preprocessing(num_words=5000)
results_5000 = train_and_evaluate_models(x_train_tfidf, x_test_tfidf, y_train, y_test)

Model: MultinomialNB
Accuracy: 0.6732
F1 Score: 0.6013
Model: ComplementNB
Accuracy: 0.7707
F1 Score: 0.7459
Model: LogisticRegression
Accuracy: 0.8059
F1 Score: 0.8000
Model: LinearSVC
Accuracy: 0.7930
F1 Score: 0.7891
Model: DecisionTree
Accuracy: 0.6180
F1 Score: 0.5730
Model: RandomForest
Accuracy: 0.7012
F1 Score: 0.6770
Model: GradientBoosting
Accuracy: 0.7667
F1 Score: 0.7650
Model: VotingClassifier
Accuracy: 0.7952
F1 Score: 0.7921


## (3) 10000개 사용

In [12]:
# 10000개

x_train_tfidf, x_test_tfidf, y_train, y_test = data_preprocessing(num_words=10000)
results_10000 = train_and_evaluate_models(x_train_tfidf, x_test_tfidf, y_train, y_test)

Model: MultinomialNB
Accuracy: 0.6567
F1 Score: 0.5764
Model: ComplementNB
Accuracy: 0.7707
F1 Score: 0.7457
Model: LogisticRegression
Accuracy: 0.8085
F1 Score: 0.8023




Model: LinearSVC
Accuracy: 0.7921
F1 Score: 0.7878
Model: DecisionTree
Accuracy: 0.6202
F1 Score: 0.5776
Model: RandomForest
Accuracy: 0.6741
F1 Score: 0.6429
Model: GradientBoosting
Accuracy: 0.7685
F1 Score: 0.7648
Model: VotingClassifier
Accuracy: 0.7970
F1 Score: 0.7924


- results를 합치려고 했는데 계속 에러발생
- def에서 return을 작성하지 않아 계속 발생한 것으로 확인됨

In [20]:
print(results)  # 함수 반환값 확인

None


In [19]:
print(type(results_none))
print(type(results_10000))
print(type(results))

<class 'NoneType'>
<class 'NoneType'>
<class 'NoneType'>


In [18]:
# 결과 1차 분석
results_analysis = results_none + results_10000 + results # results = results_5000
results_df = pd.DataFrame(results_analysis)

TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'

## (4) 3000개 
- Logistic regression 에서 5000과 10000이 성능 향상이 크지 않아서 조금 더 작은 값에서 경향을 한 번 더 진행해보고자 함 

In [13]:
# 3000개

x_train_tfidf, x_test_tfidf, y_train, y_test = data_preprocessing(num_words=3000)
results_3000 = train_and_evaluate_models(x_train_tfidf, x_test_tfidf, y_train, y_test)

Model: MultinomialNB
Accuracy: 0.6647
F1 Score: 0.5868
Model: ComplementNB
Accuracy: 0.7685
F1 Score: 0.7440
Model: LogisticRegression
Accuracy: 0.8072
F1 Score: 0.8008
Model: LinearSVC
Accuracy: 0.8001
F1 Score: 0.7965
Model: DecisionTree
Accuracy: 0.6207
F1 Score: 0.5762
Model: RandomForest
Accuracy: 0.6736
F1 Score: 0.6450
Model: GradientBoosting
Accuracy: 0.7636
F1 Score: 0.7615
Model: VotingClassifier
Accuracy: 0.7943
F1 Score: 0.7903


In [14]:
# 결과 2차 분석
results_analysis = results_none + results_3000 + results_5000  + results_10000
results_df = pd.DataFrame(results_analysis)  

In [15]:
results_df

Unnamed: 0,Model,Accuracy,F1 Score
0,MultinomialNB,0.599733,0.504567
1,ComplementNB,0.764915,0.734653
2,LogisticRegression,0.81122,0.805548
3,LinearSVC,0.80276,0.799022
4,DecisionTree,0.621104,0.576928
5,RandomForest,0.654497,0.622591
6,GradientBoosting,0.768032,0.762704
7,VotingClassifier,0.799644,0.79429
8,MultinomialNB,0.673197,0.60125
9,ComplementNB,0.770703,0.745899


## 분석

### 1) Accuracy 가장 높은 모델
- Logistic regression
  - Vocabulary Size: 3000 - Logistic Regression: 0.8008
  - Vocabulary Size: 5000 - Logistic Regression: 0.8059
  - Vocabulary Size: 10000 - Logistic Regression: 0.8085
  - Vocabulary Size: None - Logistic Regression: 0.8112

- Logistic regression:
    - 현재 벡터화에 TF-IDF, CountVectorizer사용 - 고차원 Sparse data가 형성되는데 이러한 고차원 데이터 학습을 효율적으로 할 수 있는 모델이어서 이러한 결과를 보인것으로 생각됨
    - num_words의 경우 거의 비슷하지만 모든데이터를 사용한 학습에서 가장 높은 점수를 확인함

### 2) F1-Score 분석
- F1 score는 정밀도(Precision)과 재현율(Recall)의 조화평균으로
- 분류 클래스 간 데이터 불균형이 심각할 때 사용
- 높을 수록 좋은 모델

- Logistic regression
    - Vocabulary Size: 3000 - Logistic Regression: 0.8008
    - Vocabulary Size: 5000 - Logistic Regression: 0.7999
    - Vocabulary Size: 10000 - Logistic Regression: 0.8023
    - Vocabulary Size: None - Logistic Regression: 0.8055

- Logistic regression:
    - 각 클래스에 대한 확률 예측하는 모델, 불균형이 큰 데이터 학습에도 예측 성능이 뛰어남

### 3) Vocabulary Size 별 모델 성능

- Logistic Regression: 모든 size에서 안정적으로 학습능력이 확인됨 - Accuracy 및 F1 score 기준으로
- Voting Classifier: Logistic Regression 과 유사하지만 각 모델의 성능의 차이에 따라 약간의 차이는 있어보임
- ComplementNB & Gradient Boosting: 모든 size에서 유사한 성능
- MultinomialNB & Decision Tree: size가 클수록 성능이 떨어짐을 확인, 고차원 데이터에 약한 모델로 여겨짐

### 회고
- 다른 어떤 모델보다 Logistic Regression이 계속해서 안정적으로 점수가 확인되는 것이 인상적이었습니다.\
- 추가적으로 Logistic Regression과 Voting Classifier 하이퍼파라미터 조정으로 조금 더 성능을 높이고도 싶고 딥러닝과도 비교해 보고 싶었는데 생각보다 학습에 시간이 많이 소요되어 아쉬움이 남습니다.