<a href="https://colab.research.google.com/github/ChoRockKim/2025-2-Semester-Machine-Learning-Team-Project/blob/main/LogisticRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# 깃허브에서 AI vs Human 데이터 다운로드
!git clone https://github.com/panagiotisanagnostou/AI-GA

# CSV 파일 다운로드
data = pd.read_csv('/content/AI-GA/ai-ga-dataset.csv')

data.head()


Cloning into 'AI-GA'...
remote: Enumerating objects: 21, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 21 (delta 9), reused 14 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (21/21), 13.84 MiB | 10.02 MiB/s, done.
Resolving deltas: 100% (9/9), done.


Unnamed: 0,title,abstract,label
0,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,0
1,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,0
2,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,0
3,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,0
4,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,0


In [None]:
# nltk 라이브러리로 전처리 준비
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK resources this notebook relies on, in the same order as before.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
  nltk.download(resource)

# Preprocessing: tokenize, lowercase, drop stopwords, keep alphanumeric tokens, stem.

# Built once when the cell runs. Looking the stopword list up inside the
# per-token filter re-evaluated stopwords.words('english') for every token and
# searched it as a list; a frozenset gives the same membership answers.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
# A single stemmer instance is reused for every document.
_STEMMER = PorterStemmer()

def preprocess_data(text):
  """Normalize one document into a space-separated string of stemmed tokens.

  Args:
    text: The raw document string; it is passed to nltk.word_tokenize.

  Returns:
    A single string in which the kept tokens are lowercased, stemmed with the
    Porter stemmer and joined by single spaces. A token is kept only when
    str.isalnum() is true for it and its lowercase form is not an English
    stopword.
  """
  tokens = nltk.word_tokenize(text)
  # isalnum() is checked on the token as produced by the tokenizer (before
  # lowercasing), exactly as in the original filter.
  lowered_tokens = (token.lower() for token in tokens if token.isalnum())
  processed_tokens = [
      _STEMMER.stem(lowered)
      for lowered in lowered_tokens
      if lowered not in _ENGLISH_STOPWORDS
  ]

  return ' '.join(processed_tokens)

# Join title and abstract into one text field, then preprocess that field.
combined_text = data['title'] + ' ' + data['abstract']
data['full_text'] = combined_text
data['processed_text'] = combined_text.apply(preprocess_data)
data.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,title,abstract,label,full_text,processed_text
0,Clinical features of culture-proven Mycoplasma...,OBJECTIVE: This retrospective chart review des...,0,Clinical features of culture-proven Mycoplasma...,clinic featur mycoplasma pneumonia infect king...
1,Nitric oxide: a pro-inflammatory mediator in l...,Inflammatory diseases of the respiratory tract...,0,Nitric oxide: a pro-inflammatory mediator in l...,nitric oxid mediat lung diseas inflammatori di...
2,Surfactant protein-D and pulmonary host defense,Surfactant protein-D (SP-D) participates in th...,0,Surfactant protein-D and pulmonary host defens...,surfact pulmonari host defens surfact particip...
3,Role of endothelin-1 in lung disease,Endothelin-1 (ET-1) is a 21 amino acid peptide...,0,Role of endothelin-1 in lung disease Endotheli...,role lung diseas 21 amino acid peptid divers b...
4,Gene expression in epithelial cells in respons...,Respiratory syncytial virus (RSV) and pneumoni...,0,Gene expression in epithelial cells in respons...,gene express epitheli cell respons pneumoviru ...


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; the remaining 70% is the training set.
train_data, temp_test_data = train_test_split(data, random_state=42, test_size=0.3)

# Halve the held-out 30% into a validation set and a test set
# (roughly 15% of the full data each).
val_data, test_data = train_test_split(temp_test_data, random_state=42, test_size=0.5)

n_train, n_val, n_test = (len(split) for split in (train_data, val_data, test_data))
print(f"훈련 데이터셋 크기: {n_train}")
print(f"검증 데이터셋 크기: {n_val}")
print(f"테스트 데이터셋 크기: {n_test}")

훈련 데이터셋 크기: 20063
검증 데이터셋 크기: 4299
테스트 데이터셋 크기: 4300


In [None]:
# Bag-of-Words 방식으로 벡터화 (Only 단어의 빈도수)

from sklearn.feature_extraction.text import CountVectorizer

# Unigrams and bigrams; min_df=5 drops terms that occur in fewer than 5
# training documents (noise reduction).
vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=5)

# Learn the vocabulary on the training split only, and keep the matrix that
# fit_transform already builds instead of vectorizing the training set a
# second time with transform().
train_vectors = vectorizer.fit_transform(train_data['processed_text'])
# Put the CSR column indices in sorted order so the layout does not depend on
# which of fit_transform()/transform() produced the matrix.
train_vectors.sort_indices()
X = train_vectors  # original name of the fit_transform result, kept as an alias

# Preview of the learned vocabulary (term -> column index). The full mapping
# is far too large to print usefully.
print(dict(list(vectorizer.vocabulary_.items())[:20]))
# Vocabulary size
print(len(vectorizer.vocabulary_))

# Encode the validation and test splits with the vocabulary learned above.
val_vectors = vectorizer.transform(val_data['processed_text'])
test_vectors = vectorizer.transform(test_data['processed_text'])

print(train_vectors[0])

{'bilater': 9033, 'massiv': 44344, 'pneumonia': 54981, 'unusu': 78195, 'manifest': 44155, 'puumala': 59846, 'hantaviru': 32155, 'infect': 37435, 'renal': 62524, 'involv': 40353, 'due': 22461, 'european': 25662, 'viru': 80700, 'puuv': 59848, 'frequent': 29702, 'pulmonari': 59711, 'quit': 60075, 'rare': 60338, 'present': 56718, 'male': 43877, 'atyp': 7750, 'clinic': 13039, 'acut': 2178, 'gross': 31587, 'minim': 46459, 'sever': 67214, 'case': 10596, 'highlight': 33201, 'consid': 15639, 'differenti': 20730, 'diagnosi': 20161, 'unusu manifest': 78200, 'hantaviru infect': 32156, 'renal involv': 62534, 'puumala viru': 59847, 'pulmonari involv': 59741, 'rare present': 60355, 'present male': 56866, 'clinic present': 13161, 'present acut': 56722, 'involv sever': 40469, 'sever pulmonari': 67398, 'pulmonari manifest': 59744, 'infect case': 37519, 'case highlight': 10670, 'infect consid': 37557, 'consid differenti': 15660, 'differenti diagnosi': 20739, 'atyp pneumonia': 7753, 'dynam': 22683, '35': 

In [None]:
# 분류모델, 로지스틱 회귀로 훈련
from sklearn.linear_model import LogisticRegression

# Target variable for training: the 'label' column of the training split.
train_labels = train_data['label']

# Logistic-regression classifier trained on the bag-of-words training vectors.
model = LogisticRegression(C=0.1, max_iter=1000)
model.fit(train_vectors, train_labels)

print("모델 훈련 완료.")

모델 훈련 완료.


In [None]:
# 5. 모델 평가

from sklearn.metrics import accuracy_score, classification_report

# --- Validation split: predict, then report accuracy and per-class metrics ---
val_labels = val_data['label']
val_predictions = model.predict(val_vectors)

accuracy = accuracy_score(val_labels, val_predictions)
print(f"검증 데이터 정확도: {accuracy:.4f}")

val_report = classification_report(val_labels, val_predictions)
print("\n검증 데이터 Classification Report:")
print(val_report)

# --- Test split: same metrics on the held-out test data ---
test_labels = test_data['label']
test_predictions = model.predict(test_vectors)

test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"\n테스트 데이터 정확도: {test_accuracy:.4f}")

test_report = classification_report(test_labels, test_predictions)
print("\n테스트 데이터 Classification Report:")
print(test_report)

검증 데이터 정확도: 0.9714

검증 데이터 Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2130
           1       0.97      0.98      0.97      2169

    accuracy                           0.97      4299
   macro avg       0.97      0.97      0.97      4299
weighted avg       0.97      0.97      0.97      4299


테스트 데이터 정확도: 0.9667

테스트 데이터 Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97      2139
           1       0.96      0.97      0.97      2161

    accuracy                           0.97      4300
   macro avg       0.97      0.97      0.97      4300
weighted avg       0.97      0.97      0.97      4300



In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test data.
# labels=[0, 1] pins the matrix to 2x2 with rows/columns ordered (0, 1), so the
# four-way unpacking below cannot fail when only one class happens to appear in
# the labels and predictions.
cm = confusion_matrix(test_data['label'], test_predictions, labels=[0, 1])

# Row-major flattening of [[TN, FP], [FN, TP]].
tn, fp, fn, tp = cm.ravel()

print(f"Confusion Matrix:\n{cm}")
print(f"\nTrue Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"True Negatives (TN): {tn}")
print(f"False Negatives (FN): {fn}")

Confusion Matrix:
[[2058   81]
 [  62 2099]]

True Positives (TP): 2099
False Positives (FP): 81
True Negatives (TN): 2058
False Negatives (FN): 62


In [None]:
# Work on a copy of the test split with the model's predictions attached.
results_df = test_data.copy()
results_df['prediction'] = test_predictions

# Boolean masks for each actual label / predicted label value.
label_is_0 = results_df['label'] == 0
label_is_1 = results_df['label'] == 1
pred_is_0 = results_df['prediction'] == 0
pred_is_1 = results_df['prediction'] == 1

# One frame per confusion-matrix cell: TN, FP, FN, TP.
tn_cases = results_df[label_is_0 & pred_is_0]
fp_cases = results_df[label_is_0 & pred_is_1]
fn_cases = results_df[label_is_1 & pred_is_0]
tp_cases = results_df[label_is_1 & pred_is_1]

print(f"Total test samples: {len(results_df)}")
print(f"True Negatives (TN): {len(tn_cases)}\n{tn_cases.head()}\n")
print(f"False Positives (FP): {len(fp_cases)}\n{fp_cases.head()}\n")
print(f"False Negatives (FN): {len(fn_cases)}\n{fn_cases.head()}\n")
print(f"True Positives (TP): {len(tp_cases)}\n{tp_cases.head()}\n")

Total test samples: 4300
True Negatives (TN): 2058
                                                   title  \
3922   A pilot study—genetic diversity and population...   
2527   Severe acute respiratory infection caused by s...   
928    Evolutionary Dynamics of the Interferon-Induce...   
17980  National reporting of deaths after enhanced Eb...   
18246  Overview of the Development, Impacts, and Chal...   

                                                abstract  label  \
3922   BACKGROUND: The Hindu Kush and Karakoram mount...      0   
2527   In October 2016, a severe infection with swine...      0   
928    Vertebrate interferon-induced transmembrane (I...      0   
17980  BACKGROUND: Sierra Leone experienced the large...      0   
18246  Safety, efficacy, and cost-effectiveness are p...      0   

                                               full_text  \
3922   A pilot study—genetic diversity and population...   
2527   Severe acute respiratory infection caused by s...   
928  

In [None]:
model_name = 'LogisticRegression'
layers = 0  # Assuming 0 layers for a simple Logistic Regression model
epochs = 1000  # Mirrors the LogisticRegression max_iter parameter
seed = 42  # Mirrors the train_test_split random_state parameter


def _save_cases(cases, code, tag):
  """Write one group of test cases to CSV and report where it went.

  Args:
    cases: DataFrame holding the rows of one confusion-matrix cell.
    code: Two-character code used in the filename: the actual label followed
      by the predicted label (e.g. "01" for label 0 predicted as 1).
    tag: Short outcome name ("TN", "FP", "FN" or "TP") used in the filename
      and in the printed message.

  Returns:
    The filename the rows were written to (relative to the working directory).
  """
  filename = f"{model_name}_{code}_layer{layers}_epoch{epochs}_sd{seed}_{tag}.csv"
  cases.to_csv(filename, index=False)
  print(f"Saved {tag} cases to {filename}")
  return filename


# One CSV per confusion-matrix cell; the *_filename names are kept so later
# cells can still refer to the written files.
tn_filename = _save_cases(tn_cases, "00", "TN")
fp_filename = _save_cases(fp_cases, "01", "FP")
fn_filename = _save_cases(fn_cases, "10", "FN")
tp_filename = _save_cases(tp_cases, "11", "TP")

Saved TN cases to LogisticRegression_00_layer0_epoch1000_sd42_TN.csv
Saved FP cases to LogisticRegression_01_layer0_epoch1000_sd42_FP.csv
Saved FN cases to LogisticRegression_10_layer0_epoch1000_sd42_FN.csv
Saved TP cases to LogisticRegression_11_layer0_epoch1000_sd42_TP.csv
