# DataFrame 형성

In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.metrics import classification_report

In [31]:
df = pd.read_csv('./data/alzheimers_disease_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [32]:

df = df[['Gender' , 'AlcoholConsumption', 'PhysicalActivity', 'SleepQuality', 'FamilyHistoryAlzheimers', 'CholesterolTriglycerides','DietQuality', 'BMI', 'MemoryComplaints', 'Diagnosis' ]]

In [33]:
df.groupby('FamilyHistoryAlzheimers')['Diagnosis'].value_counts()

FamilyHistoryAlzheimers  Diagnosis
0                        0            1024
                         1             583
1                        0             365
                         1             177
Name: count, dtype: int64

In [34]:
df.describe()

Unnamed: 0,Gender,AlcoholConsumption,PhysicalActivity,SleepQuality,FamilyHistoryAlzheimers,CholesterolTriglycerides,DietQuality,BMI,MemoryComplaints,Diagnosis
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,0.506282,10.039442,4.920202,7.051081,0.25221,228.281496,4.993138,27.655697,0.208004,0.353653
std,0.500077,5.75791,2.857191,1.763573,0.434382,101.986721,2.909055,7.217438,0.405974,0.478214
min,0.0,0.002003,0.003616,4.002629,0.0,50.407194,0.009385,15.008851,0.0,0.0
25%,0.0,5.13981,2.570626,5.482997,0.0,137.583222,2.458455,21.611408,0.0,0.0
50%,1.0,9.934412,4.766424,7.115646,0.0,230.301983,5.076087,27.823924,0.0,0.0
75%,1.0,15.157931,7.427899,8.562521,1.0,314.839046,7.558625,33.869778,0.0,1.0
max,1.0,19.989293,9.987429,9.99984,1.0,399.941862,9.998346,39.992767,1.0,1.0


## 1. LogisticRegression 전체 - Scaling 없이


In [35]:
# X와 y 데이터 분리
X = df.drop(['Diagnosis'],axis=1)
y = df['Diagnosis']

# 훈련 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# 모델 객체 생성 및 학습 & 평가
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# 모델 성능 평가
print('LogisticRegression(Scaling 없이)')
print('학습 점수 :', lr.score(X_train,y_train))
print('테스트 점수:', lr.score(X_test,y_test))
print("\n📋 분류 리포트:\n", classification_report(y_test, y_pred))


LogisticRegression(Scaling 없이)
학습 점수 : 0.7051520794537555
테스트 점수: 0.7026022304832714

📋 분류 리포트:
               precision    recall  f1-score   support

           0       0.72      0.88      0.79       342
           1       0.66      0.39      0.49       196

    accuracy                           0.70       538
   macro avg       0.69      0.64      0.64       538
weighted avg       0.69      0.70      0.68       538



## 2. LogisticRegression 전체 - StandardScaler 0

In [36]:


X = df.drop('Diagnosis',axis=1)
y = df['Diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

scaler1 = StandardScaler()

X_train_scaled = scaler1.fit_transform(X_train)
X_test_scaled = scaler1.transform(X_test)

lr1 = LogisticRegression(max_iter=1000)
lr1.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

# 모델 성능 평가
print('LogisticRegression( StandardScaling O )')
print('학습 점수 :', lr.score(X_train_scaled,y_train))
print('테스트 점수:', lr.score(X_test_scaled,y_test))
print("\n📋 분류 리포트:\n", classification_report(y_test, y_pred))

LogisticRegression( StandardScaling O )
학습 점수 : 0.7051520794537555
테스트 점수: 0.7026022304832714

📋 분류 리포트:
               precision    recall  f1-score   support

           0       0.72      0.88      0.79       342
           1       0.66      0.39      0.49       196

    accuracy                           0.70       538
   macro avg       0.69      0.64      0.64       538
weighted avg       0.69      0.70      0.68       538





## 3. LogisticRegression 전체 - MinMaxScaler


In [37]:

X = df.drop(['Diagnosis'],axis=1)
y = df['Diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(max_iter=1000)

lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

# 모델 성능 평가
print('LogisticRegression( StandardScaling O )')
print('학습 점수 :', lr.score(X_train_scaled,y_train))
print('테스트 점수:', lr.score(X_test_scaled,y_test))
print("\n📋 분류 리포트:\n", classification_report(y_test, y_pred))

LogisticRegression( StandardScaling O )
학습 점수 : 0.7051520794537555
테스트 점수: 0.7026022304832714

📋 분류 리포트:
               precision    recall  f1-score   support

           0       0.72      0.88      0.79       342
           1       0.66      0.39      0.49       196

    accuracy                           0.70       538
   macro avg       0.69      0.64      0.64       538
weighted avg       0.69      0.70      0.68       538



## 4. LogisticRegression (MemoryComplaints 없이) - Scaling 없이

In [38]:

X = df.drop(['Diagnosis','MemoryComplaints'],axis=1)
y = df['Diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# 모델 성능 평가
print('LogisticRegression( StandardScaling X )')
print('학습 점수 :', lr.score(X_train,y_train))
print('테스트 점수:', lr.score(X_test,y_test))
print("\n📋 분류 리포트:\n", classification_report(y_test, y_pred))

LogisticRegression( StandardScaling X )
학습 점수 : 0.6499068901303539
테스트 점수: 0.6356877323420075

📋 분류 리포트:
               precision    recall  f1-score   support

           0       0.64      1.00      0.78       342
           1       0.00      0.00      0.00       196

    accuracy                           0.64       538
   macro avg       0.32      0.50      0.39       538
weighted avg       0.40      0.64      0.49       538



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 5. LogisticRegression (MemoryComplaints 없이) - StandardScaler 0


In [39]:

X = df.drop(['Diagnosis','MemoryComplaints'],axis=1)
y = df['Diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression()

lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

# 모델 성능 평가
print('LogisticRegression( MemoryComplaints ,StandardScaling O )')
print('학습 점수 :', lr.score(X_train_scaled,y_train))
print('테스트 점수:', lr.score(X_test_scaled,y_test))
print("\n📋 분류 리포트:\n", classification_report(y_test, y_pred))

LogisticRegression( MemoryComplaints ,StandardScaling O )
학습 점수 : 0.6499068901303539
테스트 점수: 0.6356877323420075

📋 분류 리포트:
               precision    recall  f1-score   support

           0       0.64      1.00      0.78       342
           1       0.00      0.00      0.00       196

    accuracy                           0.64       538
   macro avg       0.32      0.50      0.39       538
weighted avg       0.40      0.64      0.49       538



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 6. LogisticRegression (MemoryComplaints 없이) - MinMaxScaler 0

In [40]:

test_df5 = df.copy()

X = test_df5.drop(['Diagnosis','MemoryComplaints'],axis=1)
y = test_df5['Diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression()

lr.fit(X_train_scaled, y_train)
lr.score(X_train_scaled, y_train), lr.score(X_test_scaled, y_test)

(0.6499068901303539, 0.6356877323420075)

## 7. PCA 이용 전체 데이터셋

In [41]:
test_df6 = df.copy()

X = test_df6.drop(['Diagnosis'],axis=1)
y = test_df6['Diagnosis']

pca = PCA(n_components=4)
X_pca = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, random_state=42)

lr = LogisticRegression()

lr.fit(X_train, y_train)
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.6499068901303539, 0.6356877323420075)