In [2]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset. NOTE: the variable is named `boston`, but it holds the
# result of fetch_california_housing().
boston = fetch_california_housing()

# Build the feature table and attach the regression label as a 'target' column.
data = pd.DataFrame(boston.data, columns=boston.feature_names).assign(
    target=boston.target
)

# Preview the first rows.
print(data.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  target  
0    -122.23   4.526  
1    -122.22   3.585  
2    -122.24   3.521  
3    -122.25   3.413  
4    -122.25   3.422  


In [4]:
# Replace any missing values with the mean of their column.
# (No missing values are generated in this cell; it only fills existing ones.)
data = data.fillna(data.mean())

# Count the missing values left in each column.
print(data.isna().sum())

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64


In [5]:
# All columns in this regression dataset are numeric, so no encoding is needed.
# Separate the label column from the predictor columns.
y = data['target']
X = data.drop(columns='target')

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression model on the training portion.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and report the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse:.2f}')

Mean Squared Error: 0.56


In [7]:
# Example submission table: 'Id' is the row label of each test sample and
# 'Predicted' is the model's prediction for that row.
submission = pd.DataFrame({'Id': X_test.index}).assign(Predicted=y_pred)

# Saving the file and previewing it are left disabled:
# submission.to_csv('submission_regression.csv', index=False)
# print(submission.head())

## Titanic

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [13]:
# Download the Titanic passenger table.
# NOTE: this rebinds `data`, which previously held the housing DataFrame.
titanic_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
data = pd.read_csv(titanic_url)

# Preview the first rows.
print(data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [14]:
# Fill missing 'Age' values with the column mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on data['Age']: that chained in-place form operates
# on a column selection, triggers a FutureWarning in recent pandas 2.x releases,
# and leaves `data` unchanged once Copy-on-Write is active (pandas 3.0 default).
data['Age'] = data['Age'].fillna(data['Age'].mean())

# Fill missing 'Embarked' values with the most frequent value (same assignment
# pattern, for the same reason).
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Choose the model inputs and the label.
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
X = data[features]
y = data['Survived']

print(X.head())  # show the selected features

   Pclass     Sex   Age     Fare Embarked
0       3    male  22.0   7.2500        S
1       1  female  38.0  71.2833        C
2       3  female  26.0   7.9250        S
3       1  female  35.0  53.1000        S
4       3    male  35.0   8.0500        S


In [15]:
# 'Sex' and 'Embarked' are categorical, so expand each into one-hot indicator columns.
categorical_cols = ['Sex', 'Embarked']
X_encoded = pd.get_dummies(X, columns=categorical_cols)

print(X_encoded.head())  # inspect the encoded columns

   Pclass   Age     Fare  Sex_female  Sex_male  Embarked_C  Embarked_Q  \
0       3  22.0   7.2500       False      True       False       False   
1       1  38.0  71.2833        True     False        True       False   
2       3  26.0   7.9250        True     False       False       False   
3       1  35.0  53.1000        True     False       False       False   
4       3  35.0   8.0500       False      True       False       False   

   Embarked_S  
0        True  
1       False  
2        True  
3        True  
4        True  


In [16]:
# Split the encoded features: 80% for training, 20% for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier, allowing up to 200 solver iterations.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predict the held-out rows and report the accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.80


In [17]:
# Submission table: 'Id' is the row label of each test sample in X_test
# (not the PassengerId column) and 'Survived' is the predicted class.
submission = pd.DataFrame({'Id': X_test.index}).assign(Survived=y_pred)

# Saving the file and previewing it are left disabled:
# submission.to_csv('submission_titanic.csv', index=False)
# print(submission.head())