## 2022 HDAT-DA (Hyundai motor group Data Analytic test) GPSMode 해설

### 위성정보 수신 상태 정보인 GPSMode 정보에 대한 상세 데이터 분석 및 이를 통한 위성 정보 수신 상태(GPSMode) 예측

#### 1. 데이터 불러오기

In [None]:
pip install pandas

In [None]:
import pandas as pd
import numpy as np
import os, random

def seed_everything(seed: int = 42):
    """Seed the random number generators used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Value applied to every generator. Defaults to 42.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


# Apply the default seed once, as soon as the helper is defined.
seed_everything()

In [None]:
# Read the training set, the test set and the submission template from the
# current working directory, in that order.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "simple_submission.csv")
)

In [None]:
# Show the DataFrame loaded from train.csv as this cell's output.
train

In [None]:
# Show the DataFrame loaded from test.csv as this cell's output.
test

In [None]:
# Show the DataFrame loaded from simple_submission.csv as this cell's output.
submission

#### 2. 기초 통계 분석 & EDA

In [None]:
# List the column labels of the training data, then report how many there are.
column_names = train.columns
print(column_names)
print('column 개수는', len(column_names), '개 입니다.')

In [None]:
# Summary statistics of the training data, shown as this cell's output.
train.describe()

In [None]:
train['GPSMode'].value_counts() # inspect the values of the target variable (the y value)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Histogram of the target column, drawn with 13 bins.
target_values = train['GPSMode']
sns.histplot(target_values, bins=13)
plt.show()

In [None]:
# Count the missing values per column and show only the columns that have any.
temp = train.isnull().sum()
temp.loc[temp > 0]

상관관계 분석

In [None]:
# Pairwise correlation matrix of the columns of the training data.
temp = train.corr()

# Boolean mask covering the upper triangle (diagonal included); masked cells
# are not drawn, so each pair of columns appears only once in the heatmap.
mask = np.zeros_like(temp, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# plt.subplots (plural) creates the figure and returns a (figure, axes) pair.
# plt.subplot (singular) returns a single Axes, which cannot be unpacked into
# two names.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    temp,
    cmap='RdYlBu_r',
    annot=False,
    mask=mask,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
    vmin=-1,
    vmax=1,
    ax=ax,  # draw on the axes created above
)
plt.show()

In [None]:
train[['DriveMode', 'Inhibit_D', 'Inhibit_N', 'Inhibit_P', 'Inhibit_R']].value_counts() # variables whose correlation with the other variables came out as 0 in the analysis; the cause is that they hold the same value in every row

#### 3. 데이터 전처리

다중공선성 검사를 통해 분산팽창계수(VIF)가 10을 초과하는 컬럼 제거

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Columns to remove from both the training and the test data.
drop_columns = ['id']  # the id variable is removed

# Remove the variables that carry no useful information.
drop_columns.extend(['DriveMode', 'Inhibit_D', 'Inhibit_N', 'Inhibit_P', 'Inhibit_R'])

# NOTE(review): 'HevMode' is left out of the VIF input while the target
# 'GPSMode' stays in -- confirm that this is the intended column.
feature = train.drop(['HevMode'], axis=1)
feature_matrix = feature.values  # extracted once instead of once per column

# Variance inflation factor of every column of `feature`.
# (The original assigned to the undefined name `Vif`, raising NameError.)
vif = pd.DataFrame()
vif['VIF Factor'] = [
    variance_inflation_factor(feature_matrix, i)
    for i in range(feature_matrix.shape[1])
]
vif['features'] = feature.columns

# Columns whose VIF is greater than 10 are dropped as well. The target is kept
# out of the drop list because the modelling step reads train_drop['GPSMode'].
picked = [
    column
    for column in vif.loc[vif['VIF Factor'] > 10, 'features']
    if column != 'GPSMode'
]
drop_columns.extend(picked)

# (The original called the undefined name `train_drop` instead of `train.drop`.)
train_drop = train.drop(columns=drop_columns)
test_drop = test.drop(columns=drop_columns)

#### 4. 분석 모델 설계 및 예측

Logistic Regression

In [None]:
# Separate the target column (y) from the remaining predictor columns (x).
train_y = train_drop['GPSMode']
train_x = train_drop.drop(columns='GPSMode')

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier, created with its default settings,
# on the training features and labels.
Logistic_model = LogisticRegression().fit(train_x, train_y)

# Predict the target for every row of the preprocessed test data.
Logistic_prediction = Logistic_model.predict(test_drop)

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier # 분류 문제이므로 classifier import

# Fit a random-forest classifier, created with its default settings,
# on the training features and labels.
rf_model = RandomForestClassifier().fit(train_x, train_y)

# Predict the target for every row of the preprocessed test data.
rf_prediction = rf_model.predict(test_drop)

#### 5. 제출 파일 생성

In [None]:
# Build one submission frame per model: a copy of the template whose
# 'GPSMode' column holds that model's predictions.
Logistic_submission = submission.assign(GPSMode=Logistic_prediction)
rf_submission = submission.assign(GPSMode=rf_prediction)

In [None]:
# Write each submission frame to its own CSV file, without the index column.
for submission_frame, csv_path in (
    (Logistic_submission, 'Logistic_submission.csv'),
    (rf_submission, 'rf_submission.csv'),
):
    submission_frame.to_csv(csv_path, index=False)