# 날씨데이터(aws)의 8시씩 평균 구하기 알고리즘 
- 총 9개의 날씨 데이터 활용 (기온, 풍향, 풍속, 강수량, 현지기압, 해면기압, 습도, 일사, 일조)
- 8시간 마다의 9개의 날씨 변수의 평균값 데이터를 추가
- 음주운전사고 데이터와 날씨데이터의 통합

In [2]:
import pandas as pd
import numpy as np
from itertools import groupby
import os

## 1. 데이터 불러오기

In [4]:
aws_seoul = pd.read_csv('https://raw.githubusercontent.com/DreamingDataScientist/DataCompetition/master/Data/aws_hour_2017_seoul.csv', encoding ='EUC-KR')
aws_seoul.head()

Unnamed: 0,지점,일시,기온(°C),풍향(deg),풍속(m/s),강수량(mm),현지기압(hPa),해면기압(hPa),습도(%),일사(MJ/m^2),일조(hr)
0,400,2017-01-01 00:00,1.7,68.1,1.3,0.0,,,75.6,0.0,0.0
1,400,2017-01-01 01:00,1.4,69.1,1.3,0.0,,,77.8,0.0,0.0
2,400,2017-01-01 02:00,1.2,66.2,1.6,0.0,,,79.6,0.0,0.0
3,400,2017-01-01 03:00,0.5,66.0,1.5,0.0,,,84.4,0.0,0.0
4,400,2017-01-01 04:00,0.6,63.4,0.5,0.0,,,85.6,0.0,0.0


In [6]:
aws_seoul.columns

Index(['지점', '일시', '기온(°C)', '풍향(deg)', '풍속(m/s)', '강수량(mm)', '현지기압(hPa)',
       '해면기압(hPa)', '습도(%)', '일사(MJ/m^2)', '일조(hr)'],
      dtype='object')

### 시범적으로 '기온 칼럼'을 먼저 8개씩 평균을 구해보기

In [7]:
celsius_8h = []

for i in range(len(aws_seoul)-7):
    val = aws_seoul['기온(°C)'][i:i+8].mean()
    celsius_8h.append(val)
celsius_8h

[0.6875,
 0.4875,
 0.375,
 0.35,
 0.5625,
 0.9625,
 1.65,
 2.45,
 3.2875,
 4.125,
 4.887499999999999,
 5.525,
 5.9625,
 6.1375,
 6.025,
 5.875,
 5.725,
 5.5375,
 5.35,
 5.175,
 5.0375,
 4.975,
 4.949999999999999,
 4.85,
 4.8,
 4.8375,
 4.9375,
 5.137499999999999,
 5.475,
 5.9125,
 6.35,
 6.925,
 7.3875,
 7.725,
 7.9625,
 8.125,
 8.125,
 7.975,
 7.725,
 7.2875,
 6.775,
 6.199999999999999,
 5.6125,
 4.975,
 4.3125,
 3.6374999999999997,
 3.025,
 2.5250000000000004,
 2.125,
 1.8499999999999999,
 1.6625,
 1.6875000000000002,
 1.975,
 2.45,
 3.1375,
 3.8500000000000005,
 4.5625,
 5.2625,
 5.8375,
 6.175000000000001,
 6.2625,
 6.1875,
 5.8999999999999995,
 5.575,
 5.225,
 4.875,
 4.5874999999999995,
 4.4125,
 4.3125,
 4.237500000000001,
 4.1,
 3.9375,
 3.775,
 3.575,
 3.4499999999999997,
 3.475,
 3.725,
 4.074999999999999,
 4.6875,
 5.3125,
 6.0125,
 6.7375,
 7.35,
 7.725,
 7.800000000000001,
 7.7125,
 7.4,
 7.025,
 6.5625,
 6.0625,
 5.6125,
 5.0625,
 4.4625,
 3.8874999999999997,
 3.387499999

### 8개씩 평균을 구하다보면 맨 위의 7행은 값이 없어서 0값 리스트로 대체

In [8]:
none_7h = [0,0,0,0,0,0,0]
celsius_8h = none_7h + celsius_8h

In [10]:
aws_seoul['8h_celsius'] = celsius_8h
aws_seoul.head(10)

Unnamed: 0,지점,일시,기온(°C),풍향(deg),풍속(m/s),강수량(mm),현지기압(hPa),해면기압(hPa),습도(%),일사(MJ/m^2),일조(hr),8h_celsius
0,400,2017-01-01 00:00,1.7,68.1,1.3,0.0,,,75.6,0.0,0.0,0.0
1,400,2017-01-01 01:00,1.4,69.1,1.3,0.0,,,77.8,0.0,0.0,0.0
2,400,2017-01-01 02:00,1.2,66.2,1.6,0.0,,,79.6,0.0,0.0,0.0
3,400,2017-01-01 03:00,0.5,66.0,1.5,0.0,,,84.4,0.0,0.0,0.0
4,400,2017-01-01 04:00,0.6,63.4,0.5,0.0,,,85.6,0.0,0.0,0.0
5,400,2017-01-01 05:00,0.4,63.2,0.8,0.0,,,85.4,0.0,0.0,0.0
6,400,2017-01-01 06:00,-0.2,68.5,1.6,0.0,,,89.2,0.0,0.0,0.0
7,400,2017-01-01 07:00,-0.1,78.6,0.5,0.0,,,89.4,0.0,0.0,0.6875
8,400,2017-01-01 08:00,0.1,67.8,1.1,0.0,,,87.6,0.0,0.0,0.4875
9,400,2017-01-01 09:00,0.5,70.0,1.7,0.0,,,88.6,0.0,0.0,0.375


## 2. 데이터 정제
### 1) 총 9개의 날씨 변수들의 8시간씩의 평균 값 구하기

In [11]:
windD_8h = []
windV_8h = []
precipitation_8h = []
spotAtoPressure_8h = []
seaAtoPressure_8h = []
humidity_8h = []
insolation_8h = []
sunshine_8h = []

# '풍향(deg)', '풍속(m/s)', '강수량(mm)', '현지기압(hPa)',
#        '해면기압(hPa)', '습도(%)', '일사(MJ/m^2)', '일조(hr)'

for i in range(len(aws_seoul)-7):
    val = aws_seoul['풍향(deg)'][i:i+8].mean()
    windD_8h.append(val)
    val = aws_seoul['풍속(m/s)'][i:i+8].mean()
    windV_8h.append(val)
    val = aws_seoul['강수량(mm)'][i:i+8].mean()
    precipitation_8h.append(val)
    val = aws_seoul['현지기압(hPa)'][i:i+8].mean()
    spotAtoPressure_8h.append(val)
    val = aws_seoul['해면기압(hPa)'][i:i+8].mean()
    seaAtoPressure_8h.append(val)
    val = aws_seoul['습도(%)'][i:i+8].mean()
    humidity_8h.append(val)
    val = aws_seoul['일사(MJ/m^2)'][i:i+8].mean()
    insolation_8h.append(val)
    val = aws_seoul['일조(hr)'][i:i+8].mean()
    sunshine_8h.append(val)

#앞의 7시간 0 추가
windD_8h = none_7h + windD_8h
windV_8h = none_7h + windV_8h
precipitation_8h = none_7h + precipitation_8h
spotAtoPressure_8h = none_7h + spotAtoPressure_8h
seaAtoPressure_8h = none_7h + seaAtoPressure_8h
humidity_8h = none_7h + humidity_8h
insolation_8h = none_7h + insolation_8h
sunshine_8h = none_7h + sunshine_8h

### 2) 평균값 구한 리스트를 aws_seoul 데이터프레임 열 추가

In [12]:
aws_seoul['8h_windD'] = windD_8h
aws_seoul['8h_windV'] = windV_8h
aws_seoul['8h_precipitaiton'] = precipitation_8h
aws_seoul['8h_spotAtoPressure'] = spotAtoPressure_8h
aws_seoul['8h_seaAtoPressure'] = seaAtoPressure_8h
aws_seoul['8h_humidity'] = humidity_8h
aws_seoul['8h_insolation'] = insolation_8h
aws_seoul['8h_sunshine'] = sunshine_8h

aws_seoul.head()

Unnamed: 0,지점,일시,기온(°C),풍향(deg),풍속(m/s),강수량(mm),현지기압(hPa),해면기압(hPa),습도(%),일사(MJ/m^2),일조(hr),8h_celsius,8h_windD,8h_windV,8h_precipitaiton,8h_spotAtoPressure,8h_seaAtoPressure,8h_humidity,8h_insolation,8h_sunshine
0,400,2017-01-01 00:00,1.7,68.1,1.3,0.0,,,75.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,400,2017-01-01 01:00,1.4,69.1,1.3,0.0,,,77.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,400,2017-01-01 02:00,1.2,66.2,1.6,0.0,,,79.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,400,2017-01-01 03:00,0.5,66.0,1.5,0.0,,,84.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,400,2017-01-01 04:00,0.6,63.4,0.5,0.0,,,85.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3) 필용한 8시간 평균 값의 데이터만 추출하여 aws_seoul_8hComp 데이터프레임 생성

In [13]:
aws_seoul_8hComp = aws_seoul[aws_seoul.columns[0:2].append(aws_seoul.columns[11:])]
aws_seoul_8hComp.tail()

Unnamed: 0,지점,일시,8h_celsius,8h_windD,8h_windV,8h_precipitaiton,8h_spotAtoPressure,8h_seaAtoPressure,8h_humidity,8h_insolation,8h_sunshine
244916,889,2017-12-31 20:00,2.525,301.4125,1.375,0.0,1021.6625,1023.8,36.575,0.0,0.0
244917,889,2017-12-31 21:00,2.05,301.8875,1.3625,0.0,1021.9375,1024.0875,36.4875,0.0,0.0
244918,889,2017-12-31 22:00,1.475,302.1625,1.2875,0.0,1022.325,1024.4875,37.725,0.0,0.0
244919,889,2017-12-31 23:00,0.825,290.7,1.175,0.0,1022.725,1024.9,39.95,0.0,0.0
244920,889,2018-01-01 00:00,0.3,286.85,1.125,0.0,1023.125,1025.3125,41.675,0.0,0.0


#####  'index=False'는 기존 index가 1 열로 들어가는 경우를 방지하기 위함

In [15]:
aws_seoul_8hComp.to_csv('aws_8hour_2017_seoul.csv',index=False)

### 4) 음주운전 교통사고데이터를 '강남구' 하나 불러오기
##### engine='python'은 os오류뜰때 해결하기 위해 씀

In [20]:
aws_8h_seoul = pd.read_csv('aws_8hour_2017_seoul.csv')
gangnam_achol_acc = pd.read_csv('accdata/강남구.csv', encoding='EUC-KR', engine='python')

### 5) '일시' 열의 글자 데이터 정제 - lambda 사용

In [19]:
aws_8h_seoul['일시'].head()

0    2017-01-01 00:00
1    2017-01-01 01:00
2    2017-01-01 02:00
3    2017-01-01 03:00
4    2017-01-01 04:00
Name: 일시, dtype: object

In [21]:
aws_8h_seoul['일시'] = aws_8h_seoul['일시'].map(lambda x: x.split(' ')[0].replace('-','') + x.split(' ')[1][:2])

##### 일시에서 string은 빠지고 숫자만 출력됨을 알 수 있음

In [23]:
aws_8h_seoul.head()

Unnamed: 0,지점,일시,8h_celsius,8h_windD,8h_windV,8h_precipitaiton,8h_spotAtoPressure,8h_seaAtoPressure,8h_humidity,8h_insolation,8h_sunshine
0,400,2017010100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,400,2017010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,400,2017010102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,400,2017010103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,400,2017010104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 6) 강남구 음주운전 교통사고의 데이터

In [22]:
gangnam_achol_acc.head()

Unnamed: 0,사고번호,발생일시,발생요일,발생시군구,사고내용,사망자수,중상자수,경상자수,부상신고자수,사고유형,...,기상상태,도로형태,가해운전자 차종,가해운전자 성별,가해운전자 연령,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도
0,2017010100100421,2017년 1월 1일 23시,일요일,서울특별시 강남구 역삼동,경상사고,0,0,1,0,차대사람 - 기타,...,맑음,단일로 - 기타,승용,여,40세,상해없음,보행자,남,26세,경상
1,2017010100100422,2017년 1월 1일 23시,일요일,서울특별시 강남구 개포동,중상사고,0,3,2,0,차대차 - 추돌,...,맑음,교차로 - 교차로부근,승용,남,53세,상해없음,승용,남,43세,중상
2,2017010200100021,2017년 1월 2일 00시,월요일,서울특별시 강남구 신사동,경상사고,0,0,1,0,차대차 - 기타,...,맑음,단일로 - 기타,승용,남,34세,상해없음,승용,남,43세,상해없음
3,2017010200100079,2017년 1월 2일 07시,월요일,서울특별시 강남구 수서동,경상사고,0,0,1,0,차대차 - 추돌,...,맑음,단일로 - 기타,승용,남,26세,상해없음,승용,남,44세,경상
4,2017010600100125,2017년 1월 6일 08시,금요일,서울특별시 강남구 도곡동,경상사고,0,0,2,0,차대차 - 추돌,...,맑음,단일로 - 기타,승용,남,23세,상해없음,승용,여,42세,경상


In [24]:
gangnam_achol_acc.columns

Index(['사고번호', '발생일시', '발생요일', '발생시군구', '사고내용', '사망자수', '중상자수', '경상자수',
       '부상신고자수', '사고유형', '법규위반', '노면상태', '기상상태', '도로형태', '가해운전자 차종',
       '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도', '피해운전자 차종', '피해운전자 성별',
       '피해운전자 연령', '피해운전자 상해정도'],
      dtype='object')

### 7) 교통사고데이터의 '발생 일시' 데이터에 년,월,일,시만 숫자 데이터 추출 

In [25]:
#from itertools import groupby

#숫자만 뽑아내 리스트 반환 함수
def digitList(string):
    l = [int(''.join(i)) for is_digit,i in groupby(string, str.isdigit) if is_digit]
    return l

gangnam_achol_acc['일시'] = gangnam_achol_acc['발생일시'].map(lambda x:
                              str(digitList(x)[0]) + '%02d' % digitList(x)[1] + 
                               '%02d' % digitList(x)[2] + '%02d' % digitList(x)[3])

In [26]:
gangnam_achol_acc['지점'] = 400
gangnam_achol_acc.head()

Unnamed: 0,사고번호,발생일시,발생요일,발생시군구,사고내용,사망자수,중상자수,경상자수,부상신고자수,사고유형,...,가해운전자 차종,가해운전자 성별,가해운전자 연령,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,일시,지점
0,2017010100100421,2017년 1월 1일 23시,일요일,서울특별시 강남구 역삼동,경상사고,0,0,1,0,차대사람 - 기타,...,승용,여,40세,상해없음,보행자,남,26세,경상,2017010123,400
1,2017010100100422,2017년 1월 1일 23시,일요일,서울특별시 강남구 개포동,중상사고,0,3,2,0,차대차 - 추돌,...,승용,남,53세,상해없음,승용,남,43세,중상,2017010123,400
2,2017010200100021,2017년 1월 2일 00시,월요일,서울특별시 강남구 신사동,경상사고,0,0,1,0,차대차 - 기타,...,승용,남,34세,상해없음,승용,남,43세,상해없음,2017010200,400
3,2017010200100079,2017년 1월 2일 07시,월요일,서울특별시 강남구 수서동,경상사고,0,0,1,0,차대차 - 추돌,...,승용,남,26세,상해없음,승용,남,44세,경상,2017010207,400
4,2017010600100125,2017년 1월 6일 08시,금요일,서울특별시 강남구 도곡동,경상사고,0,0,2,0,차대차 - 추돌,...,승용,남,23세,상해없음,승용,여,42세,경상,2017010608,400


### 8) 음주운전 교통사고에 필요한 데이터만 추출하여 temp 데이터 프레임 생성

In [27]:
temp = gangnam_achol_acc[['지점','일시','사고번호','발생요일','발생시군구','사고내용']]
temp.head()

Unnamed: 0,지점,일시,사고번호,발생요일,발생시군구,사고내용
0,400,2017010123,2017010100100421,일요일,서울특별시 강남구 역삼동,경상사고
1,400,2017010123,2017010100100422,일요일,서울특별시 강남구 개포동,중상사고
2,400,2017010200,2017010200100021,월요일,서울특별시 강남구 신사동,경상사고
3,400,2017010207,2017010200100079,월요일,서울특별시 강남구 수서동,경상사고
4,400,2017010608,2017010600100125,금요일,서울특별시 강남구 도곡동,경상사고


In [28]:
aws_8h_seoul.head()

Unnamed: 0,지점,일시,8h_celsius,8h_windD,8h_windV,8h_precipitaiton,8h_spotAtoPressure,8h_seaAtoPressure,8h_humidity,8h_insolation,8h_sunshine
0,400,2017010100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,400,2017010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,400,2017010102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,400,2017010103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,400,2017010104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3. 데이터 통합 

### 1) 강남구 교통사고 데이터와 8시간 평균 날씨 데이터 merge

In [29]:
aws_acci_seoul_2017 = pd.merge(aws_8h_seoul, temp, on=['지점','일시'], how='outer')
aws_acci_seoul_2017[3:100]

Unnamed: 0,지점,일시,8h_celsius,8h_windD,8h_windV,8h_precipitaiton,8h_spotAtoPressure,8h_seaAtoPressure,8h_humidity,8h_insolation,8h_sunshine,사고번호,발생요일,발생시군구,사고내용
3,400,2017010103,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0,,,,
4,400,2017010104,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0,,,,
5,400,2017010105,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0,,,,
6,400,2017010106,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0,,,,
7,400,2017010107,0.6875,67.8875,1.1375,0.0,,,83.3750,0.0,0.0,,,,
8,400,2017010108,0.4875,67.8500,1.1125,0.0,,,84.8750,0.0,0.0,,,,
9,400,2017010109,0.3750,67.9625,1.1625,0.0,,,86.2250,0.0,0.0,,,,
10,400,2017010110,0.3500,67.4500,1.0875,0.0,,,86.9125,0.0,0.0,,,,
11,400,2017010111,0.5625,65.9625,1.0625,0.0,,,86.4625,0.0,0.0,,,,
12,400,2017010112,0.9625,65.0750,1.1250,0.0,,,85.1500,0.0,0.0,,,,


### 2) 모든 구의 음주운전교통사고 데이터를 수집 작업 

#### - 날씨데이터(aws)에서 종로구 대신 북악산에, 동작구 대신 한강 칼럼값으로 기입되어있음
#### - IGNORE_INDEX를 TRUE를 안하게 되면 파일 마다 인덱스가 다시 1부터 시작 됨. 방지하기 위해 False를 씀

In [30]:
aws_number = ['강남 (400)', '강동 (402)', '강북 (424)', '강서 (404)', '관악 (509)',
              '광진 (413)', '구로 (423)', '금천 (417)', '기상청 (410)', '남현 (425)',
              '노원 (407)', '도봉 (406)','동대문 (408)', '마포 (411)', '종로 (422)',
              '북한산 (420)', '서대문 (412)', '서초 (401)', '성동 (421)', '성북 (414)',
              '송파 (403)', '양천 (405)', '영등포 (510)','용산 (415)', 
              '은평 (416)', '중구 (419)', '중랑 (409)', '동작 (418)', '현충원 (889)']


#북악산 -- 종로구
#한강 -- 동작구

temp = pd.DataFrame()

#import os
for number in aws_number:
#     print(number.split('(')[0].strip()+ '구.csv')
#     print(number.split('(')[1][:3])
    if os.path.exists('accdata/'+ number.split('(')[0].strip() + '구.csv'):
        df = pd.read_csv('accdata/'+ number.split('(')[0].strip() + '구.csv',
                         engine='python', encoding='EUC-KR')
        df['지점'] = number.split('(')[1][:3]
        df['일시'] = df['발생일시'].map(lambda x:
                              str(digitList(x)[0]) + '%02d' % digitList(x)[1] + 
                               '%02d' % digitList(x)[2] + '%02d' % digitList(x)[3])
        df = df[['지점','일시','사고번호','발생요일','발생시군구','사고내용']]
        temp = temp.append(df, ignore_index = True)

In [31]:
temp.dtypes

지점       object
일시       object
사고번호      int64
발생요일     object
발생시군구    object
사고내용     object
dtype: object

### 3) '지점일시'라는 새로운 열을 만들어 날씨 시간과 교통사고 시간의 동시점을 찾기위한 key로 활용
- 열끼리 글자를 합치기 위해 지점과 일시는 스트링 타입으로 먼저 선언함

In [32]:
aws_8h_seoul['지점'] = aws_8h_seoul['지점'].astype('str')
aws_8h_seoul['일시'] = aws_8h_seoul['일시'].astype('str')
aws_8h_seoul['지점일시'] = aws_8h_seoul['지점'] + aws_8h_seoul['일시']
aws_8h_seoul.columns

Index(['지점', '일시', '8h_celsius', '8h_windD', '8h_windV', '8h_precipitaiton',
       '8h_spotAtoPressure', '8h_seaAtoPressure', '8h_humidity',
       '8h_insolation', '8h_sunshine', '지점일시'],
      dtype='object')

### 4) 날씨데이터의 지점일시 열을 맨 앞으로 추가

In [33]:
aws_8h_seoul = aws_8h_seoul[['지점일시', '지점', '일시', '8h_celsius',
                                     '8h_windD', '8h_windV', '8h_precipitaiton',
                                     '8h_spotAtoPressure', '8h_seaAtoPressure',
                                    '8h_humidity','8h_insolation', '8h_sunshine']]
aws_8h_seoul.head()

Unnamed: 0,지점일시,지점,일시,8h_celsius,8h_windD,8h_windV,8h_precipitaiton,8h_spotAtoPressure,8h_seaAtoPressure,8h_humidity,8h_insolation,8h_sunshine
0,4002017010100,400,2017010100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4002017010101,400,2017010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4002017010102,400,2017010102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4002017010103,400,2017010103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4002017010104,400,2017010104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 5) 서울시 전체 구를 대상으로 날씨데이터와 음주운전교통사고 데이터의 merge 

In [34]:
aws_acci_seoul_2017 = pd.merge(aws_8h_seoul, temp, how='outer')
aws_acci_seoul_2017[3:100]

Unnamed: 0,지점일시,지점,일시,8h_celsius,8h_windD,8h_windV,8h_precipitaiton,8h_spotAtoPressure,8h_seaAtoPressure,8h_humidity,8h_insolation,8h_sunshine,사고번호,발생요일,발생시군구,사고내용
3,4002017010103,400,2017010103,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0,,,,
4,4002017010104,400,2017010104,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0,,,,
5,4002017010105,400,2017010105,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0,,,,
6,4002017010106,400,2017010106,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0,,,,
7,4002017010107,400,2017010107,0.6875,67.8875,1.1375,0.0,,,83.3750,0.0,0.0,,,,
8,4002017010108,400,2017010108,0.4875,67.8500,1.1125,0.0,,,84.8750,0.0,0.0,,,,
9,4002017010109,400,2017010109,0.3750,67.9625,1.1625,0.0,,,86.2250,0.0,0.0,,,,
10,4002017010110,400,2017010110,0.3500,67.4500,1.0875,0.0,,,86.9125,0.0,0.0,,,,
11,4002017010111,400,2017010111,0.5625,65.9625,1.0625,0.0,,,86.4625,0.0,0.0,,,,
12,4002017010112,400,2017010112,0.9625,65.0750,1.1250,0.0,,,85.1500,0.0,0.0,,,,


### 6) 음주사고가 발생한 건수는 2825개

In [35]:
aws_acci_seoul_2017['음주사고발생'] = aws_acci_seoul_2017['사고번호'].notnull()
len(aws_acci_seoul_2017[aws_acci_seoul_2017['음주사고발생']==True])

2825

## 4. 데이터 정제 작업 끝
- 정제된 데이터 저장

In [36]:
aws_acci_seoul_2017.to_csv('aws_acci_seoul_2017.csv',index=False)