In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

'''
[Step1] 데이터 준비 - read_csv() 함수로 자동차 연비 데이터셋 가져오기
'''

In [3]:
# Read the CSV file into a DataFrame; header=None tells pandas the file has no header row
df = pd.read_csv('./auto-mpg.csv', header=None)

# Assign the column names.
# NOTE(review): 'horsement' looks like a typo for 'horsepower'; it is kept as-is
# because a later cell looks the column up by this exact name.
df.columns = [
    'mpg',
    'cylinders',
    'displacement',
    'horsement',
    'weight',
    'acceleration',
    'model year',
    'origin',
    'name',
]

In [4]:
# Take a first look at the data: print the first five rows followed by two blank lines
print(df.head(), '\n', sep='\n')

    mpg  cylinders  displacement horsement  weight  acceleration  model year  \
0  18.0          8         307.0     130.0  3504.0          12.0          70   
1  15.0          8         350.0     165.0  3693.0          11.5          70   
2  18.0          8         318.0     150.0  3436.0          11.0          70   
3  16.0          8         304.0     150.0  3433.0          12.0          70   
4  17.0          8         302.0     140.0  3449.0          10.5          70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  




In [5]:
# IPython display setting: set the limit on how many columns pandas prints to 10.
# NOTE(review): the recorded output of this cell still wraps the frame into two
# chunks, so the wrapping is presumably governed by display.width - confirm.
pd.options.display.max_columns = 10
print(df.head())

    mpg  cylinders  displacement horsement  weight  acceleration  model year  \
0  18.0          8         307.0     130.0  3504.0          12.0          70   
1  15.0          8         350.0     165.0  3693.0          11.5          70   
2  18.0          8         318.0     150.0  3436.0          11.0          70   
3  16.0          8         304.0     150.0  3433.0          12.0          70   
4  17.0          8         302.0     140.0  3449.0          10.5          70   

   origin                       name  
0       1  chevrolet chevelle malibu  
1       1          buick skylark 320  
2       1         plymouth satellite  
3       1              amc rebel sst  
4       1                ford torino  


* Step2 데이터 탐색

In [6]:
# Check the dtype and non-null count of every column.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print('\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsement     398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
None




In [7]:
# Print the summary statistics (count, mean, std, min, quartiles, max) that
# describe() reports for the frame
numeric_summary = df.describe()
print(numeric_summary)

              mpg   cylinders  displacement       weight  acceleration  \
count  398.000000  398.000000    398.000000   398.000000    398.000000   
mean    23.514573    5.454774    193.425879  2970.424623     15.568090   
std      7.815984    1.701004    104.269838   846.841774      2.757689   
min      9.000000    3.000000     68.000000  1613.000000      8.000000   
25%     17.500000    4.000000    104.250000  2223.750000     13.825000   
50%     23.000000    4.000000    148.500000  2803.500000     15.500000   
75%     29.000000    8.000000    262.000000  3608.000000     17.175000   
max     46.600000    8.000000    455.000000  5140.000000     24.800000   

       model year      origin  
count  398.000000  398.000000  
mean    76.010050    1.572864  
std      3.697627    0.802055  
min     70.000000    1.000000  
25%     73.000000    1.000000  
50%     76.000000    1.000000  
75%     79.000000    2.000000  
max     82.000000    3.000000  


In [10]:
# Convert the horsepower column (stored under the name 'horsement') from strings to numbers
print(df['horsement'].unique())        # inspect the distinct raw values of the column
print('\n')

# Replace the '?' placeholder with NaN and assign the result back to the frame.
# Calling .replace(..., inplace=True) on df['horsement'] is a chained in-place
# mutation: it emits a FutureWarning in pandas 2.x and leaves df untouched under
# Copy-on-Write, which would make the astype('float') below fail on '?'.
df['horsement'] = df['horsement'].replace('?', np.nan)
df.dropna(subset=['horsement'], axis=0, inplace=True)   # drop the rows whose horsepower is missing
df['horsement'] = df['horsement'].astype('float')       # convert the strings to floats

print(df.describe())

['130.0' '165.0' '150.0' '140.0' '198.0' '220.0' '215.0' '225.0' '190.0'
 '170.0' '160.0' '95.00' '97.00' '85.00' '88.00' '46.00' '87.00' '90.00'
 '113.0' '200.0' '210.0' '193.0' '?' '100.0' '105.0' '175.0' '153.0'
 '180.0' '110.0' '72.00' '86.00' '70.00' '76.00' '65.00' '69.00' '60.00'
 '80.00' '54.00' '208.0' '155.0' '112.0' '92.00' '145.0' '137.0' '158.0'
 '167.0' '94.00' '107.0' '230.0' '49.00' '75.00' '91.00' '122.0' '67.00'
 '83.00' '78.00' '52.00' '61.00' '93.00' '148.0' '129.0' '96.00' '71.00'
 '98.00' '115.0' '53.00' '81.00' '79.00' '120.0' '152.0' '102.0' '108.0'
 '68.00' '58.00' '149.0' '89.00' '63.00' '48.00' '66.00' '139.0' '103.0'
 '125.0' '133.0' '138.0' '135.0' '142.0' '77.00' '62.00' '132.0' '84.00'
 '64.00' '74.00' '116.0' '82.00']


              mpg   cylinders  displacement   horsement       weight  \
count  392.000000  392.000000    392.000000  392.000000   392.000000   
mean    23.445918    5.471939    194.411990  104.469388  2977.584184   
std      7.805007    1