# 데이터 프레임 다루기

# 데이터 프레임 불러오기

### csv : 콤마(,)를 기준으로 데이터를 구분함
### tsv : 탭(Tab)을 기준으로 데이터를 구분함

In [2]:
import pandas as pd

df = pd.read_csv("gapminder.tsv", sep = '\t')    # Read "gapminder.tsv", splitting fields on the tab character ('\t' = Tab)
df    # Bare expression so the notebook displays the loaded frame

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


# 열에 접근하기

In [4]:
df['country']    # A single column label in [] selects that one column

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [5]:
df[['country', 'year']]    # To select two or more columns, pass the labels as a list (hence the extra pair of brackets)

Unnamed: 0,country,year
0,Afghanistan,1952
1,Afghanistan,1957
2,Afghanistan,1962
3,Afghanistan,1967
4,Afghanistan,1972
...,...,...
1699,Zimbabwe,1987
1700,Zimbabwe,1992
1701,Zimbabwe,1997
1702,Zimbabwe,2002


# 행에 접근하기

In [7]:
df.loc[0]                                           # Get the row whose index label is 0

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

In [8]:
df.iloc[0]                                          # Get the row at integer position 0 (the first row)

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

In [11]:
df.loc[[0, 10, 100, 1000], ['country', 'year']]     # Rows labeled 0, 10, 100, 1000 and the columns labeled 'country' and 'year'

Unnamed: 0,country,year
0,Afghanistan,1952
10,Afghanistan,2002
100,Bangladesh,1972
1000,Mongolia,1972


In [13]:
df.iloc[[0, 10, 100, 1000], [0, 2]]                 # Rows at positions 0, 10, 100, 1000 and the columns at positions 0 and 2

Unnamed: 0,country,year
0,Afghanistan,1952
10,Afghanistan,2002
100,Bangladesh,1972
1000,Mongolia,1972


# 원하는 행만 추출하기

In [16]:
# Each comparison is parenthesized because & binds more tightly than ==.
df[(df['year'] == 1952) & (df['continent'] == 'Asia')]    # A & B keeps rows meeting both conditions; A | B keeps rows meeting at least one

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
84,Bahrain,Asia,1952,50.939,120447,9867.084765
96,Bangladesh,Asia,1952,37.484,46886859,684.244172
216,Cambodia,Asia,1952,39.417,4693836,368.469286
288,China,Asia,1952,44.0,556263527,400.448611
660,"Hong Kong, China",Asia,1952,60.96,2125900,3054.421209
696,India,Asia,1952,37.373,372000000,546.565749
708,Indonesia,Asia,1952,37.468,82052000,749.681655
720,Iran,Asia,1952,44.869,17272000,3035.326002
732,Iraq,Asia,1952,45.32,5441766,4129.766056


# 데이터 프레임 기본 기능

In [19]:
df.head()    # First rows of the frame; with no argument, five rows are shown

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [20]:
df.head(n = 10)    # First 10 rows

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
5,Afghanistan,Asia,1977,38.438,14880372,786.11336
6,Afghanistan,Asia,1982,39.854,12881816,978.011439
7,Afghanistan,Asia,1987,40.822,13867957,852.395945
8,Afghanistan,Asia,1992,41.674,16317921,649.341395
9,Afghanistan,Asia,1997,41.763,22227415,635.341351


In [21]:
df.tail()    # Last rows of the frame; with no argument, five rows are shown

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [22]:
df.tail(n = 7)    # Last 7 rows

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1697,Zimbabwe,Africa,1977,57.674,6642107,685.587682
1698,Zimbabwe,Africa,1982,60.363,7636524,788.855041
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


In [23]:
df.shape    # (number of rows, number of columns)

(1704, 6)

In [24]:
df.columns    # Column labels of the frame

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

In [25]:
df.dtypes    # Per-column dtypes: object is used for the string columns here; int64 is a 64-bit integer

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [26]:
df.info()    # Every column reports 1704 non-null values out of 1704 entries, so nothing is missing; a smaller non-null count would indicate missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


# gapminder.tsv 파일을 판다스를 통해 데이터프레임으로 가져오고, 연도(year)가 1957년인 행만 추출하고 인증하기

In [27]:
import pandas as pd

# Load the tab-separated gapminder file into a DataFrame.
df = pd.read_csv("gapminder.tsv", sep = "\t")

In [32]:
df[df['year'] == 1957]    # Keep only the rows whose 'year' column equals 1957

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
13,Albania,Europe,1957,59.280,1476505,1942.284244
25,Algeria,Africa,1957,45.685,10270856,3013.976023
37,Angola,Africa,1957,31.999,4561361,3827.940465
49,Argentina,Americas,1957,64.399,19610538,6856.856212
...,...,...,...,...,...,...
1645,Vietnam,Asia,1957,42.887,28998543,676.285448
1657,West Bank and Gaza,Asia,1957,45.671,1070439,1827.067742
1669,"Yemen, Rep.",Asia,1957,33.970,5498090,804.830455
1681,Zambia,Africa,1957,44.077,3016000,1311.956766
