## 다양한 메소드 사용해보기

In [2]:
import pandas as pd

# Read the tab-separated Gapminder table into a DataFrame.
df = pd.read_csv(
    '../data/gapminder.tsv',
    delimiter='\t',  # `delimiter` is pandas' alias for `sep`
)

### 1. append (참고: DataFrame.append는 pandas 2.0에서 제거되었으므로 최신 버전에서는 pd.concat을 사용한다.)

In [3]:
# Print the plain-text representation of the whole DataFrame
# (pandas elides the middle rows of a long frame with "...").
print(df)

          country continent  year  lifeExp       pop   gdpPercap
0     Afghanistan      Asia  1952   28.801   8425333  779.445314
1     Afghanistan      Asia  1957   30.332   9240934  820.853030
2     Afghanistan      Asia  1962   31.997  10267083  853.100710
3     Afghanistan      Asia  1967   34.020  11537966  836.197138
4     Afghanistan      Asia  1972   36.088  13079460  739.981106
...           ...       ...   ...      ...       ...         ...
1699     Zimbabwe    Africa  1987   62.351   9216418  706.157306
1700     Zimbabwe    Africa  1992   60.377  10704340  693.420786
1701     Zimbabwe    Africa  1997   46.809  11404948  792.449960
1702     Zimbabwe    Africa  2002   39.989  11926563  672.038623
1703     Zimbabwe    Africa  2007   43.487  12311143  469.709298

[1704 rows x 6 columns]


In [7]:
# Element-wise test: is each row's continent equal to 'Asia'?
print(df['continent'].eq('Asia'))

# Boolean extraction: pass the per-row True/False mask to .loc so that only
# the rows marked True are kept.
df_asia = df.loc[df['continent'].eq('Asia')]
# The recorded run reports 396 matching rows.
print(df_asia.shape)

# Same extraction for the African rows.
df_africa = df.loc[df['continent'].eq('Africa')]
print(df_africa.shape)

0        True
1        True
2        True
3        True
4        True
        ...  
1699    False
1700    False
1701    False
1702    False
1703    False
Name: continent, Length: 1704, dtype: bool
(396, 6)
(624, 6)


In [8]:
# Stack the Africa-only rows beneath the Asia-only rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, with its defaults, keeps each frame's original index
# labels just as append did.
df_aa = pd.concat([df_asia, df_africa])
df_aa

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.853030
2,Afghanistan,Asia,1962,31.997,10267083,853.100710
3,Afghanistan,Asia,1967,34.020,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106
...,...,...,...,...,...,...
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.449960
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623


### 2. describe

In [9]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB
None


In [10]:
# Summary statistics for the numeric (integer/float) columns only.
# Select the columns by dtype instead of by position so the cell does not
# depend on the numeric columns happening to start at column index 2.
df_nums = df.select_dtypes(include='number')
# Equivalent to df.describe(), which summarizes only numeric columns by default.
df_nums.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081
std,17.26533,12.917107,106157900.0,9857.454543
min,1952.0,23.599,60011.0,241.165877
25%,1965.75,48.198,2793664.0,1202.060309
50%,1979.5,60.7125,7023596.0,3531.846989
75%,1993.25,70.8455,19585220.0,9325.462346
max,2007.0,82.603,1318683000.0,113523.1329


### 3. drop_duplicates

In [13]:
# Keep the country column as a Series and preview its first 20 entries.
cnt = df['country']
print(cnt.head(20))

0     Afghanistan
1     Afghanistan
2     Afghanistan
3     Afghanistan
4     Afghanistan
5     Afghanistan
6     Afghanistan
7     Afghanistan
8     Afghanistan
9     Afghanistan
10    Afghanistan
11    Afghanistan
12        Albania
13        Albania
14        Albania
15        Albania
16        Albania
17        Albania
18        Albania
19        Albania
Name: country, dtype: object


In [14]:
# Drop repeated country names, keeping the first occurrence of each
# (keep='first' is the default, spelled out here for clarity).
cnt_list = cnt.drop_duplicates(keep='first')
cnt_list

0              Afghanistan
12                 Albania
24                 Algeria
36                  Angola
48               Argentina
               ...        
1644               Vietnam
1656    West Bank and Gaza
1668           Yemen, Rep.
1680                Zambia
1692              Zimbabwe
Name: country, Length: 142, dtype: object

### 4. equals (시리즈/데이터프레임이 동일한 값을 가지고 있는지 확인)

In [18]:
# Compare the de-duplicated Series first with itself, then with the full
# country column; equals() reports whether both hold the same elements.
for other in (cnt_list, cnt):
    print(cnt_list.equals(other))

True
False


### 5. values

In [23]:
# Print the underlying array of values for each Series.
for series in (cnt_list, df['pop']):
    print(series.values)

['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Australia'
 'Austria' 'Bahrain' 'Bangladesh' 'Belgium' 'Benin' 'Bolivia'
 'Bosnia and Herzegovina' 'Botswana' 'Brazil' 'Bulgaria' 'Burkina Faso'
 'Burundi' 'Cambodia' 'Cameroon' 'Canada' 'Central African Republic'
 'Chad' 'Chile' 'China' 'Colombia' 'Comoros' 'Congo, Dem. Rep.'
 'Congo, Rep.' 'Costa Rica' "Cote d'Ivoire" 'Croatia' 'Cuba'
 'Czech Republic' 'Denmark' 'Djibouti' 'Dominican Republic' 'Ecuador'
 'Egypt' 'El Salvador' 'Equatorial Guinea' 'Eritrea' 'Ethiopia' 'Finland'
 'France' 'Gabon' 'Gambia' 'Germany' 'Ghana' 'Greece' 'Guatemala' 'Guinea'
 'Guinea-Bissau' 'Haiti' 'Honduras' 'Hong Kong, China' 'Hungary' 'Iceland'
 'India' 'Indonesia' 'Iran' 'Iraq' 'Ireland' 'Israel' 'Italy' 'Jamaica'
 'Japan' 'Jordan' 'Kenya' 'Korea, Dem. Rep.' 'Korea, Rep.' 'Kuwait'
 'Lebanon' 'Lesotho' 'Liberia' 'Libya' 'Madagascar' 'Malawi' 'Malaysia'
 'Mali' 'Mauritania' 'Mauritius' 'Mexico' 'Mongolia' 'Montenegro'
 'Morocco' 'Mozambique' 'Myanmar

### 6. isin

In [24]:
# isin() tests, row by row, whether the value in the country column is one
# of the values supplied as the argument.
targets = ['Togo', 'Zambia']

ft = df['country'].isin(targets)
print(ft)

0       False
1       False
2       False
3       False
4       False
        ...  
1699    False
1700    False
1701    False
1702    False
1703    False
Name: country, Length: 1704, dtype: bool


In [28]:
# The boolean result of isin() can be used directly as a row mask.
df.loc[ft]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1536,Togo,Africa,1952,38.596,1219113,859.808657
1537,Togo,Africa,1957,41.208,1357445,925.90832
1538,Togo,Africa,1962,43.922,1528098,1067.53481
1539,Togo,Africa,1967,46.769,1735550,1477.59676
1540,Togo,Africa,1972,49.759,2056351,1649.660188
1541,Togo,Africa,1977,52.887,2308582,1532.776998
1542,Togo,Africa,1982,55.471,2644765,1344.577953
1543,Togo,Africa,1987,56.941,3154264,1202.201361
1544,Togo,Africa,1992,58.061,3747553,1034.298904
1545,Togo,Africa,1997,58.39,4320890,982.286924


#### 추가정보: pandas에서는 시리즈/데이터프레임의 조건식을 결합할 때 or, and, not 대신 |, &, ~를 사용한다. 이 연산자들은 ==, < 같은 비교 연산자보다 우선순위가 높으므로 각 조건을 반드시 괄호로 감싸야 한다.

In [35]:
# Reproduce the isin() result above by OR-ing two equality masks.
country_col = df['country']
togo_or_zambia = (country_col == 'Togo') | (country_col == 'Zambia')
print(df.loc[togo_or_zambia, :])
print("\n")
# Further example: chain several alternatives with |.
# NOTE(review): the country list printed earlier shows 'Korea, Rep.' rather
# than any of these spellings, so this may match no rows -- confirm.
korea_any_spelling = (
    (country_col == 'Korea')
    | (country_col == 'South Korea')
    | (country_col == 'Republic of Korea')
)
print(df.loc[korea_any_spelling, :])

     country continent  year  lifeExp       pop    gdpPercap
1536    Togo    Africa  1952   38.596   1219113   859.808657
1537    Togo    Africa  1957   41.208   1357445   925.908320
1538    Togo    Africa  1962   43.922   1528098  1067.534810
1539    Togo    Africa  1967   46.769   1735550  1477.596760
1540    Togo    Africa  1972   49.759   2056351  1649.660188
1541    Togo    Africa  1977   52.887   2308582  1532.776998
1542    Togo    Africa  1982   55.471   2644765  1344.577953
1543    Togo    Africa  1987   56.941   3154264  1202.201361
1544    Togo    Africa  1992   58.061   3747553  1034.298904
1545    Togo    Africa  1997   58.390   4320890   982.286924
1546    Togo    Africa  2002   57.561   4977378   886.220577
1547    Togo    Africa  2007   58.420   5701579   882.969944
1680  Zambia    Africa  1952   42.038   2672000  1147.388831
1681  Zambia    Africa  1957   44.077   3016000  1311.956766
1682  Zambia    Africa  1962   46.023   3421000  1452.725766
1683  Zambia    Africa  

### 7. replace (주의: in-place modification이 아니다. 즉, 수정된 새로운 데이터 프레임을 가져온다.)

In [36]:
# Boolean extraction of the rows whose country is 'Korea, Rep.'.
df.loc[df['country'].eq('Korea, Rep.')]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
840,"Korea, Rep.",Asia,1952,47.453,20947571,1030.592226
841,"Korea, Rep.",Asia,1957,52.681,22611552,1487.593537
842,"Korea, Rep.",Asia,1962,55.292,26420307,1536.344387
843,"Korea, Rep.",Asia,1967,57.716,30131000,2029.228142
844,"Korea, Rep.",Asia,1972,62.612,33505000,3030.87665
845,"Korea, Rep.",Asia,1977,64.766,36436000,4657.22102
846,"Korea, Rep.",Asia,1982,67.123,39326000,5622.942464
847,"Korea, Rep.",Asia,1987,69.81,41622000,8533.088805
848,"Korea, Rep.",Asia,1992,72.244,43805450,12104.27872
849,"Korea, Rep.",Asia,1997,74.647,46173816,15993.52796


In [37]:
# replace() is applied to the whole DataFrame (not to the boolean-extracted
# rows) and returns a new frame in which every 'Korea, Rep.' value has become
# 'South Korea'; df itself is left unchanged.
df_new = df.replace('Korea, Rep.', 'South Korea')
# Display the renamed rows by label instead of the hard-coded positions
# 840:852, so the cell still shows the right rows if the row order changes.
df_new.loc[df_new['country'] == 'South Korea', :]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
840,South Korea,Asia,1952,47.453,20947571,1030.592226
841,South Korea,Asia,1957,52.681,22611552,1487.593537
842,South Korea,Asia,1962,55.292,26420307,1536.344387
843,South Korea,Asia,1967,57.716,30131000,2029.228142
844,South Korea,Asia,1972,62.612,33505000,3030.87665
845,South Korea,Asia,1977,64.766,36436000,4657.22102
846,South Korea,Asia,1982,67.123,39326000,5622.942464
847,South Korea,Asia,1987,69.81,41622000,8533.088805
848,South Korea,Asia,1992,72.244,43805450,12104.27872
849,South Korea,Asia,1997,74.647,46173816,15993.52796


### 8. sample (무작위추출)

In [38]:
# Draw 10 population values at random. A fixed random_state makes the draw
# repeatable, so re-running the notebook shows the same sample each time.
df['pop'].sample(n=10, random_state=42)

293     943455000
1225     28235346
724      30614000
1254      3279001
1306       170372
1198      5884491
864       1439529
1428      7982342
1393      2780415
78        7574613
Name: pop, dtype: int64

### 9. sort_values (by=정렬 기준, axis=0이면 행을, axis=1이면 열을 정렬)

In [39]:
# Extract the 2007 rows so they can be sorted by population afterwards.
# info() prints its own report and returns None, so it is called directly
# rather than through print(), which would add a stray "None" line.
df.info()
data_2007 = df.loc[df['year'] == 2007, :]
data_2007

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB
None


Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
11,Afghanistan,Asia,2007,43.828,31889923,974.580338
23,Albania,Europe,2007,76.423,3600523,5937.029526
35,Algeria,Africa,2007,72.301,33333216,6223.367465
47,Angola,Africa,2007,42.731,12420476,4797.231267
59,Argentina,Americas,2007,75.320,40301927,12779.379640
...,...,...,...,...,...,...
1655,Vietnam,Asia,2007,74.249,85262356,2441.576404
1667,West Bank and Gaza,Asia,2007,73.422,4018332,3025.349798
1679,"Yemen, Rep.",Asia,2007,62.698,22211743,2280.769906
1691,Zambia,Africa,2007,42.384,11746035,1271.211593


In [40]:
# Order the extracted 2007 rows by population (ascending by default).
# axis=0 sorts the rows; axis=1 would sort the columns instead.
data_2007.sort_values(by='pop', axis=0)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1307,Sao Tome and Principe,Africa,2007,65.528,199579,1598.435089
695,Iceland,Europe,2007,81.757,301931,36180.789190
431,Djibouti,Africa,2007,54.791,496374,2082.481567
491,Equatorial Guinea,Africa,2007,51.579,551201,12154.089750
1019,Montenegro,Europe,2007,74.543,684736,9253.896111
...,...,...,...,...,...,...
179,Brazil,Americas,2007,72.390,190010647,9065.800825
719,Indonesia,Asia,2007,70.650,223547000,3540.651564
1619,United States,Americas,2007,78.242,301139947,42951.653090
707,India,Asia,2007,64.698,1110396331,2452.210407
