### SETUP

In [None]:
import pandas as pd

# Loading and Exploring Data

## Simple Exploring Data

We will use the `cities_world` dataset again, so we load it once more here.

In [None]:
# Location of the tab-separated cities dataset.
cities_url = 'https://raw.githubusercontent.com/FTDS-learning-materials/phase-0/main/src/cities_world.tsv'
df = pd.read_csv(cities_url, sep='\t')

**Show First and Last 5 rows**

In [None]:
# Preview the top of the DataFrame; called without an argument, head() shows 5 rows.
df.head()

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
3,Seoul/Incheon,37.5665,126.978,South Korea,17500000,1049,16700,4
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5


In [None]:
# Preview the bottom of the DataFrame; called without an argument, tail() shows the last 5 rows.
df.tail()

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
120,Copenhagen,55.6761,12.5683,Denmark,1525000,816,1850,121
121,Brisbane,-27.4698,153.0251,Australia,1508000,1603,950,122
122,Riverside/San Bernardino,33.9533,-117.3962,USA,1507000,1136,1350,123
123,Cincinnati,39.1031,-84.512,USA,1503000,1740,850,124
124,Accra,5.6037,-0.187,Ghana,1500000,453,3300,125


**Show number of rows and columns in a DataFrame**

The output will be a tuple: `(n_rows, n_cols)`.

In [None]:
# `shape` is an attribute (no parentheses) holding (n_rows, n_cols).
df.shape

(125, 8)

**Show DataFrame Summary**

In [None]:
# Prints the index range, each column's non-null count and dtype, and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        125 non-null    object 
 1   Latitude    125 non-null    float64
 2   Longitude   125 non-null    float64
 3   Country     125 non-null    object 
 4   Population  125 non-null    int64  
 5   Land_area   125 non-null    int64  
 6   Density     125 non-null    int64  
 7   Number      125 non-null    int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 7.9+ KB


**Show descriptive statistics of the numerical columns**


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); only the numeric columns appear in the result.
df.describe()

Unnamed: 0,Latitude,Longitude,Population,Land_area,Density,Number
count,125.0,125.0,125.0,125.0,125.0,125.0
mean,25.005552,5.56733,5283232.0,1399.624,5403.6,63.0
std,25.394396,80.124674,4855835.0,1340.851906,4935.181371,36.228442
min,-37.8136,-123.1207,1500000.0,376.0,700.0,1.0
25%,14.5995,-74.0059,2100000.0,583.0,2250.0,32.0
50%,33.749,12.4964,3502000.0,816.0,3800.0,63.0
75%,41.8057,67.0099,6000000.0,1740.0,7700.0,94.0
max,59.9343,153.0251,33200000.0,8683.0,29650.0,125.0


**Show Column List**

In [None]:
# Column labels of the DataFrame, returned as a pandas Index.
df.columns

Index(['City', 'Latitude', 'Longitude', 'Country', 'Population', 'Land_area',
       'Density', 'Number'],
      dtype='object')

**Show Index List**

In [None]:
# Row labels of the DataFrame; at this point a RangeIndex running from 0 to 124.
df.index

RangeIndex(start=0, stop=125, step=1)

**Show Data Types of Each Column**

In [None]:
# Data type of every column; the text columns (City, Country) are reported as `object`.
df.dtypes

City           object
Latitude      float64
Longitude     float64
Country        object
Population      int64
Land_area       int64
Density         int64
Number          int64
dtype: object

**Show Number of Unique Values in Each Column**

In [None]:
# Number of distinct values found in each column.
df.nunique()

City          125
Latitude      125
Longitude     125
Country        52
Population    109
Land_area      90
Density        96
Number        125
dtype: int64

**Show Frequency Count of Unique Values in a Certain Column**

In [None]:
# Occurrences of each distinct country, listed from most to least frequent.
df['Country'].value_counts()

USA             26
Brazil           9
Germany          6
India            6
China            6
Japan            5
South Africa     4
Saudi Arabia     3
UK               3
Italy            3
Canada           3
Mexico           3
Australia        3
Poland           2
Spain            2
Turkey           2
Pakistan         2
Russia           2
Taiwan           2
Puerto Rico      1
Israel           1
Denmark          1
Portugal         1
Greece           1
Austria          1
Belgium          1
Azerbaijan       1
UAE              1
Singapore        1
Lebanon          1
Hungary          1
Zimbabwe         1
Kuwait           1
Uzbekistan       1
Congo            1
Sudan            1
France           1
South Korea      1
Philippines      1
Indonesia        1
Nigeria          1
Egypt            1
Argentina        1
Iran             1
Malaysia         1
Colombia         1
Peru             1
Thailand         1
Iraq             1
Chile            1
Vietnam          1
Ghana            1
Name: Countr

**Show Unique Values in a Certain Column**

In [None]:
# Distinct city names, returned as an array.
df['City'].unique()

array(['Tokyo/Yokohama', 'New York Metro', 'Sao Paulo', 'Seoul/Incheon',
       'Mexico City', 'Osaka/Kobe/Kyoto', 'Manila', 'Mumbai', 'Delhi',
       'Jakarta', 'Lagos', 'Kolkata', 'Cairo', 'Los Angeles',
       'Buenos Aires', 'Rio de Janeiro', 'Moscow', 'Shanghai', 'Karachi',
       'Paris', 'Istanbul', 'Nagoya', 'Beijing', 'Chicago', 'London',
       'Shenzhen', 'Essen/DГјsseldorf', 'Tehran', 'Bogota', 'Lima',
       'Bangkok', 'Johannesburg/East Rand', 'Chennai', 'Taipei',
       'Baghdad', 'Santiago', 'Bangalore', 'Hyderabad', 'St Petersburg',
       'Philadelphia', 'Lahore', 'Kinshasa', 'Miami', 'Ho Chi Minh City',
       'Madrid', 'Tianjin', 'Kuala Lumpur', 'Toronto', 'Milan',
       'Shenyang', 'Dallas/Fort Worth', 'Boston', 'Belo Horizonte',
       'Khartoum', 'Riyadh', 'Singapore', 'Washington', 'Detroit',
       'Barcelona', 'Houston', 'Athens', 'Berlin', 'Sydney', 'Atlanta',
       'Guadalajara', 'San Francisco/Oakland', 'Montreal', 'Monterey',
       'Melbourne', 'Ankara'

# Basic Data Manipulation

## Query/Filtering

### Boolean Indexing

#### Single Condition

Retrieve the rows with a population of more than 15 million people:

In [None]:
# Build the boolean mask first, then use it to select the matching rows.
over_15_million = df['Population'] > 15000000
df[over_15_million]

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
3,Seoul/Incheon,37.5665,126.978,South Korea,17500000,1049,16700,4
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5
5,Osaka/Kobe/Kyoto,34.6937,135.5022,Japan,16425000,2564,6400,6


#### Multiple Conditions

Cities in the Southern Hemisphere with a population of more than 10 million:

In [None]:
# Each condition becomes its own boolean mask; `&` combines them element-wise.
over_10_million = df['Population'] > 10000000
in_southern_hemisphere = df['Latitude'] < 0
df[over_10_million & in_southern_hemisphere]

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
9,Jakarta,-6.1745,106.8227,Indonesia,14250000,1360,10500,10
14,Buenos Aires,-34.6037,-58.3816,Argentina,11200000,2266,4950,15
15,Rio de Janeiro,-22.9068,-43.1729,Brazil,10800000,1580,6850,16


### Query Method

In [None]:
# The filter is written as an expression string; column names are used directly inside it.
df.query('Population > 15000000')

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
3,Seoul/Incheon,37.5665,126.978,South Korea,17500000,1049,16700,4
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5
5,Osaka/Kobe/Kyoto,34.6937,135.5022,Japan,16425000,2564,6400,6


In [None]:
# Inside the query string, several conditions can be joined with the keyword `and`.
df.query('Population > 10000000 and Latitude < 0')

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
9,Jakarta,-6.1745,106.8227,Indonesia,14250000,1360,10500,10
14,Buenos Aires,-34.6037,-58.3816,Argentina,11200000,2266,4950,15
15,Rio de Janeiro,-22.9068,-43.1729,Brazil,10800000,1580,6850,16


### isin Method

Retrieve only the cities in Argentina, Brazil, and Chile:

In [None]:
# Keep the rows whose Country is one of the listed values.
selected_countries = ['Argentina','Brazil','Chile']
country_matches = df['Country'].isin(selected_countries)
df[country_matches]

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
14,Buenos Aires,-34.6037,-58.3816,Argentina,11200000,2266,4950,15
15,Rio de Janeiro,-22.9068,-43.1729,Brazil,10800000,1580,6850,16
35,Santiago,-33.4489,-70.6693,Chile,5425000,648,8400,36
52,Belo Horizonte,-19.9245,-43.9352,Brazil,4000000,868,4600,53
70,Recife,-8.0476,-34.877,Brazil,3025000,376,8050,71
73,Porto Alegre,-30.0347,-51.2177,Brazil,2800000,583,4800,74
79,Fortaleza,-3.7319,-38.5267,Brazil,2650000,583,4550,80
80,Curitiba,-25.4244,-49.2654,Brazil,2500000,648,3850,81
110,Campinas,-22.9099,-47.0626,Brazil,1750000,492,3550,111


### str Method

Retrieve cities that contain 'City' in their name:

In [None]:
# Boolean mask: True where the city name contains 'City'.
name_has_city = df['City'].str.contains('City')
df[name_has_city]

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5
43,Ho Chi Minh City,10.8231,106.6297,Vietnam,4900000,518,9450,44


### .loc Method

In [None]:
# .loc takes the boolean mask as its row selector.
over_10_million = df['Population'] > 10000000
df.loc[over_10_million]

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
3,Seoul/Incheon,37.5665,126.978,South Korea,17500000,1049,16700,4
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5
5,Osaka/Kobe/Kyoto,34.6937,135.5022,Japan,16425000,2564,6400,6
6,Manila,14.5995,120.9842,Philippines,14750000,1399,10550,7
7,Mumbai,19.076,72.8777,India,14350000,484,29650,8
8,Delhi,28.7041,77.1025,India,14300000,1295,11050,9
9,Jakarta,-6.1745,106.8227,Indonesia,14250000,1360,10500,10


## Add Column and Row

### Add a New Column

In [None]:
# Multiply land area by density row by row and store the result as the new column 'New_Pop'.
area_times_density = df['Land_area'] * df['Density']
df['New_Pop'] = area_times_density
df.head()

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number,New_Pop
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1,33216750
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2,17800150
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3,17712000
3,Seoul/Incheon,37.5665,126.978,South Korea,17500000,1049,16700,4,17518300
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5,17404800


### Add a New Row

In [None]:
# Values for the row to add, keyed by column name.
# 'New_Pop' is not provided, so it is NaN in the new row.
data = {
    'City': 'Surabaya',
    'Latitude': -7.2575,
    'Longitude': 112.7521,
    'Country': 'Indonesia',
    'Population': 2949585,
    'Land_area': 351,
    'Density': 8406,
    'Number':126
}
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# Wrap the dict in a one-row DataFrame and concatenate it instead;
# ignore_index=True renumbers the rows so the new row gets the next label.
df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
df.tail()

  df = df.append(data,ignore_index=True)


Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number,New_Pop
121,Brisbane,-27.4698,153.0251,Australia,1508000,1603,950,122,1522850.0
122,Riverside/San Bernardino,33.9533,-117.3962,USA,1507000,1136,1350,123,1533600.0
123,Cincinnati,39.1031,-84.512,USA,1503000,1740,850,124,1479000.0
124,Accra,5.6037,-0.187,Ghana,1500000,453,3300,125,1494900.0
125,Surabaya,-7.2575,112.7521,Indonesia,2949585,351,8406,126,


## Remove Columns and Rows

### Remove Columns

In [None]:
# drop() returns a new DataFrame without the column; `df` itself is left unchanged here.
df.drop(columns='New_Pop') # equivalent spelling: df.drop('New_Pop', axis = 1)
# To keep the change, assign the result back to df or pass the inplace=True argument.

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
3,Seoul/Incheon,37.5665,126.9780,South Korea,17500000,1049,16700,4
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5
...,...,...,...,...,...,...,...,...
121,Brisbane,-27.4698,153.0251,Australia,1508000,1603,950,122
122,Riverside/San Bernardino,33.9533,-117.3962,USA,1507000,1136,1350,123
123,Cincinnati,39.1031,-84.5120,USA,1503000,1740,850,124
124,Accra,5.6037,-0.1870,Ghana,1500000,453,3300,125


In [None]:
# The previous cell neither passed inplace=True nor reassigned the result
# (df = df.drop(columns='New_Pop')), so 'New_Pop' is still present in df.

df.head()

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number,New_Pop
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1,33216750.0
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2,17800150.0
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3,17712000.0
3,Seoul/Incheon,37.5665,126.978,South Korea,17500000,1049,16700,4,17518300.0
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5,17404800.0


In [None]:
# The `del` statement, by contrast, removes the column from df directly;
# no reassignment and no inplace argument are needed.
del df['New_Pop']

In [None]:
# Confirm that 'New_Pop' no longer appears among the columns.
df.head()

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
3,Seoul/Incheon,37.5665,126.978,South Korea,17500000,1049,16700,4
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5


### Remove Rows

#### Boolean Indexing

In [None]:
# Keep every row whose City is anything other than 'Surabaya'.
df[df['City'] != 'Surabaya']

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
3,Seoul/Incheon,37.5665,126.9780,South Korea,17500000,1049,16700,4
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5
...,...,...,...,...,...,...,...,...
120,Copenhagen,55.6761,12.5683,Denmark,1525000,816,1850,121
121,Brisbane,-27.4698,153.0251,Australia,1508000,1603,950,122
122,Riverside/San Bernardino,33.9533,-117.3962,USA,1507000,1136,1350,123
123,Cincinnati,39.1031,-84.5120,USA,1503000,1740,850,124


#### Using drop()

In [None]:
# 125 is the index label of the 'Surabaya' row added earlier; the result is displayed, not assigned back to df.
df.drop(index=125)

Unnamed: 0,City,Latitude,Longitude,Country,Population,Land_area,Density,Number
0,Tokyo/Yokohama,35.6895,139.6917,Japan,33200000,6993,4750,1
1,New York Metro,40.7128,-74.0059,USA,17800000,8683,2050,2
2,Sao Paulo,-23.5505,-46.6333,Brazil,17700000,1968,9000,3
3,Seoul/Incheon,37.5665,126.9780,South Korea,17500000,1049,16700,4
4,Mexico City,23.6345,-102.5528,Mexico,17400000,2072,8400,5
...,...,...,...,...,...,...,...,...
120,Copenhagen,55.6761,12.5683,Denmark,1525000,816,1850,121
121,Brisbane,-27.4698,153.0251,Australia,1508000,1603,950,122
122,Riverside/San Bernardino,33.9533,-117.3962,USA,1507000,1136,1350,123
123,Cincinnati,39.1031,-84.5120,USA,1503000,1740,850,124


# Grouping and Aggregation

## Groupby Based on a Single Column

In [None]:
# One aggregation function

# numeric_only=True restricts the mean to the numeric columns. Without it the
# text column 'City' causes a FutureWarning on pandas 1.5 and a TypeError on
# pandas >= 2.0.
df.groupby('Country').mean(numeric_only=True)

  df.groupby('Country').mean()


Unnamed: 0_level_0,Latitude,Longitude,Population,Land_area,Density,Number
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Argentina,-34.6037,-58.3816,11200000.0,2266.0,4950.0,15.0
Australia,-33.050733,149.7325,2724000.0,1790.0,1516.666667,84.666667
Austria,48.2082,16.3738,1550000.0,453.0,3400.0,118.0
Azerbaijan,40.4093,49.8671,2100000.0,544.0,3850.0,94.0
Belgium,50.8503,4.3517,1570000.0,712.0,2200.0,117.0
Brazil,-19.147167,-44.730333,5205556.0,853.444444,5338.888889,66.888889
Canada,46.145867,-92.023733,3137667.0,1505.0,2050.0,73.666667
Chile,-33.4489,-70.6693,5425000.0,648.0,8400.0,36.0
China,35.580267,119.031033,6385667.0,542.5,11483.333333,39.666667
Colombia,4.711,-74.0721,7000000.0,518.0,13500.0,29.0


In [None]:
# Several aggregation functions at once.
# Only the numeric columns are aggregated: 'mean' is undefined for the text
# column 'City', which pandas 1.5 dropped with a FutureWarning and
# pandas >= 2.0 rejects with a TypeError.
numeric_columns = df.select_dtypes(include='number').columns.tolist()
df.groupby('Country')[numeric_columns].agg(['count','mean']).head()

  df.groupby('Country').agg(['count','mean']).head()


Unnamed: 0_level_0,Latitude,Latitude,Longitude,Longitude,Population,Population,Land_area,Land_area,Density,Density,Number,Number
Unnamed: 0_level_1,count,mean,count,mean,count,mean,count,mean,count,mean,count,mean
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Argentina,1,-34.6037,1,-58.3816,1,11200000.0,1,2266.0,1,4950.0,1,15.0
Australia,3,-33.050733,3,149.7325,3,2724000.0,3,1790.0,3,1516.666667,3,84.666667
Austria,1,48.2082,1,16.3738,1,1550000.0,1,453.0,1,3400.0,1,118.0
Azerbaijan,1,40.4093,1,49.8671,1,2100000.0,1,544.0,1,3850.0,1,94.0
Belgium,1,50.8503,1,4.3517,1,1570000.0,1,712.0,1,2200.0,1,117.0


In [None]:
# Map each column to the aggregation applied to it within every country group.
aggregations = {'City':'count','Population':'mean','Density':'max'}
df.groupby('Country').agg(aggregations).head()

Unnamed: 0_level_0,City,Population,Density
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Argentina,1,11200000.0,4950
Australia,3,2724000.0,2100
Austria,1,1550000.0,3400
Azerbaijan,1,2100000.0,3850
Belgium,1,1570000.0,2200


## Groupby Based on Multiple Columns

In [None]:
# NOTE(review): `teams` is not assigned in any earlier cell visible here — confirm it is loaded
# before this cell, otherwise a fresh "Restart Kernel -> Run All" stops here with a NameError.
teams

Unnamed: 0,division,conference,school_name,roster_url,id
0,FBS (Division I-A Teams),American Athletic,Cincinnati,http://espn.go.com/ncf/teams/roster?teamId=2132,1
1,FBS (Division I-A Teams),American Athletic,Connecticut,http://espn.go.com/ncf/teams/roster?teamId=41,2
2,FBS (Division I-A Teams),American Athletic,Houston,http://espn.go.com/ncf/teams/roster?teamId=248,3
3,FBS (Division I-A Teams),American Athletic,Louisville,http://espn.go.com/ncf/teams/roster?teamId=97,4
4,FBS (Division I-A Teams),American Athletic,Memphis,http://espn.go.com/ncf/teams/roster?teamId=235,5
...,...,...,...,...,...
247,FCS (Division I-AA Teams),SWAC,Jackson State,http://espn.go.com/ncf/teams/roster?teamId=2296,248
248,FCS (Division I-AA Teams),SWAC,Mississippi Valley State,http://espn.go.com/ncf/teams/roster?teamId=2400,249
249,FCS (Division I-AA Teams),SWAC,Prairie View A&M,http://espn.go.com/ncf/teams/roster?teamId=2504,250
250,FCS (Division I-AA Teams),SWAC,Southern University,http://espn.go.com/ncf/teams/roster?teamId=2582,251


In [None]:
# Count the school names within each (division, conference) pair.
teams.groupby(['division','conference'])['school_name'].count()

division                   conference       
FBS (Division I-A Teams)   ACC                  14
                           American Athletic    10
                           Big 12               10
                           Big Ten              12
                           Conference USA       14
                           FBS Independents      7
                           Mid-American         13
                           Mountain West        12
                           Pac-12               12
                           SEC                  14
                           Sun Belt              8
FCS (Division I-AA Teams)  Big Sky              13
                           Big South             6
                           CAA                  11
                           FCS Independents      5
                           Ivy                   8
                           MEAC                 11
                           Missouri Valley      10
                           Northeast 