### Pandas Series
A Series is a one-dimensional labelled array capable of holding any data type. The axis labels are collectively called the index. (A NumPy 1-D array is displayed horizontally, whereas a pandas Series is displayed vertically, one labelled value per row.)

### Creating Series
There are multiple ways to create a pandas Series

In [2]:
# pip install pandas

In [3]:
import numpy as np
import pandas as pd

In [4]:
labels = ['a','b','c']#indices
my_list = [10,20,30]
my_array = np.array([10,20,30])
d = {1:10,2:20,3:30}

In [5]:
pd.Series(my_list)
#0    10
#1    20
#2    30

# labels and data points

0    10
1    20
2    30
dtype: int64

In [6]:
pd.Series(my_list , index = labels)
#a    10
#b    20
#c    30

a    10
b    20
c    30
dtype: int64

In [7]:
pd.Series(my_array , index = labels)
# a Series is always 1-D, so the array passed in must be 1-dimensional (a 2-D array is not accepted)

a    10
b    20
c    30
dtype: int64

In [8]:
pd.Series(d)
# for a dictionary, the keys become the index and the values become the data
#1    10
#2    20
#3    30

1    10
2    20
3    30
dtype: int64

### Dataframes
A DataFrame is a two-dimensional labelled table: multiple Series combined as columns that share a common index.

In [9]:
import numpy as np
import pandas as pd

#### Creating Dataframe

In [10]:
data = {
    'Name' : ['John','Anna','Peter','Linda'],
    'Age'  : [28,34,29,42],
    'City' : ['New York','Paris','Berlin','London'],
    'Salary' : [65000 , 70000 , 62000 , 85000]
    
}
df = pd.DataFrame(data)

In [11]:
df

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [12]:
data_list = [
    ['John' , 28 , 'New York' , 65000],
    ['Anna',	34	,'Paris',	70000],
    ['Peter' ,29	,'Berlin',	62000],
    ['Linda',	42,	'London',	85000]
    
]
df2 = pd.DataFrame(data_list)
columns = ['Name' , 'Age' , 'City' , 'Salary']

In [13]:
df2
# column names are not defined taken as 0,1,2,3...

Unnamed: 0,0,1,2,3
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [14]:
df3 = pd.DataFrame(data_list , columns = columns)

In [15]:
df3

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [16]:
df3['Name']
#0     John
#1     Anna
#2    Peter
#3    Linda
#Name: Name, dtype: object

0     John
1     Anna
2    Peter
3    Linda
Name: Name, dtype: object

In [17]:
df3[['Name' ,'City' ]]
# to select multiple columns, pass a list of column names (hence the double brackets [[ ]])

Unnamed: 0,Name,City
0,John,New York
1,Anna,Paris
2,Peter,Berlin
3,Linda,London


#### Creating a new column

In [18]:
df3['Designation'] = ["Doctor" , "Engineer" , "Doctor" , "Engineer"]

In [19]:
df3

Unnamed: 0,Name,Age,City,Salary,Designation
0,John,28,New York,65000,Doctor
1,Anna,34,Paris,70000,Engineer
2,Peter,29,Berlin,62000,Doctor
3,Linda,42,London,85000,Engineer


#### Remove the column

In [20]:
df3.drop('Designation',axis = 1)
# we need to give axis as by default it is 0 i.e row

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [21]:
# without inplace=True, drop() only returns a modified copy; the original df3 is unchanged
# therefore use inplace=True (or reassign the result) to drop the column permanently
df3

Unnamed: 0,Name,Age,City,Salary,Designation
0,John,28,New York,65000,Doctor
1,Anna,34,Paris,70000,Engineer
2,Peter,29,Berlin,62000,Doctor
3,Linda,42,London,85000,Engineer


In [22]:
df3.drop("Designation",axis = 1, inplace = True)

In [23]:
df3

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [24]:
df3.drop(0,axis = 0)
# remove the row whose index label is 0

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [25]:
df3.drop(0,axis = 0 , inplace = True)

In [26]:
df3

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


#### Selected Rows

In [27]:
data_list = [
    ['John' , 28 , 'New York' , 65000],
    ['Anna',	34	,'Paris',	70000],
    ['Peter' ,29	,'Berlin',	62000],
    ['Linda',	42,	'London',	85000]
    
]
df2 = pd.DataFrame(data_list)
columns = ['Name' , 'Age' , 'City' , 'Salary']
df3 = pd.DataFrame(data_list , columns = columns)

In [28]:
df3.loc[[0,1]]
# rows with index labels 0 and 1

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000


In [29]:
df3.iloc[3]
# iloc selects by integer position (here the row at position 3); use it when the index labels are something other than 0,1,2,3...

Name       Linda
Age           42
City      London
Salary     85000
Name: 3, dtype: object

#### Selecting Subsets of Rows and columns

In [30]:
# Select rows labelled 0 and 1 and the City/Salary columns in a single .loc call
# (one lookup instead of chaining two indexing operations).
df3.loc[[0, 1], ['City', 'Salary']]

Unnamed: 0,City,Salary
0,New York,65000
1,Paris,70000


#### Conditional Selection

In [31]:
# I only want to see people with age above 30
print(df3['Age']>30)
df3[df3['Age']>30]

0    False
1     True
2    False
3     True
Name: Age, dtype: bool


Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000
3,Linda,42,London,85000


In [32]:
# people age greater than 30 and city paris
print(df3['Age']>30)
print(df3['City']=='Paris')

0    False
1     True
2    False
3     True
Name: Age, dtype: bool
0    False
1     True
2    False
3    False
Name: City, dtype: bool


In [33]:
df3[(df3['Age']>30) & (df3['City']=='Paris')]

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000


#### Missing Data

In [34]:
import numpy as np
import pandas as pd

In [35]:
data = {
    'A':[1,2,np.nan,4,5],
    'B':[np.nan,2,3,4,5],
    'C':[1,2,3,np.nan,np.nan],
    'D':[1,np.nan,np.nan,np.nan,5]
}
#nan is null values

In [36]:
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,1.0,,1.0,1.0
1,2.0,2.0,2.0,
2,,3.0,3.0,
3,4.0,4.0,,
4,5.0,5.0,,5.0


In [37]:
df.isna()
# true and false in entire data frame
#A	B	C	D
#0	False	True	False	False
#1	False	False	False	True
#2	True	False	False	True
#3	False	False	True	True
#4	False	False	True	False


Unnamed: 0,A,B,C,D
0,False,True,False,False
1,False,False,False,True
2,True,False,False,True
3,False,False,True,True
4,False,False,True,False


In [38]:
df.isna().sum()
#A    1
#B    1
#C    2
#D    3
#dtype: int64
# A has 1 missing value, B has 1, C has 2 and D has 3

A    1
B    1
C    2
D    3
dtype: int64

In [39]:
df.isna().any()
#A    True
#B    True
#C    True
#D    True
#dtype: bool
# tells that each column has atleast one na value

A    True
B    True
C    True
D    True
dtype: bool

#### Remove null values

In [40]:
df

Unnamed: 0,A,B,C,D
0,1.0,,1.0,1.0
1,2.0,2.0,2.0,
2,,3.0,3.0,
3,4.0,4.0,,
4,5.0,5.0,,5.0


In [41]:
df.dropna()
# dropna() drops every row that has at least one NaN value;
# here each row contains some NaN, so the result is an
# empty DataFrame (df itself is not modified)

Unnamed: 0,A,B,C,D


In [42]:
df

Unnamed: 0,A,B,C,D
0,1.0,,1.0,1.0
1,2.0,2.0,2.0,
2,,3.0,3.0,
3,4.0,4.0,,
4,5.0,5.0,,5.0


In [43]:
df.dropna(thresh = 3)
# if atleast 3 non null in a row values then keep else drop

Unnamed: 0,A,B,C,D
0,1.0,,1.0,1.0
1,2.0,2.0,2.0,
4,5.0,5.0,,5.0


In [44]:
df.dropna(thresh = 2)

Unnamed: 0,A,B,C,D
0,1.0,,1.0,1.0
1,2.0,2.0,2.0,
2,,3.0,3.0,
3,4.0,4.0,,
4,5.0,5.0,,5.0


#### Filling the missing data

In [45]:
df.fillna(0)

Unnamed: 0,A,B,C,D
0,1.0,0.0,1.0,1.0
1,2.0,2.0,2.0,0.0
2,0.0,3.0,3.0,0.0
3,4.0,4.0,0.0,0.0
4,5.0,5.0,0.0,5.0


In [46]:
values = {'A':0 , 'B':100, 'C':300 ,'D':400}

In [47]:
values

{'A': 0, 'B': 100, 'C': 300, 'D': 400}

In [48]:
df.fillna(value  = values)
# missing values in A column filed with 0 , with B is 100

Unnamed: 0,A,B,C,D
0,1.0,100.0,1.0,1.0
1,2.0,2.0,2.0,400.0
2,0.0,3.0,3.0,400.0
3,4.0,4.0,300.0,400.0
4,5.0,5.0,300.0,5.0


In [49]:
df.fillna(df.mean())
# df.mean() gives one mean per column; each NaN is filled with the mean of its own column

Unnamed: 0,A,B,C,D
0,1.0,3.5,1.0,1.0
1,2.0,2.0,2.0,3.0
2,3.0,3.0,3.0,3.0
3,4.0,4.0,2.0,3.0
4,5.0,5.0,2.0,5.0


#### Merging, joining and concatenation

In [50]:
import numpy as np
import pandas as pd

#### Merge two data frames

In [51]:
employees = pd.DataFrame(
    {
        'employee_id' : [1,2,3,4,5],
        'name':['John','Anna','Peter','Linda','Bob'],
        'department' : ['HR','IT','Finance','IT','HR']
    }
)

salaries = pd.DataFrame(
    {
        'employee_id' :[1,2,3,6,7],
        'salary':[60000,80000,65000,70000,90000],
        'bonus':[5000,10000,7000,8000,12000]
    }
)

In [52]:
employees

Unnamed: 0,employee_id,name,department
0,1,John,HR
1,2,Anna,IT
2,3,Peter,Finance
3,4,Linda,IT
4,5,Bob,HR


In [53]:
salaries

Unnamed: 0,employee_id,salary,bonus
0,1,60000,5000
1,2,80000,10000
2,3,65000,7000
3,6,70000,8000
4,7,90000,12000


In [54]:
pd.merge(employees,salaries , on ='employee_id',how = 'inner')
# inner merge keeps only the rows whose key appears in both frames;
# here the common employee_id values are 1, 2 and 3

Unnamed: 0,employee_id,name,department,salary,bonus
0,1,John,HR,60000,5000
1,2,Anna,IT,80000,10000
2,3,Peter,Finance,65000,7000


In [55]:
pd.merge(employees , salaries, on ='employee_id' , how='outer')
# outer is like full outer join in sql missing values is nan

Unnamed: 0,employee_id,name,department,salary,bonus
0,1,John,HR,60000.0,5000.0
1,2,Anna,IT,80000.0,10000.0
2,3,Peter,Finance,65000.0,7000.0
3,4,Linda,IT,,
4,5,Bob,HR,,
5,6,,,70000.0,8000.0
6,7,,,90000.0,12000.0


In [56]:
pd.merge(employees,salaries,on='employee_id',how = 'left')
# left outer join

Unnamed: 0,employee_id,name,department,salary,bonus
0,1,John,HR,60000.0,5000.0
1,2,Anna,IT,80000.0,10000.0
2,3,Peter,Finance,65000.0,7000.0
3,4,Linda,IT,,
4,5,Bob,HR,,


In [57]:
pd.merge(employees,salaries , on = 'employee_id',how='right')
# right outer join

Unnamed: 0,employee_id,name,department,salary,bonus
0,1,John,HR,60000,5000
1,2,Anna,IT,80000,10000
2,3,Peter,Finance,65000,7000
3,6,,,70000,8000
4,7,,,90000,12000


#### Concatenation of 2 dataframes

In [58]:

df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
    'C': ['C0', 'C1', 'C2']
})

df2 = pd.DataFrame({
    'A': ['A3', 'A4', 'A5'],
    'B': ['B3', 'B4', 'B5'],
    'C': ['C3', 'C4', 'C5']
})

In [59]:
df1

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [60]:
df2

Unnamed: 0,A,B,C
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5


In [61]:
pd.concat([df1,df2])
# df1 on top and df2 below

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5


In [62]:
pd.concat([df2,df1])
# default axis=0: the frames are stacked vertically (rows appended) and the original index labels are kept

Unnamed: 0,A,B,C
0,A3,B3,C3
1,A4,B4,C4
2,A5,B5,C5
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [63]:
pd.concat([df1,df2],axis=1)
# axis=1: the frames are placed side by side (columns appended), aligned on the row index

Unnamed: 0,A,B,C,A.1,B.1,C.1
0,A0,B0,C0,A3,B3,C3
1,A1,B1,C1,A4,B4,C4
2,A2,B2,C2,A5,B5,C5


**Joining two data frames**

In [64]:
df1 = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie']
}, index=[1, 2, 3])

# Second DataFrame
df2 = pd.DataFrame({
    'score': [85, 90, 75]
}, index=[2, 3, 4])

In [65]:
df1

Unnamed: 0,name
1,Alice
2,Bob
3,Charlie


In [66]:
df2

Unnamed: 0,score
2,85
3,90
4,75


In [67]:
df1.join(df2)
# join() defaults to how='left': every row of df1 is kept and a missing score becomes NaN

Unnamed: 0,name,score
1,Alice,
2,Bob,85.0
3,Charlie,90.0


In [68]:
df1.join(df2,how='inner')

Unnamed: 0,name,score
2,Bob,85
3,Charlie,90


In [69]:
df1.join(df2,how='outer')

Unnamed: 0,name,score
1,Alice,
2,Bob,85.0
3,Charlie,90.0
4,,75.0


In [70]:
df2.join(df1)

Unnamed: 0,score,name
2,85,Bob
3,90,Charlie
4,75,


**Group by and aggregation**

**Group By**

In [71]:
data = {
    'Category': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'Store': ['S1', 'S1', 'S2', 'S2', 'S1', 'S2', 'S2', 'S1'],
    'Sales': [100, 200, 150, 250, 120, 180, 200, 300],
    'Quantity': [10, 15, 12, 18, 8, 20, 15, 25],
    'Date': pd.date_range('2023-01-01', periods=8)
}
df = pd.DataFrame(data)

In [72]:
df

Unnamed: 0,Category,Store,Sales,Quantity,Date
0,A,S1,100,10,2023-01-01
1,B,S1,200,15,2023-01-02
2,A,S2,150,12,2023-01-03
3,B,S2,250,18,2023-01-04
4,A,S1,120,8,2023-01-05
5,B,S2,180,20,2023-01-06
6,A,S2,200,15,2023-01-07
7,B,S1,300,25,2023-01-08


In [73]:
# Group the rows by category, then print each group's key followed by
# the sub-DataFrame holding that category's rows.
cat = df.groupby('Category')
for category, group in cat:
    print(category)
    print(group)

A
  Category Store  Sales  Quantity       Date
0        A    S1    100        10 2023-01-01
2        A    S2    150        12 2023-01-03
4        A    S1    120         8 2023-01-05
6        A    S2    200        15 2023-01-07
B
  Category Store  Sales  Quantity       Date
1        B    S1    200        15 2023-01-02
3        B    S2    250        18 2023-01-04
5        B    S2    180        20 2023-01-06
7        B    S1    300        25 2023-01-08


In [74]:
cat_1 = df.groupby('Category')['Sales'].sum()
cat_1

Category
A    570
B    930
Name: Sales, dtype: int64

In [75]:
cat_2 = df.groupby('Store')['Sales'].sum()
cat_2

Store
S1    720
S2    780
Name: Sales, dtype: int64

In [76]:
cat_3  = df.groupby(['Category','Store'])['Sales'].sum()
cat_3

Category  Store
A         S1       220
          S2       350
B         S1       500
          S2       430
Name: Sales, dtype: int64

**Aggregation**

In [77]:
df['Sales'].mean()

np.float64(187.5)

In [78]:
df['Sales'].max()

300

In [79]:
df['Sales'].min()

100

In [80]:
df['Sales'].median()

190.0

In [81]:
df['Sales'].mode()

0    200
Name: Sales, dtype: int64

In [82]:
df['Sales'].count()

np.int64(8)

In [83]:
df['Sales'].sum()

np.int64(1500)

In [84]:
df['Sales'].std()

66.06274074155351

In [85]:
df['Sales'].var()

4364.285714285715

In [86]:
df['Sales'].agg(['mean','median','min','max'])
# several aggregations in one call; 'mode' is left out because Series.mode() returns a Series, not a single value

mean      187.5
median    190.0
min       100.0
max       300.0
Name: Sales, dtype: float64

**Pivot Tables and Cross Tabs**

In [87]:
# Synthetic sales data: 20 consecutive days cycling through four products,
# where each product is always sold in the same region by the same rep.
# A fixed seed makes the random Sales/Units values identical on every re-run.
rng = np.random.default_rng(42)

data = {
    'Date': pd.date_range('2023-01-01', periods=20),
    'Product': ['A', 'B', 'C', 'D'] * 5,
    'Region': ['East', 'West', 'North', 'South'] * 5,
    'Sales': rng.integers(100, 1000, 20),  # 100 <= Sales < 1000
    'Units': rng.integers(10, 100, 20),    # 10 <= Units < 100
    'Rep': ['John', 'Mary', 'Bob', 'Alice'] * 5,
}

df = pd.DataFrame(data)

# Derived calendar columns: full month name and a quarter label such as 'Q1'.
df['Month'] = df['Date'].dt.month_name()
df['Quarter'] = 'Q' + df['Date'].dt.quarter.astype(str)
df


Unnamed: 0,Date,Product,Region,Sales,Units,Rep,Month,Quarter
0,2023-01-01,A,East,635,38,John,January,Q1
1,2023-01-02,B,West,549,84,Mary,January,Q1
2,2023-01-03,C,North,212,12,Bob,January,Q1
3,2023-01-04,D,South,645,35,Alice,January,Q1
4,2023-01-05,A,East,473,71,John,January,Q1
5,2023-01-06,B,West,743,61,Mary,January,Q1
6,2023-01-07,C,North,801,66,Bob,January,Q1
7,2023-01-08,D,South,665,38,Alice,January,Q1
8,2023-01-09,A,East,538,79,John,January,Q1
9,2023-01-10,B,West,695,49,Mary,January,Q1


In [91]:
pivot = pd.pivot_table(df , values = 'Sales', index = 'Region',columns = 'Product',aggfunc = 'median')
pivot

Product,A,B,C,D
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
East,590.0,,,
North,,,801.0,
South,,,,665.0
West,,549.0,,


In [92]:
pivot2 = pd.pivot_table(df , values = ['Sales','Units'],index = 'Region',columns = 'Product')

In [93]:
pivot2

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Units,Units,Units,Units
Product,A,B,C,D,A,B,C,D
Region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
East,623.2,,,,45.8,,,
North,,,645.6,,,,46.2,
South,,,,585.2,,,,33.8
West,,454.8,,,,68.0,,


In [97]:
pivot_3 = pd.pivot_table(df , values=['Sales','Units'],index = 'Quarter',columns = 'Product')

In [98]:
pivot_3

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Units,Units,Units,Units
Product,A,B,C,D,A,B,C,D
Quarter,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Q1,623.2,454.8,645.6,585.2,45.8,68.0,46.2,33.8


**Cross Tabs**

In [100]:
pd.crosstab(df['Region'],df['Product'])
# by default count 
# Region is the row index and Product is the column

Product,A,B,C,D
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
East,5,0,0,0
North,0,0,5,0
South,0,0,0,5
West,0,5,0,0


**DataFrame Basic Operations**

In [102]:

df1 = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': [100, 200, 300, 400, 500]
})

In [103]:
df1

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [105]:
df1.shape # (row,col)

(5, 3)

In [112]:
df1.columns # tells name of columns

Index(['A', 'B', 'C'], dtype='object')

In [113]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   A       5 non-null      int64
 1   B       5 non-null      int64
 2   C       5 non-null      int64
dtypes: int64(3)
memory usage: 252.0 bytes


In [114]:
df1.describe()

Unnamed: 0,A,B,C
count,5.0,5.0,5.0
mean,3.0,30.0,300.0
std,1.581139,15.811388,158.113883
min,1.0,10.0,100.0
25%,2.0,20.0,200.0
50%,3.0,30.0,300.0
75%,4.0,40.0,400.0
max,5.0,50.0,500.0


In [118]:
df1['A'] +=10 #broadcasting
df1
# not idempotent: each execution of this cell adds another 10 to column A

Unnamed: 0,A,B,C
0,41,10,100
1,42,20,200
2,43,30,300
3,44,40,400
4,45,50,500


**DataFrame Applying Functions**

In [119]:
df1

Unnamed: 0,A,B,C
0,41,10,100
1,42,20,200
2,43,30,300
3,44,40,400
4,45,50,500


In [120]:
def square(x):
    """Return ``x`` raised to the power of two."""
    return pow(x, 2)

In [126]:
df1['D'] = df1['B'].apply(square)

In [127]:
df1

Unnamed: 0,A,B,C,D
0,41,100,100,10000
1,42,400,200,160000
2,43,900,300,810000
3,44,1600,400,2560000
4,45,2500,500,6250000


In [128]:
import math
df1['E'] = df1['D'].apply(lambda x:math.sqrt(x))

In [129]:
df1

Unnamed: 0,A,B,C,D,E
0,41,100,100,10000,100.0
1,42,400,200,160000,400.0
2,43,900,300,810000,900.0
3,44,1600,400,2560000,1600.0
4,45,2500,500,6250000,2500.0


**Loading Data as csv**

In [133]:
df = pd.read_csv(r'anime.csv')

In [134]:
df.head()

Unnamed: 0,Rank,Title,Score
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05


In [135]:
#make a new column for episode count
#make a new column for time stamp
#which anime has the highest score
#give me top 5 highest scoring anime
#which anime has the highest episode count
#animes with top 5 episode count
#which is the longest running anime

In [140]:
#make a new column for episode count
df.info()
df.loc[1]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rank    50 non-null     int64  
 1   Title   50 non-null     object 
 2   Score   50 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.3+ KB


Rank                                                     2
Title    Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...
Score                                                 9.07
Name: 1, dtype: object

In [141]:
df.loc[2]['Title']

'Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 - Dec 2022474,138 members'

In [150]:
# the episode count is written inside parentheses in the title, e.g. "(13 eps)"
def extract_episodes(txt):
    """Return the text between the first '(' and the first ')' of ``txt``.

    For a title such as ``'...TV (13 eps)Oct 2022 - ...'`` this yields
    ``'13 eps'``.

    Returns
    -------
    str or None
        * None when ``txt`` contains no ')'.
        * '' when the first ')' comes before any '('.
        * otherwise everything after the first '(' up to (not including)
          the first ')'.
    """
    closing_at = txt.find(')')
    if closing_at == -1:
        # No closing parenthesis anywhere: nothing to report.
        return None

    opening_at = txt.find('(')
    if opening_at == -1 or opening_at > closing_at:
        # The ')' was reached before any '(' had been opened.
        return ""

    return txt[opening_at + 1:closing_at]
        


        

In [152]:
df["episodes"] = df['Title'].apply(extract_episodes)

In [153]:
df

Unnamed: 0,Rank,Title,Score,episodes
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1,64 eps
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07,24 eps
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06,13 eps
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06,51 eps
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05,10 eps
5,6,"Gintama'TV (51 eps)Apr 2011 - Mar 2012534,105 ...",9.04,51 eps
6,7,Gintama: The FinalMovie (1 eps)Jan 2021 - Jan ...,9.04,1 eps
7,8,Hunter x Hunter TV (148 eps)Oct 2011 - Sep 201...,9.04,148 eps
8,9,Kaguya-sama wa Kokurasetai: Ultra RomanticTV (...,9.04,13 eps
9,10,Gintama': EnchousenTV (13 eps)Oct 2012 - Mar 2...,9.03,13 eps


In [155]:
df['episodes'] = df['episodes'].str.replace(" eps","")

In [156]:
df

Unnamed: 0,Rank,Title,Score,episodes
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1,64
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07,24
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06,13
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06,51
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05,10
5,6,"Gintama'TV (51 eps)Apr 2011 - Mar 2012534,105 ...",9.04,51
6,7,Gintama: The FinalMovie (1 eps)Jan 2021 - Jan ...,9.04,1
7,8,Hunter x Hunter TV (148 eps)Oct 2011 - Sep 201...,9.04,148
8,9,Kaguya-sama wa Kokurasetai: Ultra RomanticTV (...,9.04,13
9,10,Gintama': EnchousenTV (13 eps)Oct 2012 - Mar 2...,9.03,13


In [157]:
df['episodes'] = df['episodes'].astype(int)

In [158]:
df

Unnamed: 0,Rank,Title,Score,episodes
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1,64
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07,24
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06,13
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06,51
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05,10
5,6,"Gintama'TV (51 eps)Apr 2011 - Mar 2012534,105 ...",9.04,51
6,7,Gintama: The FinalMovie (1 eps)Jan 2021 - Jan ...,9.04,1
7,8,Hunter x Hunter TV (148 eps)Oct 2011 - Sep 201...,9.04,148
8,9,Kaguya-sama wa Kokurasetai: Ultra RomanticTV (...,9.04,13
9,10,Gintama': EnchousenTV (13 eps)Oct 2012 - Mar 2...,9.03,13


In [160]:
#make a new column for time stamp
df['Title']
df.loc[0]['Title']

'Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr 2009 - Jul 20103,218,472 membersManga StoreVolume 1€4.58Preview'

In [169]:
def extract_time(txt, length=19):
    """Return the text that immediately follows the first ')' in ``txt``.

    The titles handled here look like
    ``'<name>TV (64 eps)Apr 2009 - Jul 2010<member count...>'``, so the
    characters right after the first closing parenthesis form a date range
    such as ``'Apr 2009 - Jul 2010'``, which is 19 characters long.

    Parameters
    ----------
    txt : str
        Raw title text.
    length : int, default 19
        Number of characters to take after the first ')'.

    Returns
    -------
    str or None
        Up to ``length`` characters following the first ')' (fewer if the
        text ends early), or None when ``txt`` contains no ')'.
    """
    closing_at = txt.find(')')
    if closing_at == -1:
        return None
    # Slicing truncates at the end of the string instead of raising
    # IndexError the way character-by-character indexing would.
    start = closing_at + 1
    return txt[start:start + length]
            

In [170]:
df['Timestamp'] = df['Title'].apply(extract_time)

In [171]:
df

Unnamed: 0,Rank,Title,Score,episodes,Timestamp
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1,64,Apr 2009 - Jul 2010
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07,24,Apr 2011 - Sep 2011
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06,13,Oct 2022 - Dec 2022
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06,51,Apr 2015 - Mar 2016
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05,10,Apr 2019 - Jul 2019
5,6,"Gintama'TV (51 eps)Apr 2011 - Mar 2012534,105 ...",9.04,51,Apr 2011 - Mar 2012
6,7,Gintama: The FinalMovie (1 eps)Jan 2021 - Jan ...,9.04,1,Jan 2021 - Jan 2021
7,8,Hunter x Hunter TV (148 eps)Oct 2011 - Sep 201...,9.04,148,Oct 2011 - Sep 2014
8,9,Kaguya-sama wa Kokurasetai: Ultra RomanticTV (...,9.04,13,Apr 2022 - Jun 2022
9,10,Gintama': EnchousenTV (13 eps)Oct 2012 - Mar 2...,9.03,13,Oct 2012 - Mar 2013


In [172]:
from dateutil.relativedelta import relativedelta
from datetime import datetime

In [175]:

from dateutil.relativedelta import relativedelta
from datetime import datetime

def calculate_total_months(period):
    """Return the inclusive number of calendar months spanned by ``period``.

    Parameters
    ----------
    period : str
        A range formatted as ``'<Mon> <YYYY> - <Mon> <YYYY>'``,
        e.g. ``'Apr 2009 - Jul 2010'``.

    Returns
    -------
    int or None
        Months from the start month through the end month, counting both
        (``'Jan 2021 - Jan 2021'`` gives 1). None when ``period`` is not a
        string in the expected format.
    """
    try:
        start_str, end_str = period.split(' - ')
        start_date = datetime.strptime(start_str, '%b %Y')
        end_date = datetime.strptime(end_str, '%b %Y')
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: period is not a string (e.g. None or NaN).
        # ValueError: wrong number of parts or an unparseable month/year.
        return None

    # Both dates fall on the first of their month, so the gap is a whole
    # number of months and can be computed directly from year and month.
    month_gap = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
    return month_gap + 1  # +1 to include the starting month

df['Months'] = df['Timestamp'].apply(calculate_total_months)

In [176]:
df

Unnamed: 0,Rank,Title,Score,episodes,Timestamp,Months
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1,64,Apr 2009 - Jul 2010,16
1,2,"Steins;GateTV (24 eps)Apr 2011 - Sep 20112,473...",9.07,24,Apr 2011 - Sep 2011,6
2,3,Bleach: Sennen Kessen-henTV (13 eps)Oct 2022 -...,9.06,13,Oct 2022 - Dec 2022,3
3,4,"Gintama°TV (51 eps)Apr 2015 - Mar 2016605,113 ...",9.06,51,Apr 2015 - Mar 2016,12
4,5,Shingeki no Kyojin Season 3 Part 2TV (10 eps)A...,9.05,10,Apr 2019 - Jul 2019,4
5,6,"Gintama'TV (51 eps)Apr 2011 - Mar 2012534,105 ...",9.04,51,Apr 2011 - Mar 2012,12
6,7,Gintama: The FinalMovie (1 eps)Jan 2021 - Jan ...,9.04,1,Jan 2021 - Jan 2021,1
7,8,Hunter x Hunter TV (148 eps)Oct 2011 - Sep 201...,9.04,148,Oct 2011 - Sep 2014,36
8,9,Kaguya-sama wa Kokurasetai: Ultra RomanticTV (...,9.04,13,Apr 2022 - Jun 2022,3
9,10,Gintama': EnchousenTV (13 eps)Oct 2012 - Mar 2...,9.03,13,Oct 2012 - Mar 2013,6


In [177]:
#which anime has the highest score
#give me top 5 highest scoring anime
#which anime has the highest episode count
#animes with top 5 episode count
#which is the longest running anime

In [180]:
df[df['Score'] == df['Score'].max()]

Unnamed: 0,Rank,Title,Score,episodes,Timestamp,Months
0,1,Fullmetal Alchemist: BrotherhoodTV (64 eps)Apr...,9.1,64,Apr 2009 - Jul 2010,16


In [185]:
df['episodes'].head()

0    64
1    24
2    13
3    51
4    10
Name: episodes, dtype: int64

In [186]:
df[df['episodes'] == df['episodes'].max()]

Unnamed: 0,Rank,Title,Score,episodes,Timestamp,Months
15,16,"GintamaTV (201 eps)Apr 2006 - Mar 20101,034,41...",8.94,201,Apr 2006 - Mar 2010,48


In [189]:
df.sort_values(by='episodes',ascending = False).head()

Unnamed: 0,Rank,Title,Score,episodes,Timestamp,Months
15,16,"GintamaTV (201 eps)Apr 2006 - Mar 20101,034,41...",8.94,201,Apr 2006 - Mar 2010,48
7,8,Hunter x Hunter TV (148 eps)Oct 2011 - Sep 201...,9.04,148,Oct 2011 - Sep 2014,36
11,12,Ginga Eiyuu DensetsuOVA (110 eps)Jan 1988 - Ma...,9.02,110,Jan 1988 - Mar 1997,111
42,43,Hajime no IppoTV (75 eps)Oct 2000 - Mar 200255...,8.76,75,Oct 2000 - Mar 2002,18
24,25,"MonsterTV (74 eps)Apr 2004 - Sep 20051,041,081...",8.87,74,Apr 2004 - Sep 2005,18


In [190]:
df[df['Months'] == df['Months'].max()]

Unnamed: 0,Rank,Title,Score,episodes,Timestamp,Months
11,12,Ginga Eiyuu DensetsuOVA (110 eps)Jan 1988 - Ma...,9.02,110,Jan 1988 - Mar 1997,111


**Project 2**

In [191]:
import pandas as pd
import numpy as np

In [192]:
df = pd.read_csv('countries.csv')

In [193]:
df.describe()

Unnamed: 0,latitude,longitude,agricultural_land,forest_area,land_area,rural_land,urban_land,central_government_debt_pct_gdp,expense_pct_gdp,gdp,...,net_migration,population_female,population_male,population,women_parliament_seats_pct,rural_population,urban_population,press,democracy_score,median_age
count,194.0,194.0,193.0,194.0,194.0,194.0,194.0,120.0,156.0,193.0,...,194.0,194.0,194.0,194.0,193.0,194.0,194.0,194.0,194.0,194.0
mean,18.975601,22.027491,245455.1,208678.4,667508.7,656371.1,9777.116531,66.759366,30.051403,514485100000.0,...,-51.407216,20283160.0,20505720.0,40788880.0,25.022994,17633220.0,23155660.0,2.53933,4.644536,25.661856
std,23.876225,66.396389,635626.8,782492.6,1837107.0,1811169.0,42301.458421,71.806247,26.74088,2307148000000.0,...,94525.968598,72589410.0,76071580.0,148647000.0,12.671044,76641870.0,79403930.0,1.800128,2.818297,9.415569
min,-41.0,-175.0,4.0,0.0,2.027,0.0349545,0.0,0.0,0.000267,60349400.0,...,-525116.0,5513.0,5799.0,11312.0,0.0,0.0,5717.0,0.0,0.0,10.5
25%,4.0,-5.0,6464.0,3331.775,23552.5,21865.62,359.61825,31.9513,18.371875,11813900000.0,...,-12242.25,1036218.0,1044902.0,2106358.0,15.3846,589664.0,1222244.0,1.525,2.7225,16.95
50%,16.583333,21.5,38727.8,25289.25,120375.0,115994.5,1645.17,55.42685,27.33775,41153900000.0,...,-970.0,4502713.0,4450049.0,9125614.0,25.2525,2512382.0,4508837.0,2.4,5.05,24.95
75%,40.0,50.1625,215000.0,123673.5,523700.0,491150.5,4054.0225,79.5393,35.0835,251945000000.0,...,2904.25,15266060.0,14788720.0,30313610.0,33.6364,11333540.0,16213550.0,2.925,6.9675,34.05
max,65.0,178.0,5285080.0,8153120.0,16376900.0,16224200.0,522345.0,687.994,310.443,25462700000000.0,...,561580.0,691528500.0,731180500.0,1417173000.0,61.25,908804800.0,897578400.0,10.0,9.87,50.5


In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 64 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   country                                  194 non-null    object 
 1   country_long                             194 non-null    object 
 2   currency                                 194 non-null    object 
 3   capital_city                             194 non-null    object 
 4   region                                   194 non-null    object 
 5   continent                                194 non-null    object 
 6   demonym                                  194 non-null    object 
 7   latitude                                 194 non-null    float64
 8   longitude                                194 non-null    float64
 9   agricultural_land                        193 non-null    float64
 10  forest_area                              194 non-n

In [197]:
df.size

12416

In [199]:
df.shape

(194, 64)

In [201]:
df.head()

Unnamed: 0,country,country_long,currency,capital_city,region,continent,demonym,latitude,longitude,agricultural_land,...,population,women_parliament_seats_pct,rural_population,urban_population,press,democracy_score,democracy_type,median_age,political_leader,title
0,Afghanistan,Islamic State of Afghanistan,Afghan afghani,Kabul,Southern Asia,Asia,Afghan,33.0,65.0,383560.0,...,41128771,27.0161,30181937,10946834,2.14,2.97,Authoritarian,12.9,Ashraf Ghani,President
1,Albania,Republic of Albania,Albanian lek,Tirana,Southern Europe,Europe,Albanian,41.0,20.0,11655.5,...,2775634,35.7143,1004807,1770827,2.62,5.98,Hybrid regime,33.7,Edi Rama,Prime Minister
2,Algeria,People's Democratic Republic of Algeria,Algerian dinar,Algiers,Northern Africa,Africa,Algerian,28.0,3.0,413588.0,...,44903225,8.10811,11328186,33575039,1.71,3.5,Authoritarian,24.0,Abdelmadjid Tebboune,President
3,Andorra,Principality of Andorra,Euro,Andorra la Vella,Southern Europe,Europe,Andorran,42.5,1.5,187.2,...,79824,46.4286,9730,70094,3.17,0.0,Unknown,38.9,Xavier Espot Zamora,Head of Government
4,Angola,People's Republic of Angola,Angolan kwanza,Luanda,Middle Africa,Africa,Angolan,-12.5,18.5,569525.0,...,35588987,33.6364,11359649,24229338,2.24,3.62,Authoritarian,12.4,João Lourenço,President


In [203]:
df.isna().sum()
#political_leader    7
#title               7

country             0
country_long        0
currency            0
capital_city        0
region              0
                   ..
democracy_score     0
democracy_type      0
median_age          0
political_leader    7
title               7
Length: 64, dtype: int64

In [206]:
df.isna().any()

country             False
country_long        False
currency            False
capital_city        False
region              False
                    ...  
democracy_score     False
democracy_type      False
median_age          False
political_leader     True
title                True
Length: 64, dtype: bool

In [224]:

df.columns


Index(['country', 'country_long', 'currency', 'capital_city', 'region',
       'continent', 'demonym', 'latitude', 'longitude', 'agricultural_land',
       'forest_area', 'land_area', 'rural_land', 'urban_land',
       'central_government_debt_pct_gdp', 'expense_pct_gdp', 'gdp',
       'inflation', 'self_employed_pct', 'tax_revenue_pct_gdp',
       'unemployment_pct', 'vulnerable_employment_pct',
       'electricity_access_pct', 'alternative_nuclear_energy_pct',
       'electricty_production_coal_pct',
       'electricty_production_hydroelectric_pct',
       'electricty_production_gas_pct', 'electricty_production_nuclear_pct',
       'electricty_production_oil_pct', 'electricty_production_renewable_pct',
       'energy_imports_pct', 'fossil_energy_consumption_pct',
       'renewable_energy_consumption_pct', 'co2_emissions',
       'methane_emissions', 'nitrous_oxide_emissions',
       'greenhouse_other_emissions', 'urban_population_under_5m',
       'health_expenditure_pct_gdp', 'healt

In [225]:

# Country (or countries, on a tie) whose population equals the maximum.
# One .loc call selects the matching rows and the 'country' column together.
df.loc[df['population'].eq(df['population'].max()), 'country']

75    India
Name: country, dtype: object

In [226]:
# Capital city of the row(s) whose population equals the maximum.
df.loc[df['population'].eq(df['population'].max()), 'capital_city']

75    New Delhi
Name: capital_city, dtype: object

In [227]:
# Country (or countries, on a tie) whose population equals the minimum.
df.loc[df['population'].eq(df['population'].min()), 'country']

179    Tuvalu
Name: country, dtype: object

In [228]:
# Capital city of the row(s) whose population equals the minimum.
df.loc[df['population'].eq(df['population'].min()), 'capital_city']

179    Funafuti
Name: capital_city, dtype: object

In [229]:
# Top 5 countries by democracy score: order rows from highest to lowest
# score, then keep the first five.
(
    df
    .sort_values('democracy_score', ascending=False)
    .head(5)
)

Unnamed: 0,country,country_long,currency,capital_city,region,continent,demonym,latitude,longitude,agricultural_land,...,population,women_parliament_seats_pct,rural_population,urban_population,press,democracy_score,democracy_type,median_age,political_leader,title
127,Norway,Kingdom of Norway,Norwegian krone,Oslo,Northern Europe,Europe,Norwegian,62.0,10.0,9859.62,...,5457127,44.9704,891476,4565651,10.0,9.87,Full democracy,35.6,Erna Solberg,Prime Minister
74,Iceland,Republic of Iceland,Iceland krona,Reykjavík,Northern Europe,Europe,Icelander,65.0,-18.0,18720.0,...,381900,47.619,22945,358955,5.32,9.58,Full democracy,32.1,Katrín Jakobsdóttir,Prime Minister
164,Sweden,Kingdom of Sweden,Swedish krona,Stockholm,Northern Europe,Europe,Swedish,62.0,15.0,30055.4,...,10486941,46.4183,1206837,9280104,9.41,9.39,Full democracy,35.6,Stefan Löfven,Prime Minister
122,New Zealand,New Zealand,New Zealand dollar,Wellington,Australia and New Zealand,Oceania,New Zealander,-41.0,174.0,101540.0,...,5124100,50.4202,672077,4452023,7.27,9.26,Full democracy,32.8,Jacinda Ardern,Prime Minister
46,Denmark,Kingdom of Denmark,Danish krone,Copenhagen,Northern Europe,Europe,Danish,56.0,10.0,26199.9,...,5903037,43.5754,686700,5216337,7.92,9.22,Full democracy,37.2,Mette Frederiksen,Prime Minister


In [232]:
# Frequency table of the 'region' column: one entry per distinct region with
# the number of rows in it, most frequent first; missing regions are excluded.
df['region'].value_counts(dropna=True)

region
Eastern Africa               17
Western Asia                 17
Western Africa               16
Southern Europe              15
Caribbean                    13
South America                12
South-Eastern Asia           11
Northern Europe              10
Middle Africa                10
Eastern Europe               10
Western Europe                9
Southern Asia                 9
Central America               8
Northern Africa               6
Micronesia                    5
Southern Africa               5
Eastern Asia                  5
Central Asia                  5
Melanesia                     4
Polynesia                     3
Australia and New Zealand     2
Northern America              2
Name: count, dtype: int64

In [236]:
# Total number of distinct regions.
# value_counts() returns a Series holding the count of each unique value, which
# gives a quick frequency overview of a categorical column.
# count() returns the number of non-NA/null entries of a Series or DataFrame;
# applied to the frequency table it yields the number of distinct regions.
(
    df['region']
    .value_counts()  # one entry per unique region
    .count()         # how many such entries exist
)

np.int64(22)

In [237]:
# Number of rows (countries) whose region is 'Eastern Europe':
# build the per-region frequency table, then look that label up explicitly.
df['region'].value_counts().loc['Eastern Europe']

np.int64(10)

In [240]:
# Names of the countries located in the 'Eastern Europe' region.
df.loc[df['region'].eq('Eastern Europe'), 'country']

14             Belarus
24            Bulgaria
43      Czech Republic
73             Hungary
111            Moldova
136             Poland
139            Romania
140             Russia
151    Slovak Republic
181            Ukraine
Name: country, dtype: object

In [241]:
# Political leader of the 2nd most populous country.
# nlargest(2) keeps the two biggest population values; position 1 of that
# result is the runner-up, which is then matched back against the full column.
df.loc[
    df['population'].eq(df['population'].nlargest(2).iloc[1]),
    'political_leader',
]

34    Xi Jinping
Name: political_leader, dtype: object

In [246]:
# How many countries have an unknown (missing) political leader?
# Select the 'country' values of rows where political_leader is NaN and
# count the non-null ones.
df.loc[df['political_leader'].isna(), 'country'].count()

np.int64(7)

In [247]:
# How many countries have "Republic" in their full name?
# Vectorised, case-insensitive substring test:
#   1. lower-case every full country name,
#   2. flag the names containing 'republic' (regex=False -> plain substring
#      match; na=False -> a missing name is treated as "no match" rather
#      than raising),
#   3. sum the boolean flags to get the number of matches.
# This is read-only: df itself is not modified.
count = int(
    df['country_long']
    .str.lower()
    .str.contains('republic', regex=False, na=False)
    .sum()
)
print(count)

125


In [248]:
# Which African country has the highest population?
# Step 1: keep only the rows whose continent is 'Africa'.
df_africa = df.loc[df['continent'].eq('Africa')]

In [249]:
# Display the Africa-only subset built in the previous cell.
df_africa

Unnamed: 0,country,country_long,currency,capital_city,region,continent,demonym,latitude,longitude,agricultural_land,...,population,women_parliament_seats_pct,rural_population,urban_population,press,democracy_score,democracy_type,median_age,political_leader,title
2,Algeria,People's Democratic Republic of Algeria,Algerian dinar,Algiers,Northern Africa,Africa,Algerian,28.0,3.0,413588.0,...,44903225,8.10811,11328186,33575039,1.71,3.5,Authoritarian,24.0,Abdelmadjid Tebboune,President
4,Angola,People's Republic of Angola,Angolan kwanza,Luanda,Middle Africa,Africa,Angolan,-12.5,18.5,569525.0,...,35588987,33.6364,11359649,24229338,2.24,3.62,Authoritarian,12.4,João Lourenço,President
17,Benin,Republic of Benin,West African CFA franc,Porto-Novo,Western Africa,Africa,Beninese,9.5,2.25,39500.0,...,13352864,7.40741,6738656,6614208,2.46,5.74,Hybrid regime,13.6,Patrice Talon,President
21,Botswana,Republic of Botswana,Botswana pula,Gaborone,Southern Africa,Africa,Motswana,-22.0,24.0,258620.0,...,2630296,11.1111,730591,1899705,3.12,7.81,Flawed democracy,19.7,Mokgweetsi Masisi,President
25,Burkina Faso,Burkina Faso,West African CFA franc,Ouagadougou,Western Africa,Africa,Burkinabe,13.0,-2.0,121430.0,...,22673762,16.9014,15446047,7227715,3.19,4.75,Hybrid regime,12.9,Roch Marc Christian Kaboré,President
26,Burundi,Republic of Burundi,Burundi franc,Bujumbura,Eastern Africa,Africa,Burundian,-3.5,30.0,20330.0,...,12889576,38.2114,11031286,1858290,1.48,2.33,Authoritarian,11.9,Pierre Nkurunziza,President
27,Cabo Verde,Republic of Cabo Verde,Cabo Verde escudo,Praia,Western Africa,Africa,Cape Verdian,16.0,-24.0,790.0,...,593149,38.8889,192507,400642,3.95,7.88,Flawed democracy,22.9,,
29,Cameroon,Republic of Cameroon,Central African CFA franc,Yaoundé,Middle Africa,Africa,Cameroonian,6.0,12.0,97500.0,...,27914536,33.8889,11519492,16395044,1.81,3.28,Authoritarian,13.6,Paul Biya,President
31,Central African Republic,Central African Republic,Central African CFA franc,Bangui,Middle Africa,Africa,Central African,7.0,21.0,50800.0,...,5579144,12.8571,3173417,2405727,1.65,1.52,Authoritarian,10.8,Faustin-Archange Touadéra,President
32,Chad,Republic of Chad,Central African CFA franc,N'Djamena,Middle Africa,Africa,Chadian,15.0,19.0,502380.0,...,17723315,25.8883,13458554,4264761,2.13,1.61,Authoritarian,11.2,Idriss Déby,President


In [251]:
# Country (or countries, on a tie) with the largest population within the
# Africa-only subset.
df_africa.loc[
    df_africa['population'].eq(df_africa['population'].max()),
    'country',
]

125    Nigeria
Name: country, dtype: object