#### Create a DataFrame `df` from the dictionary `data`, using `labels` as the index

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column-oriented records for ten pets; np.nan marks an unknown age.
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}

# One single-letter row label per record: 'a' through 'j'.
labels = list('abcdefghij')

# Build the frame with the letters as its index instead of the default 0..9.
df = pd.DataFrame(data=data, index=labels)

#### Display a summary of the basic information about this DataFrame and its data

In [3]:
# Print a concise summary: index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, a to j
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   animal    10 non-null     object 
 1   age       8 non-null      float64
 2   visits    10 non-null     int64  
 3   priority  10 non-null     object 
dtypes: float64(1), int64(1), object(2)
memory usage: 400.0+ bytes


#### Return the first 3 rows of the DataFrame df

In [4]:
# Positional slice of the first three rows (same rows as head(3)).
df.iloc[:3]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


#### Select just the ‘animal’ and ‘age’ columns from the DataFrame df

In [5]:
# Keep every row, but only the two requested columns.
df.loc[:, ['animal', 'age']]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


#### Select the data in rows [3, 4, 8] and in columns ['animal', 'age']

In [6]:
# The rows are given by position (the index holds letters), the columns by label:
# pick the rows positionally first, then narrow to the two columns.
df.iloc[[3, 4, 8]][['animal', 'age']]

Unnamed: 0,animal,age
d,dog,
e,dog,5.0
i,dog,7.0


#### Select only the rows where the number of visits is greater than 3

In [7]:
# Boolean mask: strictly more than three visits.
df.loc[df['visits'].gt(3)]

Unnamed: 0,animal,age,visits,priority


#### Select the rows where the age is missing, i.e., it is NaN

In [8]:
# Boolean mask: rows whose age is NaN.
df.loc[df['age'].isnull()]

Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


#### Select the rows where the animal is a cat and the age is less than 3

In [9]:
# Both conditions must hold; a NaN age compares as False and is therefore excluded.
df.loc[df['animal'].eq('cat') & df['age'].lt(3)]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
f,cat,2.0,3,no


#### Select the rows where the age is between 2 and 4 (inclusive)

In [10]:
# Closed interval [2, 4]: both endpoints are included.
df.loc[df['age'].ge(2) & df['age'].le(4)]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
f,cat,2.0,3,no
j,dog,3.0,1,no


#### Change the age in row ‘f’ to 1.5

In [11]:
# Label-based scalar assignment: row 'f', column 'age'.
df.loc['f', 'age'] = 1.5

#### Calculate the sum of all visits in df

In [12]:
# Total number of visits across all rows.
df.loc[:, 'visits'].sum()

19

#### Calculate the mean age for each different animal in df

In [13]:
# Group the ages by animal type and average each group (NaN ages are skipped by mean).
df['age'].groupby(df['animal']).mean()

animal
cat      2.333333
dog      5.000000
snake    2.500000
Name: age, dtype: float64

#### Append a new row ‘k’ to df with your choice of values for each column, then delete that row to return the original DataFrame

In [14]:
# Values follow the column order: animal, age, visits, priority.
new_row = ['dog', 5.5, 2, 'no']
# Assigning to a label that does not exist yet appends the row in place.
df.loc['k'] = new_row
# Remove it again so the frame returns to its previous ten rows.
df = df.drop(index='k')

#### Count the number of each type of animal in df

In [15]:
# Frequency of each distinct animal, most common first.
df.loc[:, 'animal'].value_counts()

animal
cat      4
dog      4
snake    2
Name: count, dtype: int64

#### Sort df first by the values in the ‘age’ column in descending order, then by the values in the ‘visits’ column in ascending order

In [16]:
# Returns a sorted copy; df itself is left in its original order.
df.sort_values(
    by=['age', 'visits'],
    ascending=[False, True],  # age: oldest first; visits: fewest first within equal ages
)

Unnamed: 0,animal,age,visits,priority
i,dog,7.0,2,no
e,dog,5.0,2,no
g,snake,4.5,1,no
j,dog,3.0,1,no
b,cat,3.0,3,yes
a,cat,2.5,1,yes
f,cat,1.5,3,no
c,snake,0.5,2,no
h,cat,,1,yes
d,dog,,3,yes


#### Replace the ‘priority’ column with boolean values (‘yes’ should be True and ‘no’ should be False)

In [17]:
# Translate the 'yes'/'no' strings into booleans.
# The identity entries for True/False make the cell safe to re-run: without them,
# a second execution would find no matching key for the already-converted values
# and turn the whole column into NaN.
priority_to_bool = {'yes': True, 'no': False, True: True, False: False}
df['priority'] = df['priority'].map(priority_to_bool)

#### In the ‘animal’ column, change the ‘snake’ entries to ‘python’

In [18]:
# Rename the species: every 'snake' entry becomes 'python'; other values are untouched.
df['animal'] = df['animal'].replace({'snake': 'python'})

#### For each animal type and each number of visits, find the mean age

In [19]:
# One row per animal, one column per visit count, each cell the mean age of that group.
# Combinations with no non-missing age show up as NaN.
df.pivot_table(
    values='age',
    index='animal',
    columns='visits',
    aggfunc='mean',
)

visits,1,2,3
animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,2.5,,2.25
dog,3.0,6.0,
python,4.5,0.5,


### Data Frames Advanced

#### Filter out rows which contain the same integer as the row immediately above

In [20]:
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
# shift() moves every value down one row, so comparing with it flags rows that
# repeat the value directly above them (the first row compares against NaN).
repeats_previous = df['A'].eq(df['A'].shift())
# Keep only the rows that do not repeat their predecessor.
df = df.loc[~repeats_previous]

#### Subtract the row mean from each element in the row

In [21]:
# Seeded generator so that Restart & Run All always produces the same 5x3 matrix
# of uniform samples from [0, 1).
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random(size=(5, 3)))
# mean(axis=1) yields one mean per row; sub(..., axis=0) aligns those means with
# the row index, so each row has its own mean subtracted.
df = df.sub(df.mean(axis=1), axis=0)

#### Which column of numbers has the smallest sum? Return that column’s label

In [22]:
# Seeded generator so the answer is the same on every Restart & Run All.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random(size=(5, 10)), columns=list('abcdefghij'))
# sum() totals each column; idxmin() returns the label of the smallest total.
smallest_sum_column = df.sum().idxmin()

#### For each row of a DataFrame with exactly 5 NaN entries per row, find the column which contains the third NaN value

In [23]:
nan = np.nan

# 5x10 matrix of floats in which every row has exactly five missing entries.
data = [[0.04,  nan,  nan, 0.25,  nan, 0.43, 0.71, 0.51,  nan,  nan],
        [ nan,  nan,  nan, 0.04, 0.76,  nan,  nan, 0.67, 0.76, 0.16],
        [ nan,  nan, 0.5 ,  nan, 0.31, 0.4 ,  nan,  nan, 0.24, 0.01],
        [0.49,  nan,  nan, 0.62, 0.73, 0.26, 0.85,  nan,  nan,  nan],
        [ nan,  nan, 0.41,  nan, 0.05,  nan, 0.61,  nan, 0.48, 0.68]]

columns = list('abcdefghij')

df = pd.DataFrame(data, columns=columns)

# Vectorized instead of a row-wise apply: cumsum(axis=1) keeps a running NaN
# count along each row, and the first column where that count equals 3 holds the
# third NaN. idxmax(axis=1) returns the label of the first True in each row.
# NOTE: a row with fewer than three NaNs would contain no True, and idxmax would
# then return the first column label; every row here has five NaNs, so this
# cannot happen with this data.
third_nan_column = (df.isna().cumsum(axis=1) == 3).idxmax(axis=1)