## 8.1. Selecting Data using Indexing and Slicing

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "darkgrid" look as the default plotting style.
sns.set_style(style="darkgrid")

# Load seaborn's example Titanic dataset and preview the first rows.
titanic_data = sns.load_dataset(name="titanic")
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### 8.1.1. Selecting Data using [ ]

In [2]:
# Single brackets with one column name return that column as a Series.
class_column = titanic_data["class"]
print(class_column)
type(class_column)

0       Third
1       First
2       Third
3       First
4       Third
        ...  
886    Second
887     First
888     Third
889     First
890     Third
Name: class, Length: 891, dtype: category
Categories (3, object): ['First', 'Second', 'Third']


pandas.core.series.Series

In [3]:
# Passing a list of column names selects several columns and yields a DataFrame.
selected_columns = ["class", "sex", "age"]
print(type(titanic_data[selected_columns]))
titanic_data[selected_columns]

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,class,sex,age
0,Third,male,22.0
1,First,female,38.0
2,Third,female,26.0
3,First,female,35.0
4,Third,male,35.0
...,...,...,...
886,Second,male,27.0
887,First,female,19.0
888,Third,female,
889,First,male,26.0


In [4]:
# Build a boolean mask, then keep only the rows where it is True.
is_male = titanic_data["sex"] == "male"
my_df = titanic_data[is_male]
my_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [5]:
# Combine two conditions element-wise with `&`; a row must satisfy both.
is_male = titanic_data["sex"] == "male"
is_first_class = titanic_data["class"] == "First"
my_df = titanic_data[is_male & is_first_class]
my_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
23,1,1,male,28.0,0,0,35.5,S,First,man,True,A,Southampton,yes,True
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
30,0,1,male,40.0,0,0,27.7208,C,First,man,True,,Cherbourg,no,True
34,0,1,male,28.0,1,0,82.1708,C,First,man,True,,Cherbourg,no,False


In [6]:
# Keep the rows whose age is one of the listed values.
ages = [20, 21, 22]
age_matches = titanic_data["age"].isin(ages)
age_dataset = titanic_data[age_matches]
age_dataset.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
12,0,3,male,20.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
37,0,3,male,21.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
51,0,3,male,21.0,0,0,7.8,S,Third,man,True,,Southampton,no,True
56,1,2,female,21.0,0,0,10.5,S,Second,woman,False,,Southampton,yes,True


### 8.1.2. Indexing and Slicing Using loc

In [7]:

import pandas as pd

# Each dict is one row; the dict keys become the DataFrame's column names.
scores = [
    {"Subject": "Mathematics", "Score": 85, "Grade": "B", "Remarks": "Good"},
    {"Subject": "History", "Score": 98, "Grade": "A", "Remarks": "Excellent"},
    {"Subject": "English", "Score": 76, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Science", "Score": 72, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Arts", "Score": 95, "Grade": "A", "Remarks": "Excellent"},
]

# No index is supplied, so the rows are labelled 0 through 4.
my_df = pd.DataFrame(data=scores)
my_df.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
1,History,98,A,Excellent
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [8]:
# `loc` with a single row label returns that row as a Series.
row_2 = my_df.loc[2]
print(row_2)
type(row_2)

Subject    English
Score           76
Grade            C
Remarks       Fair
Name: 2, dtype: object


pandas.core.series.Series

In [9]:
# Label-based slice: rows labelled 2 through 4 (the end label is included), all columns.
my_df.loc[2:4, :]


Unnamed: 0,Subject,Score,Grade,Remarks
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [10]:
# Rows labelled 2 through 4, restricted to the listed columns (in the listed order).
wanted_columns = ["Grade", "Score"]
my_df.loc[2:4, wanted_columns]

Unnamed: 0,Grade,Score
2,C,76
3,C,72
4,A,95


In [11]:

import pandas as pd

# Each dict is one row; the dict keys become the DataFrame's column names.
scores = [
    {"Subject": "Mathematics", "Score": 85, "Grade": "B", "Remarks": "Good"},
    {"Subject": "History", "Score": 98, "Grade": "A", "Remarks": "Excellent"},
    {"Subject": "English", "Score": 76, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Science", "Score": 72, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Arts", "Score": 95, "Grade": "A", "Remarks": "Excellent"},
]

# Use string labels "Student1" .. "Student5" as the row index instead of 0-4.
student_labels = [f"Student{number}" for number in range(1, 6)]
my_df = pd.DataFrame(data=scores, index=student_labels)
my_df

Unnamed: 0,Subject,Score,Grade,Remarks
Student1,Mathematics,85,B,Good
Student2,History,98,A,Excellent
Student3,English,76,C,Fair
Student4,Science,72,C,Fair
Student5,Arts,95,A,Excellent


In [12]:
# Look up one row by its string label; all columns are returned as a Series.
my_df.loc["Student1", :]

Subject    Mathematics
Score               85
Grade                B
Remarks           Good
Name: Student1, dtype: object

In [13]:
# A row label plus a column label addresses a single cell.
row_label, column_label = "Student1", "Grade"
my_df.loc[row_label, column_label]


'B'

In [14]:
# Slice by row labels (both ends included) and take only the "Grade" column.
first_label, last_label = "Student1", "Student2"
my_df.loc[first_label:last_label, "Grade"]

Student1    B
Student2    A
Name: Grade, dtype: object

In [15]:
# Same label slice, now extended through "Student4" (end label included).
first_label, last_label = "Student1", "Student4"
my_df.loc[first_label:last_label, "Grade"]

Student1    B
Student2    A
Student3    C
Student4    C
Name: Grade, dtype: object

In [16]:
# One boolean per row: only positions marked True (here the fourth row) are kept.
row_mask = [False, False, False, True, False]
my_df.loc[row_mask]

Unnamed: 0,Subject,Score,Grade,Remarks
Student4,Science,72,C,Fair


In [17]:
# Comparing a column with a scalar yields a boolean Series, one value per row.
score_column = my_df["Score"]
score_column > 80

Student1     True
Student2     True
Student3    False
Student4    False
Student5     True
Name: Score, dtype: bool

In [18]:
# Feed the boolean Series to `loc` to keep only rows with a score above 80.
scored_above_80 = my_df["Score"] > 80
my_df.loc[scored_above_80]

Unnamed: 0,Subject,Score,Grade,Remarks
Student1,Mathematics,85,B,Good
Student2,History,98,A,Excellent
Student5,Arts,95,A,Excellent


In [19]:
# Both conditions must hold for a row to be selected.
scored_above_80 = my_df["Score"] > 80
is_excellent = my_df["Remarks"] == "Excellent"
my_df.loc[scored_above_80 & is_excellent]

Unnamed: 0,Subject,Score,Grade,Remarks
Student2,History,98,A,Excellent
Student5,Arts,95,A,Excellent


In [20]:
# Row filter and column selection in a single `loc` call.
scored_above_80 = my_df["Score"] > 80
shown_columns = ["Score", "Grade"]
my_df.loc[scored_above_80, shown_columns]

Unnamed: 0,Score,Grade
Student1,85,B
Student2,98,A
Student5,95,A


In [21]:
# Assigning a scalar to a whole row via `loc` writes that value into EVERY
# column of the row: Subject, Score, Grade and Remarks all become 90.
# To change a single cell, address the column as well, e.g. my_df.loc["Student4", "Score"].
my_df.loc["Student4"] = 90
my_df

Unnamed: 0,Subject,Score,Grade,Remarks
Student1,Mathematics,85,B,Good
Student2,History,98,A,Excellent
Student3,English,76,C,Fair
Student4,90,90,90,90
Student5,Arts,95,A,Excellent


### 8.1.3. Indexing and Slicing Using iloc

In [22]:
import pandas as pd

# Each dict is one row; the dict keys become the DataFrame's column names.
scores = [
    {"Subject": "Mathematics", "Score": 85, "Grade": "B", "Remarks": "Good"},
    {"Subject": "History", "Score": 98, "Grade": "A", "Remarks": "Excellent"},
    {"Subject": "English", "Score": 76, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Science", "Score": 72, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Arts", "Score": 95, "Grade": "A", "Remarks": "Excellent"},
]

# Rebuild the frame with the default 0-4 index for the `iloc` examples.
my_df = pd.DataFrame(data=scores)
my_df.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
1,History,98,A,Excellent
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [23]:
# An integer position returns that row as a Series.
row_position = 3
my_df.iloc[row_position]

Subject    Science
Score           72
Grade            C
Remarks       Fair
Name: 3, dtype: object

In [24]:
# Wrapping the position in a list returns a one-row DataFrame instead of a Series.
row_positions = [3]
my_df.iloc[row_positions]

Unnamed: 0,Subject,Score,Grade,Remarks
3,Science,72,C,Fair


In [25]:
# A list of positions selects exactly those rows.
row_positions = [2, 3]
my_df.iloc[row_positions]

Unnamed: 0,Subject,Score,Grade,Remarks
2,English,76,C,Fair
3,Science,72,C,Fair


In [26]:
# Positional slice: rows 2 and 3 (the stop position 4 is excluded), all columns.
my_df.iloc[2:4, :]

Unnamed: 0,Subject,Score,Grade,Remarks
2,English,76,C,Fair
3,Science,72,C,Fair


In [27]:
# Select rows and columns by explicit lists of integer positions.
row_positions = [2, 3]
column_positions = [0, 1]
my_df.iloc[row_positions, column_positions]

Unnamed: 0,Subject,Score
2,English,76
3,Science,72


In [28]:
# Same selection expressed with slices: rows 2-3 and the first two columns.
my_df.iloc[2:4, :2]

Unnamed: 0,Subject,Score
2,English,76
3,Science,72


## 8.2. Dropping Rows and Columns with drop() Method

### 8.2.1. Dropping Rows

In [29]:
import pandas as pd

# Each dict is one row; the dict keys become the DataFrame's column names.
scores = [
    {"Subject": "Mathematics", "Score": 85, "Grade": "B", "Remarks": "Good"},
    {"Subject": "History", "Score": 98, "Grade": "A", "Remarks": "Excellent"},
    {"Subject": "English", "Score": 76, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Science", "Score": 72, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Arts", "Score": 95, "Grade": "A", "Remarks": "Excellent"},
]

# Fresh copy of the frame for the row-dropping examples.
my_df = pd.DataFrame(data=scores)
my_df.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
1,History,98,A,Excellent
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [30]:
# Drop the rows labelled 1 and 4; the result is a new frame and `my_df` is untouched.
rows_to_drop = [1, 4]
my_df2 = my_df.drop(index=rows_to_drop)
my_df2.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
2,English,76,C,Fair
3,Science,72,C,Fair


In [31]:
# Renumber the rows from 0; the previous labels are kept in a new "index" column.
my_df2.reset_index(drop=False, inplace=True)
my_df2.head()

Unnamed: 0,index,Subject,Score,Grade,Remarks
0,0,Mathematics,85,B,Good
1,2,English,76,C,Fair
2,3,Science,72,C,Fair


In [32]:
# Recreate `my_df2` from the original frame without the rows labelled 1 and 4.
rows_to_drop = [1, 4]
my_df2 = my_df.drop(index=rows_to_drop)
my_df2.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
2,English,76,C,Fair
3,Science,72,C,Fair


In [33]:
# Renumber the rows from 0 and discard the old labels instead of keeping them as a column.
my_df2.reset_index(drop=True, inplace=True)
my_df2.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
1,English,76,C,Fair
2,Science,72,C,Fair


In [34]:
# Without `inplace=True` the result of drop() is a new frame; it is not stored here,
# so `my_df` still shows all five rows below.
my_df.drop(index=[1, 3, 4])
my_df.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
1,History,98,A,Excellent
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [35]:
# With `inplace=True` the rows labelled 1, 3 and 4 are removed from `my_df` itself.
my_df.drop(index=[1, 3, 4], inplace=True)
my_df.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
2,English,76,C,Fair


### 8.2.2. Dropping Columns

In [56]:
import pandas as pd

# Each dict is one row; the dict keys become the DataFrame's column names.
scores = [
    {"Subject": "Mathematics", "Score": 85, "Grade": "B", "Remarks": "Good"},
    {"Subject": "History", "Score": 98, "Grade": "A", "Remarks": "Excellent"},
    {"Subject": "English", "Score": 76, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Science", "Score": 72, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Arts", "Score": 95, "Grade": "A", "Remarks": "Excellent"},
]

# Fresh copy of the frame for the column-dropping examples.
my_df = pd.DataFrame(data=scores)
my_df.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
1,History,98,A,Excellent
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [57]:
# Drop two columns by name; the result is a new frame and `my_df` keeps all four columns.
columns_to_drop = ["Subject", "Grade"]
my_df2 = my_df.drop(columns=columns_to_drop)
my_df2.head()

Unnamed: 0,Score,Remarks
0,85,Good
1,98,Excellent
2,76,Fair
3,72,Fair
4,95,Excellent


In [58]:
# Same column drop, but applied to `my_df` itself via `inplace=True`.
my_df.drop(columns=["Subject", "Grade"], inplace=True)
my_df.head()

Unnamed: 0,Score,Remarks
0,85,Good
1,98,Excellent
2,76,Fair
3,72,Fair
4,95,Excellent


## 8.3. Filtering Rows and Columns with filter() Method

### 8.3.1. Filtering Rows

In [41]:
import pandas as pd

# Each dict is one row; the dict keys become the DataFrame's column names.
scores = [
    {"Subject": "Mathematics", "Score": 85, "Grade": "B", "Remarks": "Good"},
    {"Subject": "History", "Score": 98, "Grade": "A", "Remarks": "Excellent"},
    {"Subject": "English", "Score": 76, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Science", "Score": 72, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Arts", "Score": 95, "Grade": "A", "Remarks": "Excellent"},
]

# Fresh copy of the frame for the row-filtering examples.
my_df = pd.DataFrame(data=scores)
my_df.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
1,History,98,A,Excellent
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [42]:
# filter() along axis 0 KEEPS the rows with the listed index labels.
rows_to_keep = [1, 3, 4]
my_df2 = my_df.filter(items=rows_to_keep, axis=0)
my_df2.head()

Unnamed: 0,Subject,Score,Grade,Remarks
1,History,98,A,Excellent
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [43]:
# Discard the old labels (1, 3, 4) and renumber the kept rows from 0.
renumbered = my_df2.reset_index(drop=True)
my_df2 = renumbered
my_df2.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,History,98,A,Excellent
1,Science,72,C,Fair
2,Arts,95,A,Excellent


### 8.3.2. Filtering Columns

In [44]:
import pandas as pd

# Each dict is one row; the dict keys become the DataFrame's column names.
scores = [
    {"Subject": "Mathematics", "Score": 85, "Grade": "B", "Remarks": "Good"},
    {"Subject": "History", "Score": 98, "Grade": "A", "Remarks": "Excellent"},
    {"Subject": "English", "Score": 76, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Science", "Score": 72, "Grade": "C", "Remarks": "Fair"},
    {"Subject": "Arts", "Score": 95, "Grade": "A", "Remarks": "Excellent"},
]

# Fresh copy of the frame for the column-filtering example.
my_df = pd.DataFrame(data=scores)
my_df.head()

Unnamed: 0,Subject,Score,Grade,Remarks
0,Mathematics,85,B,Good
1,History,98,A,Excellent
2,English,76,C,Fair
3,Science,72,C,Fair
4,Arts,95,A,Excellent


In [45]:
# filter() along the column axis KEEPS only the listed columns.
columns_to_keep = ["Score", "Grade"]
my_df2 = my_df.filter(items=columns_to_keep, axis="columns")
my_df2.head()

Unnamed: 0,Score,Grade
0,85,B
1,98,A
2,76,C
3,72,C
4,95,A


## 8.4. Sorting Dataframes

In [46]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "darkgrid" look as the default plotting style.
sns.set_style(style="darkgrid")

# Reload the Titanic dataset so the sorting examples start from the original row order.
titanic_data = sns.load_dataset(name="titanic")
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [47]:
# Sort by age, youngest passengers first (ascending is the default order).
age_sorted_data = titanic_data.sort_values(by="age", ascending=True)
age_sorted_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
803,1,3,male,0.42,0,1,8.5167,C,Third,child,False,,Cherbourg,yes,False
755,1,2,male,0.67,1,1,14.5,S,Second,child,False,,Southampton,yes,False
644,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
469,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False


In [48]:
# Sort by age, oldest passengers first.
age_sorted_data = titanic_data.sort_values(by="age", ascending=False)
age_sorted_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [49]:
# Sort by age first and break ties by fare, both in descending order.
sort_keys = ["age", "fare"]
age_sorted_data = titanic_data.sort_values(by=sort_keys, ascending=[False, False])
age_sorted_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
630,1,1,male,80.0,0,0,30.0,S,First,man,True,A,Southampton,yes,True
851,0,3,male,74.0,0,0,7.775,S,Third,man,True,,Southampton,no,True
493,0,1,male,71.0,0,0,49.5042,C,First,man,True,,Cherbourg,no,True
96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


## 8.5. Pandas Unique and Count Functions

In [50]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "darkgrid" look as the default plotting style.
sns.set_style(style="darkgrid")

# Reload the Titanic dataset for the unique/count examples.
titanic_data = sns.load_dataset(name="titanic")
titanic_data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [51]:
# List the distinct values that occur in the "class" column.
class_column = titanic_data["class"]
class_column.unique()

['Third', 'First', 'Second']
Categories (3, object): ['Third', 'First', 'Second']

In [52]:
# Number of distinct values in the "class" column (missing values are not counted).
titanic_data["class"].nunique(dropna=True)

3

In [53]:
# Count the non-missing entries in each column; values below 891 reveal columns with gaps.
titanic_data.count(axis=0)

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [54]:
# Tally how many rows fall into each passenger class.
class_column = titanic_data["class"]
class_column.value_counts()

Third     491
First     216
Second    184
Name: class, dtype: int64

## Exercise 8.1


### Question 1

Which function is used to sort a Pandas dataframe by a column value?

A- sort_dataframe() 

B- sort_rows()  

C- sort_values() 

D- sort_records()

Answer: C


### Question 2

To filter columns from a Pandas dataframe, you have to pass a list of column names to one of the following methods:


A- filter()  

B- filter_columns()  

C- apply_filter()

D- None of the above

Answer: A


### Question 3

To drop the second and fourth rows from a Pandas dataframe named `my_df`, you can use the following script:

A. my_df.drop([2,4])

B. my_df.drop([1,3])

C. my_df.delete([2,4])

D. my_df.delete([1,3])

Answer: B


## Exercise 8.2

From the titanic dataset, filter all the records where the fare is greater than 50 and the passenger travelled alone.
You can access the titanic dataset using the following Seaborn command:
    
```
import seaborn as sns

titanic_data = sns.load_dataset('titanic')
```

**Solution:**

In [55]:
import seaborn as sns

titanic_data = sns.load_dataset('titanic')

# Fare cut-off applied by this solution; every row displayed below has a fare above it.
FARE_THRESHOLD = 50

# Condition 1: the passenger paid more than the threshold fare.
paid_above_threshold = titanic_data["fare"] > FARE_THRESHOLD

# Condition 2: the passenger travelled alone. The "alone" column already holds
# True/False values, so it can be used directly as a mask without `== True`.
travelled_alone = titanic_data["alone"]

# Keep only the rows that satisfy both conditions.
my_df = titanic_data[paid_above_threshold & travelled_alone]
my_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
61,1,1,female,38.0,0,0,80.0,,First,woman,False,B,,yes,True
72,0,2,male,21.0,0,0,73.5,S,Second,man,True,,Southampton,no,True
74,1,3,male,32.0,0,0,56.4958,S,Third,man,True,,Southampton,yes,True
110,0,1,male,47.0,0,0,52.0,S,First,man,True,C,Southampton,no,True
