In [1]:
# Teapod - Intro to Python Programming

#Pandas Basics

# Setup

First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:

In [2]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# Basics of Python

In [3]:
#Pandas dataframe
"""
Pandas is a high-level data manipulation tool developed by Wes McKinney.
It is built on the Numpy package and its key data structure is called the DataFrame.
DataFrames allow you to store and manipulate tabular data in rows of observations and columns of variables.

There are several ways to create a DataFrame. One way way is to use a dictionary.
"""

dict = {"country": ["Brazil", "Russia", "India", "China", "South Africa"],
       "capital": ["Brasilia", "Moscow", "New Delhi", "Beijing", "Pretoria"],
       "area": [8.516, 17.10, 3.286, 9.597, 1.221],
       "population": [200.4, 143.5, 1252, 1357, 52.98] }

import pandas as pd
brics = pd.DataFrame(dict)
print(brics)

        country    capital    area  population
0        Brazil   Brasilia   8.516      200.40
1        Russia     Moscow  17.100      143.50
2         India  New Delhi   3.286     1252.00
3         China    Beijing   9.597     1357.00
4  South Africa   Pretoria   1.221       52.98


In [4]:
# Set the index for brics
brics.index = ["BR", "RU", "IN", "CH", "SA"]

# Print out brics with new index values
print(brics)

         country    capital    area  population
BR        Brazil   Brasilia   8.516      200.40
RU        Russia     Moscow  17.100      143.50
IN         India  New Delhi   3.286     1252.00
CH         China    Beijing   9.597     1357.00
SA  South Africa   Pretoria   1.221       52.98


In [5]:
#Another way to create a DataFrame is by importing a csv file using Pandas.
#Now, the csv Health.csv is stored and can be imported using pd.read_csv

# Import pandas as pd
import pandas as pd

# Import the cars.csv data: cars
df = pd.read_csv('Health.csv')

# Print out cars
print(df)

  Ethnicity  Height (CM)  Weight (Kg) Will survive till 70
0     White        186.0         90.0                  Yes
1   African        185.0         98.0                   No
2     White        175.0         80.0                   No
3     White        180.0         88.0                  Yes
4     Asian        178.0          NaN                   No
5     Asian        172.0         72.0                  Yes
6   African        178.0         75.0                   No
7     White          NaN         89.0                  Yes
8   African        186.0         90.0                  Yes


In [6]:
df.columns

Index(['Ethnicity', 'Height (CM)', 'Weight (Kg)', 'Will survive till 70'], dtype='object')

In [7]:
df.head(5)

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg),Will survive till 70
0,White,186.0,90.0,Yes
1,African,185.0,98.0,No
2,White,175.0,80.0,No
3,White,180.0,88.0,Yes
4,Asian,178.0,,No


In [8]:
# Import cars data
import pandas as pd
df = pd.read_csv('Health.csv')

# Print out first 4 observations
print(df[0:4])
print("\n")
# Print out fifth, sixth, and seventh observation
print(df[4:6])

  Ethnicity  Height (CM)  Weight (Kg) Will survive till 70
0     White        186.0         90.0                  Yes
1   African        185.0         98.0                   No
2     White        175.0         80.0                   No
3     White        180.0         88.0                  Yes


  Ethnicity  Height (CM)  Weight (Kg) Will survive till 70
4     Asian        178.0          NaN                   No
5     Asian        172.0         72.0                  Yes


In [9]:
df.describe()

Unnamed: 0,Height (CM),Weight (Kg)
count,8.0,8.0
mean,180.0,85.25
std,5.264436,8.762746
min,172.0,72.0
25%,177.25,78.75
50%,179.0,88.5
75%,185.25,90.0
max,186.0,98.0


In [10]:
#You can also use loc and iloc to perform just about any data selection operation.
#loc is label-based, which means that you have to specify rows and columns based on their row and column labels.
#iloc is integer index based, so you have to specify rows and columns by their integer index like you did in the
#previous exercise


print(df.iloc[2])
print("\n")

Ethnicity               White
Height (CM)             175.0
Weight (Kg)              80.0
Will survive till 70       No
Name: 2, dtype: object




In [11]:
df.dtypes

Ethnicity                object
Height (CM)             float64
Weight (Kg)             float64
Will survive till 70     object
dtype: object

In [13]:
df["Ethnicity"].unique()

array(['White', 'African', 'Asian'], dtype=object)

In [15]:
df.Ethnicity.value_counts()

White      4
African    3
Asian      2
Name: Ethnicity, dtype: int64

In [16]:
df[['Ethnicity', 'Will survive till 70']]

Unnamed: 0,Ethnicity,Will survive till 70
0,White,Yes
1,African,No
2,White,No
3,White,Yes
4,Asian,No
5,Asian,Yes
6,African,No
7,White,Yes
8,African,Yes


In [18]:
sorted(df["Height (CM)"],reverse=True)

[186.0, 185.0, 180.0, 178.0, 178.0, 175.0, 172.0, nan, 186.0]

In [19]:
df.describe(include="all")

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg),Will survive till 70
count,9,8.0,8.0,9
unique,3,,,2
top,White,,,Yes
freq,4,,,5
mean,,180.0,85.25,
std,,5.264436,8.762746,
min,,172.0,72.0,
25%,,177.25,78.75,
50%,,179.0,88.5,
75%,,185.25,90.0,


In [20]:
df.isnull()

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg),Will survive till 70
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,False,False,False
7,False,True,False,False
8,False,False,False,False


In [21]:
df.isnull().sum()

Ethnicity               0
Height (CM)             1
Weight (Kg)             1
Will survive till 70    0
dtype: int64

In [22]:
df.drop(0)

Unnamed: 0,Ethnicity,Height (CM),Weight (Kg),Will survive till 70
1,African,185.0,98.0,No
2,White,175.0,80.0,No
3,White,180.0,88.0,Yes
4,Asian,178.0,,No
5,Asian,172.0,72.0,Yes
6,African,178.0,75.0,No
7,White,,89.0,Yes
8,African,186.0,90.0,Yes


In [23]:
df.drop(["Weight (Kg)"],axis=1,inplace=True)

In [24]:
df

Unnamed: 0,Ethnicity,Height (CM),Will survive till 70
0,White,186.0,Yes
1,African,185.0,No
2,White,175.0,No
3,White,180.0,Yes
4,Asian,178.0,No
5,Asian,172.0,Yes
6,African,178.0,No
7,White,,Yes
8,African,186.0,Yes


In [25]:
df["new"]="value"

In [26]:
df

Unnamed: 0,Ethnicity,Height (CM),Will survive till 70,new
0,White,186.0,Yes,value
1,African,185.0,No,value
2,White,175.0,No,value
3,White,180.0,Yes,value
4,Asian,178.0,No,value
5,Asian,172.0,Yes,value
6,African,178.0,No,value
7,White,,Yes,value
8,African,186.0,Yes,value


In [28]:
df.iloc[ : ,0:2]

Unnamed: 0,Ethnicity,Height (CM)
0,White,186.0
1,African,185.0
2,White,175.0
3,White,180.0
4,Asian,178.0
5,Asian,172.0
6,African,178.0
7,White,
8,African,186.0


In [32]:
max(df["Height (CM)"])

186.0

In [33]:
x=np.where(df["Height (CM)"]==max(df["Height (CM)"]))
df.iloc[x]

Unnamed: 0,Ethnicity,Height (CM),Will survive till 70,new
0,White,186.0,Yes,value
8,African,186.0,Yes,value


In [34]:
df["Height (CM)"]==max(df["Height (CM)"])

0     True
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
Name: Height (CM), dtype: bool