# Basic data structures

* Series - 1D
* DataFrame - 2D

# Pandas Series Creation and Indexing

Use data from a step-tracking application to create a Pandas Series

In [13]:
import pandas as pd
import numpy as np

# Raw step totals taken from the step-tracking application
step_data = [3620, 7891, 9761, 3907, 4338, 5373]

# Wrap the list in a named Series; with no index given, pandas numbers
# the entries 0..5 by default
step_counts = pd.Series(data=step_data, name='steps')

print(step_counts)

0    3620
1    7891
2    9761
3    3907
4    4338
5    5373
Name: steps, dtype: int64


Add a date range to the series

In [14]:
# Swap the default integer index for six consecutive calendar days
# ('D' is the default frequency of date_range, spelled out here for clarity)
step_counts.index = pd.date_range(start='20150329', periods=6, freq='D')

print(step_counts)

2015-03-29    3620
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: steps, dtype: int64


Select data by index values

In [15]:
# Label lookup: the date string is used as a key, much like a dict
steps_on_april_first = step_counts['2015-04-01']
print(steps_on_april_first)

3907


In [16]:
# Or by index position, like an array.
# Use .iloc for positional access: passing an integer to plain [] on a
# Series whose index is not integer-based relies on a positional fallback
# that pandas has deprecated.
print(step_counts.iloc[3])

3907


In [17]:
# Partial-string indexing: a year-month string matches every timestamp
# that falls inside that month
april_steps = step_counts.loc['2015-04']
print(april_steps)

2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: steps, dtype: int64


# Pandas data types and imputation 

Data types can be viewed and converted

In [18]:
# A Series holds a single dtype; .dtype reports the same value as .dtypes
print(step_counts.dtype)

int64


In [19]:
# Convert to float.
# np.float was only an alias for the builtin float and was removed in
# NumPy 1.24 (AttributeError); np.float64 is the dtype it resolved to.
step_counts = step_counts.astype(np.float64)
print(step_counts.dtypes)

float64


Invalid data points can be easily filled with values

In [20]:
# Create invalid data: blank out the 2nd and 3rd entries.
# .iloc makes it explicit that the slice is positional, not label-based.
step_counts.iloc[1:3] = np.nan

# Now fill the gaps with zeros. Reassigning the result instead of using
# inplace=True keeps the step visible and chainable.
step_counts = step_counts.fillna(0.)

print(step_counts)

2015-03-29    3620.0
2015-03-30       0.0
2015-03-31       0.0
2015-04-01    3907.0
2015-04-02    4338.0
2015-04-03    5373.0
Freq: D, Name: steps, dtype: float64


# Pandas DataFrame creation and methods

DataFrames can be created from lists, dictionaries, and Pandas Series

In [22]:
# Cycling distances; None marks a missing measurement
cycling_data = [10.7, 0, None, 2.4, 15.3, 10.9, 0, None]

# Pair each step count with a cycling distance as a list of 2-tuples.
# zip stops at the shorter input, so only the first len(step_data)
# cycling entries are used.
joined_data = [(steps, distance) for steps, distance in zip(step_data, cycling_data)]

# Build the DataFrame; rows and columns get default integer labels
activity_df = pd.DataFrame(data=joined_data)

print(activity_df)

      0     1
0  3620  10.7
1  7891   0.0
2  9761   NaN
3  3907   2.4
4  4338  15.3
5  5373  10.9


Labeled columns and index can be added

In [23]:
# Rebuild the frame with named columns and a daily date index
activity_df = pd.DataFrame(
    data=joined_data,
    columns=['Walking', 'Cycling'],
    index=pd.date_range(start='20150329', periods=6),
)

print(activity_df)

            Walking  Cycling
2015-03-29     3620     10.7
2015-03-30     7891      0.0
2015-03-31     9761      NaN
2015-04-01     3907      2.4
2015-04-02     4338     15.3
2015-04-03     5373     10.9


# Indexing DataFrame rows

DataFrame rows can be selected by label with the `loc` indexer or by integer position with the `iloc` indexer

In [25]:
# Look up a single row by its index label (a date string here)
row_by_label = activity_df.loc['20150401']
print(row_by_label)

Walking    3907.0
Cycling       2.4
Name: 2015-04-01 00:00:00, dtype: float64


In [26]:
# Look up a single row by integer position; negative values count from the end
row_by_position = activity_df.iloc[-3]
print(row_by_position)

Walking    3907.0
Cycling       2.4
Name: 2015-04-01 00:00:00, dtype: float64


# Indexing DataFrame columns

DataFrame columns can be indexed by name

In [27]:
# Bracket access with the column name returns that column as a Series
walking_by_name = activity_df['Walking']
print(walking_by_name)

2015-03-29    3620
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


DataFrame columns can also be accessed as attributes

In [28]:
# Attribute-style access: the column name is used like an object property
walking_by_attribute = activity_df.Walking
print(walking_by_attribute)

2015-03-29    3620
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


DataFrame columns can also be indexed by integer position

In [30]:
# All rows (:) of the column at position 0, i.e. the first column
first_column = activity_df.iloc[:, 0]
print(first_column)

2015-03-29    3620
2015-03-30    7891
2015-03-31    9761
2015-04-01    3907
2015-04-02    4338
2015-04-03    5373
Freq: D, Name: Walking, dtype: int64


# Reading data with Pandas

CSV and other common filetypes can be read with a single command

In [31]:
# Path to the Iris CSV, relative to the notebook's working directory
filepath = './Intel-ML101_Class1/data/Iris_Data.csv'

# Parse the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first five rows (5 is the default for head)
data.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Assigning new data to a DataFrame

Data can be (re)assigned to a DataFrame column

In [33]:
# Add a derived column: element-wise product of sepal length and sepal width
data['sepal_area'] = data['sepal_length'].mul(data['sepal_width'])

data.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area
0,5.1,3.5,1.4,0.2,Iris-setosa,17.85
1,4.9,3.0,1.4,0.2,Iris-setosa,14.7
2,4.7,3.2,1.3,0.2,Iris-setosa,15.04
3,4.6,3.1,1.5,0.2,Iris-setosa,14.26
4,5.0,3.6,1.4,0.2,Iris-setosa,18.0


# Applying a function to a DataFrame column

Functions can be applied to columns or rows of a DataFrame or Series

In [34]:
# Strip the 'Iris-' prefix from every species name.
# The vectorized .str accessor replaces the per-element lambda: it reads
# more directly and passes missing values through instead of raising.
# regex=False treats 'Iris-' as a literal string, matching str.replace.
data['abbrev'] = data['species'].str.replace('Iris-', '', regex=False)

data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area,abbrev
0,5.1,3.5,1.4,0.2,Iris-setosa,17.85,setosa
1,4.9,3.0,1.4,0.2,Iris-setosa,14.7,setosa
2,4.7,3.2,1.3,0.2,Iris-setosa,15.04,setosa
3,4.6,3.1,1.5,0.2,Iris-setosa,14.26,setosa
4,5.0,3.6,1.4,0.2,Iris-setosa,18.0,setosa


# Concatenating two DataFrames

Two DataFrames can be concatenated along either dimension

In [35]:
# Stack the first two rows on top of the last two rows
first_two_rows = data.head(2)
last_two_rows = data.tail(2)
small_data = pd.concat([first_two_rows, last_two_rows])

small_data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area,abbrev
0,5.1,3.5,1.4,0.2,Iris-setosa,17.85,setosa
1,4.9,3.0,1.4,0.2,Iris-setosa,14.7,setosa
148,6.2,3.4,5.4,2.3,Iris-virginica,21.08,virginica
149,5.9,3.0,5.1,1.8,Iris-virginica,17.7,virginica


# Aggregated statistics with GroupBy

Use the `groupby` method to calculate aggregated DataFrame statistics

In [37]:
# Count the rows in each species group with .size()
# (for a single Series, .value_counts() gives the equivalent tally)
species_groups = data.groupby(by='species')
group_sizes = species_groups.size()

group_sizes

species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
dtype: int64

# Performing Statistical Calculations

Pandas contains a variety of statistical methods 

In [39]:
# Mean calculated on a DataFrame, one value per numeric column.
# numeric_only=True is required because the frame also holds string
# columns (species, abbrev); pandas >= 2.0 raises TypeError on them
# instead of silently skipping them.
print(data.mean(numeric_only=True))

sepal_length     5.843333
sepal_width      3.054000
petal_length     3.758667
petal_width      1.198667
sepal_area      17.806533
dtype: float64


In [40]:
# Median petal length across all rows
print(data['petal_length'].median())

4.35


In [41]:
# Most frequent petal length; mode() returns a Series because ties are possible
print(data['petal_length'].mode())

0    1.5
dtype: float64


In [42]:
# Spread of petal length: standard deviation, variance, standard error of the mean
petal_spread = [getattr(data.petal_length, stat)() for stat in ('std', 'var', 'sem')]
print(*petal_spread)

1.7644204199522617 3.1131794183445156 0.144064324021


In [46]:
# The 0th quantile is the minimum of each numeric column.
# numeric_only=True is required because the frame also holds string
# columns; pandas >= 2.0 no longer drops them automatically and raises
# TypeError instead.
print(data.quantile(0, numeric_only=True))

sepal_length     4.3
sepal_width      2.0
petal_length     1.0
petal_width      0.1
sepal_area      10.0
Name: 0, dtype: float64

In [45]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,sepal_area
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667,17.806533
std,0.828066,0.433594,1.76442,0.763161,3.368693
min,4.3,2.0,1.0,0.1,10.0
25%,5.1,2.8,1.6,0.3,15.645
50%,5.8,3.0,4.35,1.3,17.66
75%,6.4,3.3,5.1,1.8,20.325
max,7.9,4.4,6.9,2.5,30.02


# Sampling from DataFrames

DataFrames can be randomly sampled from

In [50]:
# Draw five distinct rows (no replacement); the fixed random_state makes
# the draw repeatable
sample = data.sample(n=5, random_state=42, replace=False)

# Display only the last three columns of the sampled rows
last_three_columns = slice(-3, None)
sample.iloc[:, last_three_columns]

Unnamed: 0,species,sepal_area,abbrev
73,Iris-versicolor,17.08,versicolor
18,Iris-setosa,21.66,setosa
118,Iris-virginica,20.02,virginica
78,Iris-versicolor,17.4,versicolor
76,Iris-versicolor,19.04,versicolor
