# Imports

In [1]:
import pandas as pd
import time

# Creating DataFrames

## Creating a DataFrame from a local CSV

In [2]:
# Read the locally stored iris CSV into a DataFrame, then preview it.
iris_csv_path = 'data/iris/iris_data.csv'
irisDF = pd.read_csv(iris_csv_path)
irisDF.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,types
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Creating a DataFrame from a CSV online

In [3]:
# read_csv also accepts a URL. header=None tells it not to treat the first
# line as column names, so the columns are labelled with integers instead.
# NOTE: the name iris2DF is reassigned later in this notebook.
iris_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris2DF = pd.read_csv(iris_data_url, header=None)
iris2DF.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Creating a DataFrame from a Python dictionary

In [4]:
# Column-oriented input: every key becomes a column label and every list
# holds that column's values in row order.
teacher_names = ['Timo', 'William', 'Yohann']
iq_scores = [125, 100, 65]
friend_counts = [10, 20, 0]

data_dict = {'Name': teacher_names, 'IQ': iq_scores, 'Number of friends': friend_counts}
teacherDF = pd.DataFrame(data_dict)
teacherDF.head()

Unnamed: 0,Name,IQ,Number of friends
0,Timo,125,10
1,William,100,20
2,Yohann,65,0


## Creating a DataFrame from a list of Python dictionaries

In [5]:
# Row-oriented input: one dictionary per teacher. Each dictionary becomes a
# row and its keys become the column labels.
timo_dict = {
    'Name': 'Timo',
    'IQ': 125,
    'Number of friends': 10,
}
will_dict = {
    'Name': 'William',
    'IQ': 100,
    'Number of friends': 20,
}
yohi_dict = {
    'Name': 'Yohann',
    'IQ': 65,
    'Number of friends': 0,
}
data_list = [timo_dict, will_dict, yohi_dict]

teacherDF = pd.DataFrame(data_list)
teacherDF.head()

Unnamed: 0,Name,IQ,Number of friends
0,Timo,125,10
1,William,100,20
2,Yohann,65,0


# Indexing DataFrames

## Indexing columns

In [6]:
# Attribute-style access: the column label is written as if it were an
# attribute of the DataFrame. A label such as 'Number of friends' contains
# spaces and therefore cannot be written this way.
names = teacherDF.Name # This is one possible syntax
names

0       Timo
1    William
2     Yohann
Name: Name, dtype: object

In [7]:
# Bracket-style access: the column label is passed as a string, so the same
# form also works for labels with spaces such as 'Number of friends'.
names = teacherDF['Name'] # This is the preferred syntax
names

0       Timo
1    William
2     Yohann
Name: Name, dtype: object

## Indexing rows

In [8]:
# .loc selects by index label; label 0 is the first row of teacherDF here.
# The result is a Series whose index holds the DataFrame's column labels.
timo_series = teacherDF.loc[0]
timo_series

Name                 Timo
IQ                    125
Number of friends      10
Name: 0, dtype: object

## Indexing rows and columns

In [9]:
# Column first, then row: select the 'Number of friends' column, then pick
# the entry with row label 2 out of it.
lowest_friend_count = teacherDF['Number of friends'].loc[2]
lowest_friend_count

0

In [10]:
# Row first, then column: select the row with label 2, then pick its
# 'Number of friends' entry. This reads the same cell as column-then-row
# indexing; only the order of the two lookups differs.
lowest_friend_count = teacherDF.loc[2]['Number of friends']
lowest_friend_count

0

In [11]:
# Correction: Yohann actually has one friend.
# To write a single cell, hand .loc the row label and the column label
# together in one indexing call.
yohann_row = 2
friends_column = 'Number of friends'
teacherDF.loc[yohann_row, friends_column] = 1
teacherDF.head()

Unnamed: 0,Name,IQ,Number of friends
0,Timo,125,10
1,William,100,20
2,Yohann,65,1


# Vectorized operations

Suppose we want to add a new column to the irisDF dataset that holds sepal_length converted to inches. This can be done either with a vectorized operation or with an explicit loop.

In [12]:
# Report the (rows, columns) shape, then preview the first rows.
iris_shape = irisDF.shape
print(iris_shape)
irisDF.head()

(150, 5)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,types
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [13]:
# Number of rows in the dataset, reported alongside the timings later on.
num_entries = len(irisDF)

# Two independent copies, so adding a column to either one leaves irisDF untouched.
iris1DF = irisDF.copy(deep=True)
iris2DF = irisDF.copy(deep=True)

In [14]:
# non-vectorized
# Multiplier applied to sepal_length to express it in inches (1 cm = 0.393701 in).
INCHES_PER_CM = 0.393701

# Create the destination column up front, as floats, so the loop only fills it in.
iris1DF['sepal_length_inches'] = 0.0

# perf_counter() is the clock meant for measuring short durations; time.time()
# can be too coarse to resolve intervals this small.
start_time = time.perf_counter()
# Walk the frame's own index labels: .loc looks rows up by label, so this stays
# correct even if the index is not the default 0..n-1 range.
for idx in iris1DF.index:
    iris1DF.loc[idx, 'sepal_length_inches'] = iris1DF.loc[idx, 'sepal_length'] * INCHES_PER_CM
non_vect_op_time = time.perf_counter() - start_time

In [15]:
# vectorized
# Multiplier applied to sepal_length to express it in inches (1 cm = 0.393701 in).
INCHES_PER_CM = 0.393701

# perf_counter() is the clock meant for measuring short durations; time.time()
# can be too coarse to resolve an operation this fast.
start_time = time.perf_counter()
# Read from and write to the same frame (iris2DF), so this cell does not rely
# on another frame's rows lining up with it. The whole column is converted in
# one operation.
iris2DF['sepal_length_inches'] = iris2DF['sepal_length'] * INCHES_PER_CM
vect_op_time = time.perf_counter() - start_time

In [16]:
# Report both timings and the resulting speed-up factor.
print(f'It takes {non_vect_op_time:.5f} sec to update {num_entries} entries using non-vectorized operations')
print(f'It takes {vect_op_time:.5f} sec to update {num_entries} entries using vectorized operations')

# Guard the ratio: a coarse timer can report 0.0 for the vectorized step,
# which would otherwise raise ZeroDivisionError here.
speedup = non_vect_op_time / vect_op_time if vect_op_time > 0 else float('inf')
print(f'Vectorized operations are {speedup:.1f} times faster')

It takes 0.02294 sec to update 150 entries using non-vectorized operations
It takes 0.00102 sec to update 150 entries using vectorized operations
Vectorized operations are 22.4 times faster
