## 2.1 Pandas Series

In [2]:
import os

import pandas as pd
%matplotlib inline

In [3]:
# Build a small two-row DataFrame by hand: the dict keys become the column
# labels, `index` supplies the row labels, and `columns` fixes the column order.
scientists = pd.DataFrame(
    data={
        'Occupation': ['Chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-13'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 61],
    },
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Died', 'Age'],
)

scientists

Unnamed: 0,Occupation,Born,Died,Age
Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


In [4]:
# .loc selects a row by its index label and returns that row as a Series.
# NOTE: despite the variable name, 'William Gosset' is the second row of the
# frame built above, not the first.
first_row = scientists.loc["William Gosset"]
first_row

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object

In [5]:
# The index of a row Series holds the column labels of the source DataFrame.
first_row.index

# keys() is the method form and returns the same Index
first_row.keys()

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')

In [6]:
# .values exposes the data as a NumPy array; the dtype is object here because
# the row mixes strings with an integer.
first_row.values

array(['Statistician', '1876-06-13', '1937-10-16', 61], dtype=object)

A Pandas `Series` is similar to a `numpy.ndarray`, so many of the same summary methods are available on it.

In [7]:
# Selecting a single column with [] returns that column as a Series.
age = scientists["Age"]

In [8]:
# Basic descriptive statistics, each available as a Series method.
# NOTE(review): a default Jupyter kernel only displays the last expression of a
# cell; the recorded output shows all five results, so presumably
# InteractiveShell.ast_node_interactivity is set to 'all' somewhere — confirm.
age.mean()
age.min()
age.max()
age.std()       # sample standard deviation (pandas defaults to ddof=1)
age.describe()  # count, mean, std, min, quartiles and max in a single Series

49.0

37

61

16.970562748477139

count     2.000000
mean     49.000000
std      16.970563
min      37.000000
25%      43.000000
50%      49.000000
75%      55.000000
max      61.000000
Name: Age, dtype: float64

### 2.1.1 Boolean Indexing

In [9]:
# Load the scientists data from CSV. This rebinds `scientists`, replacing the
# hand-built two-row frame used in the cells above.
scientists = pd.read_csv('../data/scientists.csv')

In [10]:
# Show the whole frame; rows are now labelled with the default integer index
# and the names live in a regular 'Name' column.
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [11]:
# Pull the 'Age' column out as a Series; it keeps the frame's integer index.
ages = scientists['Age']
ages

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [12]:
# Boolean indexing: first build a True/False mask with one entry per row,
# then use it to keep only the rows where the mask is True.
older_than_average = ages > ages.mean()
ages[older_than_average]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

### 2.1.2 Vector Operations
Operations Are Automatically Aligned and Vectorized (Broadcasting)

In [13]:
# Element-wise addition: each value is added to the value that shares its index label.
ages + ages

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [14]:
# Element-wise multiplication, paired up by index label in the same way.
ages * ages

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64

In [15]:
# The arithmetic above returned new Series each time; `ages` itself is unchanged.
ages

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

#### 2.1.2.1 Vectors and Scalars

In [16]:
# A scalar operand is applied to every element of the Series.
ages + 100

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64

#### 2.1.2.2 Vectors with Different Lengths
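When both operands are Pandas Series, they are aligned on their index labels rather than broadcast: labels present in only one Series produce `NaN` in the result.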

In [18]:
# The new Series only has labels 0 and 1, so only those two rows find a partner.
# Labels 2-7 have nothing to add to and come back as NaN, which is also why the
# result dtype becomes float64.
ages + pd.Series([1, 100])

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64

In [19]:
import numpy as np

In [20]:
# A plain NumPy array carries no index labels to align on, so NumPy's shape
# rules apply and lengths 8 and 2 cannot be combined. The error is the point of
# this cell, so catch it and show the message instead of halting "Run All".
try:
    ages + np.array([1, 100])
except ValueError as err:
    print("ValueError:", err)

ValueError: operands could not be broadcast together with shapes (8,) (2,) 

### 2.1.3 Automatic Alignment

In [22]:
# Reverse the row order by sorting on the index in descending order.
# Each value keeps its original label; only the ordering changes.
rev_ages = ages.sort_index(ascending=False)
rev_ages

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64

In [23]:
# Adding ages and rev_ages aligns the two Series on their index labels before
# the operation is carried out, so the reversed order has no effect on the result.
ages + rev_ages

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [24]:
# Same result as simply doubling every value, because alignment paired each
# label in ages with the identical label in rev_ages.
ages * 2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

## 2.2 Pandas DataFrames
The same concepts and operations apply to DataFrames as to Pandas Series.

### 2.2.1 Adding and Manipulating Columns

In [25]:
# Inspect the column types: the date columns were read from the CSV as plain
# strings (object dtype), not as datetimes.
scientists.dtypes

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object

In [26]:
# Preview the first five rows before adding any columns.
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [27]:
# Parse the ISO-formatted date strings into real datetime Series.
DATE_FORMAT = '%Y-%m-%d'
born_dt = pd.to_datetime(scientists['Born'], format=DATE_FORMAT)
died_dt = pd.to_datetime(scientists['Died'], format=DATE_FORMAT)

# Attach them as new columns alongside the original string columns.
scientists['born_dt'] = born_dt
scientists['died_dt'] = died_dt

scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14


In [28]:
# Confirm the two new columns are datetime64[ns] while the originals remain strings.
scientists.dtypes

Name                  object
Born                  object
Died                  object
Age                    int64
Occupation            object
born_dt       datetime64[ns]
died_dt       datetime64[ns]
dtype: object

In [29]:
# Subtracting one datetime column from another gives a timedelta column:
# here, each person's lifespan expressed in days.
lifespan = scientists['died_dt'] - scientists['born_dt']
scientists['age_days_dt'] = lifespan
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,age_days_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,20777 days


In [30]:
# Convert the lifespan from days to whole years.
# astype('timedelta64[Y]') is rejected by pandas >= 2.0 (only the s/ms/us/ns
# resolutions are supported), so divide the day count by the mean Gregorian
# year length instead. 365.2425 is the year length NumPy uses for its 'Y' unit,
# and floor division keeps the truncated float result (37.0, 61.0, ...) that
# the old cast produced.
DAYS_PER_YEAR = 365.2425
scientists['age_years_dt'] = scientists['age_days_dt'].dt.days // DAYS_PER_YEAR
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation,born_dt,died_dt,age_days_dt,age_years_dt
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,13779 days,37.0
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,22404 days,61.0
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,32964 days,90.0
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,24345 days,66.0
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,20777 days,56.0


### 2.2.2 Dropping Columns

In [31]:
# Remove both original string date columns in one call; the parsed datetime
# columns replace them. drop() returns a new frame, so rebind the name.
scientists = scientists.drop(['Born', 'Died'], axis=1)

# columns remaining after the drop
print(scientists.columns)

Index(['Name', 'Age', 'Occupation', 'born_dt', 'died_dt', 'age_days_dt',
       'age_years_dt'],
      dtype='object')


In [32]:
# Preview the frame now that 'Born' and 'Died' are gone.
scientists.head()

Unnamed: 0,Name,Age,Occupation,born_dt,died_dt,age_days_dt,age_years_dt
0,Rosaline Franklin,37,Chemist,1920-07-25,1958-04-16,13779 days,37.0
1,William Gosset,61,Statistician,1876-06-13,1937-10-16,22404 days,61.0
2,Florence Nightingale,90,Nurse,1820-05-12,1910-08-13,32964 days,90.0
3,Marie Curie,66,Chemist,1867-11-07,1934-07-04,24345 days,66.0
4,Rachel Carson,56,Biologist,1907-05-27,1964-04-14,20777 days,56.0


## 2.3 Exporting and Importing Data

In [37]:
# to_pickle does not create missing directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs('../output', exist_ok=True)
scientists.to_pickle('../output/scientists.pickle')

In [38]:
# Read the pickle written above back into a new frame.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
scientists_from_pickle = pd.read_pickle('../output/scientists.pickle')
scientists_from_pickle.head()

Unnamed: 0,Name,Age,Occupation,born_dt,died_dt,age_days_dt,age_years_dt
0,Rosaline Franklin,37,Chemist,1920-07-25,1958-04-16,13779 days,37.0
1,William Gosset,61,Statistician,1876-06-13,1937-10-16,22404 days,61.0
2,Florence Nightingale,90,Nurse,1820-05-12,1910-08-13,32964 days,90.0
3,Marie Curie,66,Chemist,1867-11-07,1934-07-04,24345 days,66.0
4,Rachel Carson,56,Biologist,1907-05-27,1964-04-14,20777 days,56.0


### 2.3.1 To CSV

In [40]:
# Write the frame as comma-separated text, then again as tab-separated text by
# overriding the delimiter with `sep`.
scientists.to_csv('../output/scientists.csv')
scientists.to_csv('../output/scientists.tsv', sep='\t')

In [41]:
# index=False leaves the row labels (here the integer row numbers) out of the file
scientists.to_csv('../output/scientists_no_index.csv', index=False)

In [42]:
# Read the index-free CSV back in.
# NOTE(review): age_days_dt now displays as '13779 days 00:00:00.000000000',
# which suggests the CSV round trip brought the datetime/timedelta columns back
# as plain strings — confirm with scientists_from_csv.dtypes.
scientists_from_csv = pd.read_csv('../output/scientists_no_index.csv')
scientists_from_csv.head()

Unnamed: 0,Name,Age,Occupation,born_dt,died_dt,age_days_dt,age_years_dt
0,Rosaline Franklin,37,Chemist,1920-07-25,1958-04-16,13779 days 00:00:00.000000000,37.0
1,William Gosset,61,Statistician,1876-06-13,1937-10-16,22404 days 00:00:00.000000000,61.0
2,Florence Nightingale,90,Nurse,1820-05-12,1910-08-13,32964 days 00:00:00.000000000,90.0
3,Marie Curie,66,Chemist,1867-11-07,1934-07-04,24345 days 00:00:00.000000000,66.0
4,Rachel Carson,56,Biologist,1907-05-27,1964-04-14,20777 days 00:00:00.000000000,56.0


### 2.3.2 To Excel

In [44]:
# Write the frame to a named worksheet, leaving the row labels out of the file.
scientists.to_excel(
    '../output/scientists_df.xlsx',
    sheet_name='scientists',
    index=False,
)

In [46]:
# Take a single column as a Series and confirm its type before exporting it.
# NOTE(review): the original remark that Series has no to_excel method looks
# outdated — current pandas releases provide Series.to_excel — so the
# conversion to a DataFrame below may no longer be required; confirm against
# the installed pandas version.
age = scientists["Age"]
type(age)

pandas.core.series.Series

In [47]:
# Wrap the Series in a one-column DataFrame, then export it to its own worksheet.
age_df = age.to_frame()
age_df.to_excel(
    '../output/age_df.xlsx',
    sheet_name='age',
    index=False,
)