# Pandas DataFrame Series exercise

In [2]:
import pandas as pd
import numpy as np

In [15]:
# Long-format table: one row per (patient, phylum) pair with its measured value.
# Built column by column; the four phyla repeat in the same order for each patient.
data = pd.DataFrame({
    'patient': [1, 1, 1, 1, 2, 2, 2, 2],
    'phylum': ['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'] * 2,
    'value': [632, 1638, 569, 115, 433, 1130, 754, 555],
})

data

Unnamed: 0,patient,phylum,value
0,1,Firmicutes,632
1,1,Proteobacteria,1638
2,1,Actinobacteria,569
3,1,Bacteroidetes,115
4,2,Firmicutes,433
5,2,Proteobacteria,1130
6,2,Actinobacteria,754
7,2,Bacteroidetes,555


In [5]:
# Without parentheses: `&` binds tighter than `>`, so this parses as
# `(data.phylum.str.endswith('bacteria') & data.value) > 1000` rather than
# combining two separate conditions. The displayed result is empty.
data[data.phylum.str.endswith('bacteria') & data.value>1000]

Unnamed: 0,patient,phylum,value


In [6]:
# With parentheses: the comparison is evaluated first, then the two boolean
# masks are combined with `&`, keeping only rows that satisfy both conditions.
data[data.phylum.str.endswith('bacteria') & (data.value>1000)]

Unnamed: 0,patient,phylum,value
1,1,Proteobacteria,1638
5,2,Proteobacteria,1130


In [7]:
# Six treatment flags (four 0s followed by two 1s) on a default 0..5 index.
treatment = pd.Series([0, 0, 0, 0, 1, 1])
treatment

0    0
1    0
2    0
3    0
4    1
5    1
dtype: int64

In [8]:
# Assigning a Series as a new column matches values by index label, not by position.
# `treatment` only has labels 0-5, so rows 6 and 7 receive NaN, and the column
# is shown as float in the output.
data['treatment']=treatment
data

Unnamed: 0,patient,phylum,value,treatment
0,1,Firmicutes,632,0.0
1,1,Proteobacteria,1638,0.0
2,1,Actinobacteria,569,0.0
3,1,Bacteroidetes,115,0.0
4,2,Firmicutes,433,1.0
5,2,Proteobacteria,1130,1.0
6,2,Actinobacteria,754,
7,2,Bacteroidetes,555,


In [9]:
# Number of rows in the frame.
len(data)

8

In [10]:
# A plain list assigned as a column must supply one entry per row. `data` has
# 8 rows, so this 4-item list raises ValueError. The error is caught and
# printed so the demonstration does not stop a Restart & Run All.
try:
    data['month']=['Jan','Feb','Mar','Apr']
except ValueError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))


ValueError: Length of values does not match length of index

In [12]:
# Repeat the value len(data) times so the list has exactly one entry per row.
data['month']=['Jan']*len(data)
data

Unnamed: 0,patient,phylum,value,treatment,month
0,1,Firmicutes,632,0.0,Jan
1,1,Proteobacteria,1638,0.0,Jan
2,1,Actinobacteria,569,0.0,Jan
3,1,Bacteroidetes,115,0.0,Jan
4,2,Firmicutes,433,1.0,Jan
5,2,Proteobacteria,1130,1.0,Jan
6,2,Actinobacteria,754,,Jan
7,2,Bacteroidetes,555,,Jan


In [13]:
# drop() removes rows by default (axis=0); pass axis=1 to remove a column instead.
# It returns a new frame, so rebind `data` to the result rather than mutating
# the existing object with inplace=True.
data = data.drop('month', axis=1)
data

Unnamed: 0,patient,phylum,value,treatment
0,1,Firmicutes,632,0.0
1,1,Proteobacteria,1638,0.0
2,1,Actinobacteria,569,0.0
3,1,Bacteroidetes,115,0.0
4,2,Firmicutes,433,1.0
5,2,Proteobacteria,1130,1.0
6,2,Actinobacteria,754,
7,2,Bacteroidetes,555,


In [15]:
# .values exposes the frame's contents as a NumPy array. With columns of
# mixed types, the output below is a single array of dtype=object.
data.values

array([[1, 'Firmicutes', 632, 0.0],
       [1, 'Proteobacteria', 1638, 0.0],
       [1, 'Actinobacteria', 569, 0.0],
       [1, 'Bacteroidetes', 115, 0.0],
       [2, 'Firmicutes', 433, 1.0],
       [2, 'Proteobacteria', 1130, 1.0],
       [2, 'Actinobacteria', 754, nan],
       [2, 'Bacteroidetes', 555, nan]], dtype=object)

In [11]:
# Small all-numeric frame: an integer column next to a float column.
df = pd.DataFrame(dict(foo=[12, 25, 36], bar=[0.4, -1.0, 4.5]))
# As one array, both columns share a single dtype (see the output below).
df.values

array([[12. ,  0.4],
       [25. , -1. ],
       [36. ,  4.5]])

In [13]:
# Inside the frame each column keeps its own dtype, as the output shows.
df.dtypes

foo      int64
bar    float64
dtype: object

In [14]:
# Display the frame itself.
df

Unnamed: 0,foo,bar
0,12,0.4
1,25,-1.0
2,36,4.5


In [16]:
# The row labels of `data`; the output shows a RangeIndex covering 0..7.
data.index

RangeIndex(start=0, stop=8, step=1)

In [27]:
# Index objects are immutable: item assignment raises TypeError. The error is
# caught and printed so the demonstration does not halt a full notebook run.
try:
    data.index[1]=15
except TypeError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))

TypeError: Index does not support mutable operations

In [28]:
# Counts labelled by phylum name: the names become the Series index.
bacteria = pd.Series(
    data=[632, 1638, 569, 115],
    index=['Firmicutes', 'Proteobacteria', 'Actinobacteria', 'Bacteroidetes'],
)

bacteria

Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
Bacteroidetes      115
dtype: int64

In [36]:
# Build the Series from a dict; the explicit `index` selects the dict entries
# by label and fixes the order in which they appear.
bacteria_dict = dict(Proteobacteria=1638, Actinobacteria=569,
                     Firmicutes=632, Bacteroidetes=115)
bacteria2 = pd.Series(
    data=bacteria_dict,
    index=['Bacteroidetes', 'Firmicutes', 'Proteobacteria', 'Actinobacteria'],
)
bacteria2

Bacteroidetes      115
Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
dtype: int64

In [37]:
# A label that is not a key of the dict ('Cyanobacteria') comes back as NaN,
# and a dict entry not listed in `index` ('Bacteroidetes') is left out.
bacteria_dict = dict(Proteobacteria=1638, Actinobacteria=569,
                     Firmicutes=632, Bacteroidetes=115)
bacteria2 = pd.Series(
    data=bacteria_dict,
    index=['Cyanobacteria', 'Firmicutes', 'Proteobacteria', 'Actinobacteria'],
)
bacteria2

Cyanobacteria        NaN
Firmicutes         632.0
Proteobacteria    1638.0
Actinobacteria     569.0
dtype: float64

In [38]:
# Assigning to .index relabels the existing values by position; nothing is
# realigned. In the output the values keep their order (NaN, 632, 1638, 569)
# under the new labels, so they no longer sit under their original phylum names.
bacteria2.index=bacteria.index
bacteria2

Firmicutes           NaN
Proteobacteria     632.0
Actinobacteria    1638.0
Bacteroidetes      569.0
dtype: float64

In [44]:
# Rebuild bacteria2 from the dict with the same construction used earlier.
# This restores its original labels after the index assignment above replaced them.
# 'Cyanobacteria' is not a key of the dict, so it shows up as NaN.
bacteria_dict = {'Proteobacteria': 1638, 'Actinobacteria': 569,'Firmicutes': 632, 
                 'Bacteroidetes': 115}
bacteria2 = pd.Series(bacteria_dict, 
                      index=['Cyanobacteria','Firmicutes',
                             'Proteobacteria','Actinobacteria'])
bacteria2

Cyanobacteria        NaN
Firmicutes         632.0
Proteobacteria    1638.0
Actinobacteria     569.0
dtype: float64

In [46]:
# Redefine `bacteria` with only three labelled counts.
bacteria = pd.Series(
    data=[632, 1638, 569],
    index=['Firmicutes', 'Proteobacteria', 'Actinobacteria'],
)

bacteria

Firmicutes         632
Proteobacteria    1638
Actinobacteria     569
dtype: int64

In [47]:
# A replacement index must supply exactly one label per element. bacteria2 has
# 4 elements but bacteria.index has only 3 labels, so this raises ValueError.
# The error is caught and printed so the demonstration does not halt a full run.
try:
    bacteria2.index=bacteria.index
except ValueError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))

ValueError: Length mismatch: Expected axis has 4 elements, new values have 3 elements