### Pandas

In [42]:
import pandas as pd
import numpy as np

In [3]:
iris = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data')

In [4]:
iris

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
type(iris)

pandas.core.frame.DataFrame

In [6]:
# make a copy of iris dataframe
df = iris.copy()

In [7]:
df.head()

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


#### Manipulating Data in Dataframe

In [12]:
df.columns = ['sl', 'sw', 'pl', 'pw', 'flower_type']
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [13]:
df.shape, df.dtypes

((149, 5),
 sl             float64
 sw             float64
 pl             float64
 pw             float64
 flower_type     object
 dtype: object)

In [14]:
df.describe()

Unnamed: 0,sl,sw,pl,pw
count,149.0,149.0,149.0,149.0
mean,5.848322,3.051007,3.774497,1.205369
std,0.828594,0.433499,1.759651,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [17]:
# to access a particular column - df.column_name  or df[sl]
df.sl, df['sl']

(0      4.9
 1      4.7
 2      4.6
 3      5.0
 4      5.4
       ... 
 144    6.7
 145    6.3
 146    6.5
 147    6.2
 148    5.9
 Name: sl, Length: 149, dtype: float64,
 0      4.9
 1      4.7
 2      4.6
 3      5.0
 4      5.4
       ... 
 144    6.7
 145    6.3
 146    6.5
 147    6.2
 148    5.9
 Name: sl, Length: 149, dtype: float64)

In [19]:
# column wise null entries
df.isnull().sum() 

sl             0
sw             0
pl             0
pw             0
flower_type    0
dtype: int64

In [20]:
# to access the data from somwhere in between - iloc
df.iloc[1:4, 2:4]

Unnamed: 0,pl,pw
1,1.3,0.2
2,1.5,0.2
3,1.4,0.2


In [24]:
# drop row by label
df.drop(0, inplace = False)
df.head()

KeyError: '[0] not found in axis'

In [26]:
df.index

RangeIndex(start=1, stop=149, step=1)

In [27]:
# drop row by index
df.drop(df.index[0], inplace = True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa


In [28]:
df[df['flower_type'] == 'Iris-setosa'].describe()

Unnamed: 0,sl,sw,pl,pw
count,47.0,47.0,47.0,47.0
mean,5.012766,3.429787,1.470213,0.246809
std,0.360324,0.386689,0.176826,0.110042
min,4.3,2.3,1.0,0.1
25%,4.8,3.15,1.4,0.2
50%,5.0,3.4,1.5,0.2
75%,5.2,3.7,1.6,0.3
max,5.8,4.4,1.9,0.6


In [31]:
# iloc and loc
print(df.head())
print(df.iloc[0]) # acc to position
print(df.loc[2])  # acc to label

    sl   sw   pl   pw  flower_type
2  4.6  3.1  1.5  0.2  Iris-setosa
3  5.0  3.6  1.4  0.2  Iris-setosa
4  5.4  3.9  1.7  0.4  Iris-setosa
5  4.6  3.4  1.4  0.3  Iris-setosa
6  5.0  3.4  1.5  0.2  Iris-setosa


(sl                     4.6
 sw                     3.1
 pl                     1.5
 pw                     0.2
 flower_type    Iris-setosa
 Name: 2, dtype: object,
 sl                     4.6
 sw                     3.1
 pl                     1.5
 pw                     0.2
 flower_type    Iris-setosa
 Name: 2, dtype: object)

In [32]:
# add a row
df.loc[0] = [1,2,3,4, 'Iris-setosa']
df.tail()

Unnamed: 0,sl,sw,pl,pw,flower_type
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica
148,5.9,3.0,5.1,1.8,Iris-virginica
0,1.0,2.0,3.0,4.0,Iris-setosa


In [33]:
# reset index
df.reset_index(drop = True, inplace = True)

In [34]:
df

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.6,3.1,1.5,0.2,Iris-setosa
1,5.0,3.6,1.4,0.2,Iris-setosa
2,5.4,3.9,1.7,0.4,Iris-setosa
3,4.6,3.4,1.4,0.3,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa
...,...,...,...,...,...
143,6.3,2.5,5.0,1.9,Iris-virginica
144,6.5,3.0,5.2,2.0,Iris-virginica
145,6.2,3.4,5.4,2.3,Iris-virginica
146,5.9,3.0,5.1,1.8,Iris-virginica


In [35]:
# delete column
df.drop('sl', axis = 1)


Unnamed: 0,sw,pl,pw,flower_type
0,3.1,1.5,0.2,Iris-setosa
1,3.6,1.4,0.2,Iris-setosa
2,3.9,1.7,0.4,Iris-setosa
3,3.4,1.4,0.3,Iris-setosa
4,3.4,1.5,0.2,Iris-setosa
...,...,...,...,...
143,2.5,5.0,1.9,Iris-virginica
144,3.0,5.2,2.0,Iris-virginica
145,3.4,5.4,2.3,Iris-virginica
146,3.0,5.1,1.8,Iris-virginica


In [36]:
# method 2
del df['sw']
df

Unnamed: 0,sl,pl,pw,flower_type
0,4.6,1.5,0.2,Iris-setosa
1,5.0,1.4,0.2,Iris-setosa
2,5.4,1.7,0.4,Iris-setosa
3,4.6,1.4,0.3,Iris-setosa
4,5.0,1.5,0.2,Iris-setosa
...,...,...,...,...
143,6.3,5.0,1.9,Iris-virginica
144,6.5,5.2,2.0,Iris-virginica
145,6.2,5.4,2.3,Iris-virginica
146,5.9,5.1,1.8,Iris-virginica


In [39]:
df = iris.copy()
df.columns = ['sl', 'sw', 'pl', 'pw', 'flower_type']
df.tail()

Unnamed: 0,sl,sw,pl,pw,flower_type
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica
148,5.9,3.0,5.1,1.8,Iris-virginica


In [40]:
# add column
df["diff_pl_pw"] = df["pl"]- df["pw"]
df.tail()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw
144,6.7,3.0,5.2,2.3,Iris-virginica,2.9
145,6.3,2.5,5.0,1.9,Iris-virginica,3.1
146,6.5,3.0,5.2,2.0,Iris-virginica,3.2
147,6.2,3.4,5.4,2.3,Iris-virginica,3.1
148,5.9,3.0,5.1,1.8,Iris-virginica,3.3


#### Handling NAN

In [43]:
# making some NAN entries for demo
df.iloc[2:4, 1:3] = np.nan
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1
2,4.6,,,0.2,Iris-setosa,1.3
3,5.0,,,0.2,Iris-setosa,1.2
4,5.4,3.9,1.7,0.4,Iris-setosa,1.3


In [44]:
df.describe()

Unnamed: 0,sl,sw,pl,pw,diff_pl_pw
count,149.0,147.0,147.0,149.0,149.0
mean,5.848322,3.046939,3.806122,1.205369,2.569128
std,0.828594,0.434048,1.750351,0.761292,1.047707
min,4.3,2.0,1.0,0.1,0.8
25%,5.1,2.8,1.6,0.3,1.4
50%,5.8,3.0,4.4,1.3,2.9
75%,6.4,3.3,5.1,1.8,3.3
max,7.9,4.4,6.9,2.5,4.7


In [46]:
# drop NaN using dropna()
df.dropna(inplace = True)
df.reset_index(drop = True, inplace = True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1
2,5.4,3.9,1.7,0.4,Iris-setosa,1.3
3,4.6,3.4,1.4,0.3,Iris-setosa,1.1
4,5.0,3.4,1.5,0.2,Iris-setosa,1.3


In [47]:
df.iloc[2:4, 1:3] = np.nan
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1
2,5.4,,,0.4,Iris-setosa,1.3
3,4.6,,,0.3,Iris-setosa,1.1
4,5.0,3.4,1.5,0.2,Iris-setosa,1.3


In [50]:
# filling the NaN entry with the mean of that column 
df.sw.fillna(df.sw.mean(), inplace = True)
df.pl.fillna(df.pl.mean(), inplace = True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1
2,5.4,3.038621,3.837241,0.4,Iris-setosa,1.3
3,4.6,3.038621,3.837241,0.3,Iris-setosa,1.1
4,5.0,3.4,1.5,0.2,Iris-setosa,1.3


In [51]:
# better way -- filling the NaN entry with the mean of vlaues having that particular flower type
df.iloc[2:4, 1:3] = np.nan
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1
2,5.4,,,0.4,Iris-setosa,1.3
3,4.6,,,0.3,Iris-setosa,1.1
4,5.0,3.4,1.5,0.2,Iris-setosa,1.3


In [53]:
df_setosa = df[df.flower_type == 'Iris-setosa']
df.sw.fillna(df_setosa.sw.mean(), inplace = True)
df.pl.fillna(df_setosa.pl.mean(), inplace = True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1
2,5.4,3.408889,1.462222,0.4,Iris-setosa,1.3
3,4.6,3.408889,1.462222,0.3,Iris-setosa,1.1
4,5.0,3.4,1.5,0.2,Iris-setosa,1.3


#### String based data

In [61]:
df['Gender'] = "Female"
df.iloc[0:10, 6] = "Male"
df.head()

  df.iloc[0:10, 6] = "Male"


Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw,sex,Gender
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2,Male,Female
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1,Male,Female
2,5.4,3.408889,1.462222,0.4,Iris-setosa,1.3,Male,Female
3,4.6,3.408889,1.462222,0.3,Iris-setosa,1.1,Male,Female
4,5.0,3.4,1.5,0.2,Iris-setosa,1.3,Male,Female


In [62]:
df.tail()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw,sex,Gender
142,6.7,3.0,5.2,2.3,Iris-virginica,2.9,1,Female
143,6.3,2.5,5.0,1.9,Iris-virginica,3.1,1,Female
144,6.5,3.0,5.2,2.0,Iris-virginica,3.2,1,Female
145,6.2,3.4,5.4,2.3,Iris-virginica,3.1,1,Female
146,5.9,3.0,5.1,1.8,Iris-virginica,3.3,1,Female


In [63]:
def f(s):
    if s=='Male':
        return 0
    else:
        return 1
df['sex'] = df['Gender'].apply(f)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw,sex,Gender
0,4.9,3.0,1.4,0.2,Iris-setosa,1.2,1,Female
1,4.7,3.2,1.3,0.2,Iris-setosa,1.1,1,Female
2,5.4,3.408889,1.462222,0.4,Iris-setosa,1.3,1,Female
3,4.6,3.408889,1.462222,0.3,Iris-setosa,1.1,1,Female
4,5.0,3.4,1.5,0.2,Iris-setosa,1.3,1,Female


In [64]:
df.tail()

Unnamed: 0,sl,sw,pl,pw,flower_type,diff_pl_pw,sex,Gender
142,6.7,3.0,5.2,2.3,Iris-virginica,2.9,1,Female
143,6.3,2.5,5.0,1.9,Iris-virginica,3.1,1,Female
144,6.5,3.0,5.2,2.0,Iris-virginica,3.2,1,Female
145,6.2,3.4,5.4,2.3,Iris-virginica,3.1,1,Female
146,5.9,3.0,5.1,1.8,Iris-virginica,3.3,1,Female
