# Getting Started with pandas

In [1]:
import numpy as np
import pandas as pd
#from icecream import ic

# Introduction to Pandas Data Structures
Series

A Series is a one-dimensional array-like object containing a sequence of values (of similar types to NumPy types) and an associated array of data labels, called its index. The simplest Series is formed from only an array of data:


In [2]:
PyMarks=pd.Series([16,15,12,20],index=['Asavari','Rashmi','Manoj','Chetan'])
PyMarks['Manoj']

12

In [3]:
PyMarks.values

array([16, 15, 12, 20], dtype=int64)

In [4]:
PyMarks.index=['a','b','c','d']
PyMarks

a    16
b    15
c    12
d    20
dtype: int64

In [5]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [6]:
obj2['a']

-5

In [7]:
# obj2 is indexed by string labels, so pick the second element by position
# explicitly with .iloc; a bare integer key relies on deprecated fallback.
obj2.iloc[1]

7

In [8]:
PyMarks[PyMarks <= 16]

a    16
b    15
c    12
dtype: int64

In [9]:
np.log(PyMarks)

a    2.772589
b    2.708050
c    2.484907
d    2.995732
dtype: float64

In [10]:
Int1=pd.Series([10,20,30],index=["a","b","c"])
Int2=pd.Series([20,30,40],index=["a","b","d"])

In [11]:
Int=Int1+Int2
Int

a    30.0
b    50.0
c     NaN
d     NaN
dtype: float64

In [12]:
Int12=Int.isnull().sum()
Int12

2

Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping of index values to data values.

In [13]:
'a' in PyMarks

True

# Creating Series from Dictionary

In [14]:
sdata = {'Ohio': 35000, 
         'Texas': 71000, 
         'Oregon': 16000, 
         'Utah': 5000,
        'India': np.nan}
obj3 = pd.Series(sdata)
obj3

Ohio      35000.0
Texas     71000.0
Oregon    16000.0
Utah       5000.0
India         NaN
dtype: float64

In [15]:
obj3.index

Index(['Ohio', 'Texas', 'Oregon', 'Utah', 'India'], dtype='object')

In [16]:
obj3.values

array([35000., 71000., 16000.,  5000.,    nan])

In [17]:
# Build a Series from sdata using an explicit index: values are looked up by
# label, so the result follows the order of sindex rather than of the dict.
sindex=['Oregon', 'Utah','India','Ohio', 'Texas','Pakistan']   # Index sequence changed; observe the missing value for 'Pakistan', which is not a key of sdata
obj4=pd.Series(sdata,index=sindex)
obj4

Oregon      16000.0
Utah         5000.0
India           NaN
Ohio        35000.0
Texas       71000.0
Pakistan        NaN
dtype: float64

# Creating Series from Lists

In [18]:
obj5=pd.Series([12,14,16],index=["a","b","c"])
obj5

a    12
b    14
c    16
dtype: int64

In [19]:
obj6=pd.Series({"a":23,"b":34,"c":45,"d":34})
obj6

a    23
b    34
c    45
d    34
dtype: int64

In [20]:
obj5+obj6

a    35.0
b    48.0
c    61.0
d     NaN
dtype: float64

In [21]:
obj6-obj5

a    11.0
b    20.0
c    29.0
d     NaN
dtype: float64

# DataFrame

A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). The DataFrame has both a row and column index; it can be thought of as a dict of Series all sharing the same index.


In [22]:
# Constructing Data Frame using Dictionary

data = {'State': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'Year': [2000, 2001, 2002, 2001, 2002, 2003],
        'Pop': [1.5, 1.7, np.nan, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data,index=[1,2,3,4,5,6])
frame.tail(3)

Unnamed: 0,State,Year,Pop
4,Nevada,2001,2.4
5,Nevada,2002,2.9
6,Nevada,2003,3.2


In [23]:
# head method selects only the first two rows:
print(frame.head(2))

frame.tail(3)

  State  Year  Pop
1  Ohio  2000  1.5
2  Ohio  2001  1.7


Unnamed: 0,State,Year,Pop
4,Nevada,2001,2.4
5,Nevada,2002,2.9
6,Nevada,2003,3.2


In [24]:
# Rearrangement of Columns
pd.DataFrame(data, columns=['State', 'Pop','Year'])

Unnamed: 0,State,Pop,Year
0,Ohio,1.5,2000
1,Ohio,1.7,2001
2,Ohio,,2002
3,Nevada,2.4,2001
4,Nevada,2.9,2002
5,Nevada,3.2,2003


In [25]:
# If you pass a column that isn’t contained in the dict, it will appear with missing values in the result:
frame4=pd.DataFrame(data, columns=['Year', 'State', 'Pop','Extra'])
frame4

Unnamed: 0,Year,State,Pop,Extra
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [26]:
frame4["Extra"]=pd.Series([1,2,3,4,5,6],index=[2,3,1,0,4,5])
frame4["Extra2"]=frame4["Extra"]+frame4["Pop"]
frame4

Unnamed: 0,Year,State,Pop,Extra,Extra2
0,2000,Ohio,1.5,4,5.5
1,2001,Ohio,1.7,3,4.7
2,2002,Ohio,,1,
3,2001,Nevada,2.4,2,4.4
4,2002,Nevada,2.9,5,7.9
5,2003,Nevada,3.2,6,9.2


In [27]:
frame4.columns

Index(['Year', 'State', 'Pop', 'Extra', 'Extra2'], dtype='object')

In [28]:
frame4.index=["a","b","c","d","e","f"]
frame4.index

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [29]:
# index assignment
frame2=pd.DataFrame(data, columns=['Year', 'State', 'Pop'],index=["a","b","c","d","e","f"])
frame2

Unnamed: 0,Year,State,Pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,
d,2001,Nevada,2.4
e,2002,Nevada,2.9
f,2003,Nevada,3.2


In [30]:
# Third entry of the 'State' column, selected by position. The index holds
# string labels, so positional access must go through .iloc; it is printed
# because only the last expression of a cell is displayed automatically.
print(frame2['State'].iloc[2])
# Whole second column (position 1), again selected by position.
frame2.iloc[:,1]

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: State, dtype: object

In [31]:
# Accessing Particular Column
frame2['State']

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: State, dtype: object

In [32]:
frame2

Unnamed: 0,Year,State,Pop
a,2000,Ohio,1.5
b,2001,Ohio,1.7
c,2002,Ohio,
d,2001,Nevada,2.4
e,2002,Nevada,2.9
f,2003,Nevada,3.2


In [33]:
# Accessing Particular Column using dot operator
# (attribute access works when the column name is a valid Python identifier)
frame2.State

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: State, dtype: object

In [34]:
np.unique(frame2['State'])

array(['Nevada', 'Ohio'], dtype=object)

In [35]:
frame3=pd.DataFrame(data, columns=['Year', 'State', 'Pop',"Extra"])
frame3["Extra"]

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
Name: Extra, dtype: object

In [36]:
# When you are assigning lists or arrays to a column, the value’s length must match the length of the DataFrame. 
frame3["Extra"]=np.random.randn(6)
frame3

Unnamed: 0,Year,State,Pop,Extra
0,2000,Ohio,1.5,-0.444887
1,2001,Ohio,1.7,0.645
2,2002,Ohio,,-0.047732
3,2001,Nevada,2.4,0.044651
4,2002,Nevada,2.9,-0.622663
5,2003,Nevada,3.2,1.165313


In [37]:
# If you assign a Series, its labels will be realigned exactly to the DataFrame’s index, inserting missing values in any holes:
val = pd.Series([-1.2, -1.5, -1.7], index=[0,2,4])
frame3["Extra"]=val
frame3

Unnamed: 0,Year,State,Pop,Extra
0,2000,Ohio,1.5,-1.2
1,2001,Ohio,1.7,
2,2002,Ohio,,-1.5
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,-1.7
5,2003,Nevada,3.2,


In [38]:
del frame3['Extra']
frame3

Unnamed: 0,Year,State,Pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [39]:
# nested dict of dicts
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame4=pd.DataFrame(pop)

# Outer dict keys as the columns and the inner keys as the row indices

frame4

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [40]:
# transpose the DataFrame
frame4.T# frame4.transpose()

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [41]:
frame4

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [42]:
frame4.index=[2001, 2002, 2003]
frame4

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,1.5


# Reindexing

An important method on pandas objects is reindex, which means to create a new object with the data conformed to a new index.


In [43]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [44]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [45]:
# Forward-fill while reindexing: each new label takes the last preceding value.
# Printed explicitly because this is not the last expression of the cell and
# would otherwise be computed and thrown away.
print(obj3.reindex([0,1,2,3,4,5], method='ffill'))

# Replace the missing entry with the mean of the observed values.
# Series.mean() skips NaN, so the mean is taken over the five known numbers.
obj4 = pd.Series([21, 15, 16, 48, 45, np.nan])
obj4 = obj4.fillna(obj4.mean())
obj4

0    21.0
1    15.0
2    16.0
3    48.0
4    45.0
5    29.0
dtype: float64

In [48]:
data2=np.arange(9).reshape((3, 3))+1

In [47]:
frame = pd.DataFrame(data2,
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,1,2,3
c,4,5,6
d,7,8,9


In [49]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,1.0,2.0,3.0
b,,,
c,4.0,5.0,6.0
d,7.0,8.0,9.0


In [50]:
data=np.arange(16).reshape(4,4)
data

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [51]:
frame6=pd.DataFrame(data,index=list(data[:,1]))
frame6

Unnamed: 0,0,1,2,3
1,0,1,2,3
5,4,5,6,7
9,8,9,10,11
13,12,13,14,15


In [52]:
import pandas as pd
import numpy as np

In [None]:
df=pd.read_excel('Covid-19 Vaccination.xlsx',sheet_name='Form responses 1')
df.columns

In [None]:
# Write one workbook per distinct answer to the vaccination question.
# groupby yields every answer together with its rows in a single pass and,
# by default, skips rows whose answer is missing. The f-string keeps the
# file name valid even when an answer is not a plain string.
status_col = 'Are you Vaccinated for Covid-19?'
for status, rows in df.groupby(status_col):
    rows.to_excel(f'{status}.xlsx')

In [None]:
df[['Name','Department','Are you Vaccinated for Covid-19?']].groupby(by=['Department','Are you Vaccinated for Covid-19?']).describe()

In [None]:
df2=pd.read_csv('CSVdata.csv')
df2.columns

In [None]:
#df2=df2.drop(['Unnamed: 0', 'EVENT', 'Course Part Name',      'Course Part Abbrevation', 'Course Code'],axis=1)
#df2.groupby('Course FULL Name').count()
# Number of distinct student names; nunique() ignores missing entries
# instead of failing when NaN is mixed with strings.
df2['Name OF Student'].nunique()

# Merge

**Concat.** pandas provides various facilities for easily combining Series and DataFrame objects, with various kinds of set logic for the indexes and relational-algebra functionality in the case of join / merge-type operations.


In [55]:
df = pd.DataFrame(np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,1.123345,-0.870833,-0.280479,-1.008835
1,0.030876,-1.062469,0.01543,-0.196544
2,-0.183791,0.614043,-1.510451,0.231379
3,-0.18933,0.087249,1.182741,-0.445013
4,1.601182,0.244557,0.797141,1.740396
5,-0.759531,0.986794,-0.358142,1.004673
6,-0.048081,-0.097549,-0.055199,-0.71746
7,0.652759,0.556606,-1.000533,-1.661549
8,0.281402,1.199861,1.141221,0.563425
9,0.641967,-1.883109,0.880479,0.690667


In [56]:
pieces = [df[:3],df[7:]]
pieces

[          0         1         2         3
 0  1.123345 -0.870833 -0.280479 -1.008835
 1  0.030876 -1.062469  0.015430 -0.196544
 2 -0.183791  0.614043 -1.510451  0.231379,
           0         1         2         3
 7  0.652759  0.556606 -1.000533 -1.661549
 8  0.281402  1.199861  1.141221  0.563425
 9  0.641967 -1.883109  0.880479  0.690667]

In [57]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,1.123345,-0.870833,-0.280479,-1.008835
1,0.030876,-1.062469,0.01543,-0.196544
2,-0.183791,0.614043,-1.510451,0.231379
7,0.652759,0.556606,-1.000533,-1.661549
8,0.281402,1.199861,1.141221,0.563425
9,0.641967,-1.883109,0.880479,0.690667


In [58]:
a=pd.DataFrame(np.random.randn(2, 5))
b=pd.DataFrame(np.random.randn(3, 4))
a

Unnamed: 0,0,1,2,3,4
0,1.342638,0.345126,-0.808634,1.195845,-0.896294
1,1.832085,2.416135,-0.424402,-0.286935,-0.371666


In [59]:
b

Unnamed: 0,0,1,2,3
0,1.171538,1.092299,-1.016255,-0.324167
1,-1.289965,-0.004741,1.780872,-1.110578
2,0.947254,0.762305,0.792661,-0.294705


In [60]:
pd.concat([a,b])

Unnamed: 0,0,1,2,3,4
0,1.342638,0.345126,-0.808634,1.195845,-0.896294
1,1.832085,2.416135,-0.424402,-0.286935,-0.371666
0,1.171538,1.092299,-1.016255,-0.324167,
1,-1.289965,-0.004741,1.780872,-1.110578,
2,0.947254,0.762305,0.792661,-0.294705,


# Join

In [61]:
left = pd.DataFrame({"Date":[1,2,3],
                     "B":[3,4,5]})
right = pd.DataFrame({"Date":[1,3,4],
                     "D":[3,4,6]})
print("Left\n",left)
print("Right\n",right)
pd.merge(left,right,on="Date")

Left
    Date  B
0     1  3
1     2  4
2     3  5
Right
    Date  D
0     1  3
1     3  4
2     4  6


Unnamed: 0,Date,B,D
0,1,3,3
1,3,5,4


# Dropping Entries from an Axis

The drop method will return a new object with the indicated value or values deleted from an axis:


In [62]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
print(obj)
new_obj = obj.drop(list('abd'))
print(new_obj)

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
c    2.0
e    4.0
dtype: float64


In [63]:
obj = pd.DataFrame(np.random.randn(4,5),columns=['a', 'b', 'c', 'd', 'e'])
print(obj)
new_obj = obj.drop(list('abd'),axis=1)
new_obj

          a         b         c         d         e
0  0.848358  0.139956  0.244980  1.877608 -0.044773
1  0.865950 -0.474496  0.168236  0.482291 -0.938344
2 -0.825298  0.694410 -0.816539 -0.057093  0.297265
3  0.293559 -1.375797 -1.014076  0.412238 -0.654827


Unnamed: 0,c,e
0,0.24498,-0.044773
1,0.168236,-0.938344
2,-0.816539,0.297265
3,-1.014076,-0.654827


# Grouping

By “group by” we are referring to a process involving one or more of the following steps:

    Splitting the data into groups based on some criteria
    Applying a function to each group independently
    Combining the results into a data structure



In [64]:
# Demo frame for the group-by examples: 8 rows with alternating gender,
# a repeating smoker pattern and two columns of random integers in [1, 10).
# (The pasted IPython "....:" continuation prompts were removed so the cell
# is plain, valid Python.)
df = pd.DataFrame(
    {
        "Gender": ["M", "F"]*4,
        "Smoke": ["Smoker", "NonSmoker", "NonSmoker", "Smoker"]*2,
        "ConcA": np.random.randint(1,10,8),
        "ConcB": np.random.randint(1,10,8)
    }
)
df

Unnamed: 0,Gender,Smoke,ConcA,ConcB
0,M,Smoker,1,3
1,F,NonSmoker,3,8
2,M,NonSmoker,7,6
3,F,Smoker,1,6
4,M,Smoker,4,3
5,F,NonSmoker,7,8
6,M,NonSmoker,8,6
7,F,Smoker,4,6


In [65]:
df.groupby(["Gender", "Smoke"]).var()

Unnamed: 0_level_0,Unnamed: 1_level_0,ConcA,ConcB
Gender,Smoke,Unnamed: 2_level_1,Unnamed: 3_level_1
F,NonSmoker,8.0,0.0
F,Smoker,4.5,0.0
M,NonSmoker,0.5,0.0
M,Smoker,4.5,0.0


In [66]:
df.groupby("Gender").mean()

Unnamed: 0_level_0,ConcA,ConcB
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,3.75,7.0
M,5.0,4.5


In [67]:
df.groupby(["Gender","Smoke"]).describe().T

Unnamed: 0_level_0,Gender,F,F,M,M
Unnamed: 0_level_1,Smoke,NonSmoker,Smoker,NonSmoker,Smoker
ConcA,count,2.0,2.0,2.0,2.0
ConcA,mean,5.0,2.5,7.5,2.5
ConcA,std,2.828427,2.12132,0.707107,2.12132
ConcA,min,3.0,1.0,7.0,1.0
ConcA,25%,4.0,1.75,7.25,1.75
ConcA,50%,5.0,2.5,7.5,2.5
ConcA,75%,6.0,3.25,7.75,3.25
ConcA,max,7.0,4.0,8.0,4.0
ConcB,count,2.0,2.0,2.0,2.0
ConcB,mean,8.0,6.0,6.0,3.0


In [68]:
df.groupby("Gender").describe().T

Unnamed: 0,Gender,F,M
ConcA,count,4.0,4.0
ConcA,mean,3.75,5.0
ConcA,std,2.5,3.162278
ConcA,min,1.0,1.0
ConcA,25%,2.5,3.25
ConcA,50%,3.5,5.5
ConcA,75%,4.75,7.25
ConcA,max,7.0,8.0
ConcB,count,4.0,4.0
ConcB,mean,7.0,4.5


In [69]:
# Keep only the numeric columns (position 2 onwards). The explicit copy makes
# df2 independent of df, so adding columns to df2 later does not write into a
# slice of df (which would trigger SettingWithCopyWarning).
df2 = df.iloc[:, 2:].copy()
df2.sum()  # Column Sum

ConcA    35
ConcB    46
dtype: int64

In [70]:
df2.mean(axis=1)  # Row Mean (axis=1 aggregates across the columns of each row)

0    2.0
1    5.5
2    6.5
3    3.5
4    3.5
5    7.5
6    7.0
7    5.0
dtype: float64

In [71]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is any object exposing ``max()`` and ``min()`` methods, e.g. a
    pandas Series or a NumPy array.
    """
    return x.max() - x.min()

In [72]:
df2.apply(f,axis=1)

0    2
1    5
2    1
3    5
4    1
5    1
6    2
7    2
dtype: int64

In [73]:
def f2(x):
    """Summarise ``x`` as a two-entry Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    # A dict keeps insertion order, so the labels come out as 'min', 'max'.
    return pd.Series({'min': lowest, 'max': highest})

In [74]:
# Compute the per-row min/max once and reuse the result both for the preview
# and for the two new columns, instead of running apply twice.
row_extremes = df2.apply(f2, axis=1)
print(row_extremes)
df2[["min","max"]] = row_extremes
print(df2)

   min  max
0    1    3
1    3    8
2    6    7
3    1    6
4    3    4
5    7    8
6    6    8
7    4    6
   ConcA  ConcB  min  max
0      1      3    1    3
1      3      8    3    8
2      7      6    6    7
3      1      6    1    6
4      4      3    3    4
5      7      8    7    8
6      8      6    6    8
7      4      6    4    6


In [75]:
df2

Unnamed: 0,ConcA,ConcB,min,max
0,1,3,1,3
1,3,8,3,8
2,7,6,6,7
3,1,6,1,6
4,4,3,3,4
5,7,8,7,8
6,8,6,6,8
7,4,6,4,6


Element-wise Python functions can be used, too. Suppose you wanted to square every value in the frame. You can do this with `applymap`, which calls the function on each element:

In [76]:
print(df2)
# NOTE: this name shadows the built-in format(). The function squares its
# argument and is reused by the Series.map example further down.
format = lambda x: x*x
# applymap calls the function on every individual element of the frame
df2.applymap(format)

   ConcA  ConcB  min  max
0      1      3    1    3
1      3      8    3    8
2      7      6    6    7
3      1      6    1    6
4      4      3    3    4
5      7      8    7    8
6      8      6    6    8
7      4      6    4    6


Unnamed: 0,ConcA,ConcB,min,max
0,1,9,1,9
1,9,64,9,64
2,49,36,36,49
3,1,36,1,36
4,16,9,9,16
5,49,64,49,64
6,64,36,36,64
7,16,36,16,36


The reason for the name applymap is that Series has a map method for applying an element-wise function:

In [77]:
df2['ConcA'].map(format)

0     1
1     9
2    49
3     1
4    16
5    49
6    64
7    16
Name: ConcA, dtype: int64

In [78]:
df

Unnamed: 0,Gender,Smoke,ConcA,ConcB
0,M,Smoker,1,3
1,F,NonSmoker,3,8
2,M,NonSmoker,7,6
3,F,Smoker,1,6
4,M,Smoker,4,3
5,F,NonSmoker,7,8
6,M,NonSmoker,8,6
7,F,Smoker,4,6


In [79]:
def fun(x):
    """Expand a gender code: "M" -> "Male", "F" -> "Female", otherwise None."""
    if x == "M":
        return "Male"
    return "Female" if x == "F" else None

df["Gender2"]=df["Gender"].map(fun)
df

Unnamed: 0,Gender,Smoke,ConcA,ConcB,Gender2
0,M,Smoker,1,3,Male
1,F,NonSmoker,3,8,Female
2,M,NonSmoker,7,6,Male
3,F,Smoker,1,6,Female
4,M,Smoker,4,3,Male
5,F,NonSmoker,7,8,Female
6,M,NonSmoker,8,6,Male
7,F,Smoker,4,6,Female


# Sorting and Ranking

In [80]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [81]:
obj.sort_index(ascending=False)

d    0
c    3
b    2
a    1
dtype: int64

In [82]:
# To sort a Series by its values, use its sort_values method:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [83]:
# Any missing values are sorted to the end of the Series by default:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

When sorting a DataFrame, you can use the data in one or more columns as the sort keys. To do so, pass one or more column names to the by option of sort_values:

In [84]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [85]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


Ranking assigns ranks from one through the number of valid data points in an array. The rank methods for Series and DataFrame are the place to look; by default rank breaks ties by assigning each group the mean rank:

In [86]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj.rank())

obj.rank(method="first")

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64


0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [87]:
#read_table
#read_excel
#read_html
#read_clipboard

In [None]:
data=pd.read_csv("amoxilin.csv",index_col=["Subject","Formulation","Period"])
data.head()

In [None]:
data.groupby(["Formulation"]).describe()

In [None]:
main_data=data.iloc[:,3:]
main_data.head()

In [None]:
data.groupby("Formulation").sum()