## Introduction to pandas

Library for computation with tabular data

- Mixed types of data allowed in a single table
- Columns and rows of data can be named
- Advanced data aggregation and statistical functions

In [0]:
import pandas as pd
import numpy as np


#basic import for pandas

# **Series**
Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The axis labels are collectively referred to as the index. The basic method to create a Series is to call:

In [0]:

# General constructor form:
#     s = pd.Series(data, index=index)
#
# `data` can be many different things, for example:
#   - a Python dict
#   - an ndarray
#   - a scalar value (like 5)

# Built from a plain list here; no index is given, so labels default to 0..5.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)


0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


**DataFrame**
`class pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)`




Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). Arithmetic operations align on both row and column labels. Can be thought of as a dict-like container for Series objects. The primary pandas data structure.

Parameters:	
data : numpy ndarray (structured or homogeneous), dict, or DataFrame

Dict can contain Series, arrays, constants, or list-like objects

Changed in version 0.23.0: If data is a dict, argument order is maintained for Python 3.6 and later.

index : Index or array-like

Index to use for resulting frame. Will default to RangeIndex if no indexing information part of input data and no index provided

columns : Index or array-like

Column labels to use for resulting frame. Will default to RangeIndex (0, 1, 2, …, n) if no column labels are provided

dtype : dtype, default None

Data type to force. Only a single dtype is allowed. If None, infer

copy : boolean, default False

Copy data from inputs. Only affects DataFrame / 2d ndarray input

Now let's create a DataFrame using a NumPy array.

In [0]:
# 18 consecutive calendar days beginning 1999-11-23, as a DatetimeIndex.
dates = pd.date_range(start='19991123', periods=18, freq='D')
dates


DatetimeIndex(['1999-11-23', '1999-11-24', '1999-11-25', '1999-11-26',
               '1999-11-27', '1999-11-28', '1999-11-29', '1999-11-30',
               '1999-12-01', '1999-12-02', '1999-12-03', '1999-12-04',
               '1999-12-05', '1999-12-06', '1999-12-07', '1999-12-08',
               '1999-12-09', '1999-12-10'],
              dtype='datetime64[ns]', freq='D')

In [0]:
# 5x5 frame of random integers in [0, 10) drawn from numpy's global RNG.
# No seed is set, so the values differ on every run.
df2 = pd.DataFrame(
    data=np.random.randint(0, 10, size=(5, 5)),
    columns=list('abcde'),
)

df2

Unnamed: 0,a,b,c,d,e
0,7,9,4,9,3
1,5,3,9,5,9
2,9,8,2,9,4
3,4,0,8,7,5
4,3,1,8,3,5


In [0]:
# 6x4 frame of standard-normal floats with columns labelled A-D.
ash = pd.DataFrame(data=np.random.randn(6, 4), columns=['A', 'B', 'C', 'D'])

# Plain-text dump of the frame, followed by an empty separator line.
print(ash)
print()

# Per-column dtypes; as the last expression this is the cell's displayed result.
ash.dtypes

          A         B         C         D
0  1.410838 -0.968795 -0.542472  1.834626
1 -1.467384  0.006920 -2.051159  1.101096
2 -0.244505  1.597878 -0.751972  0.459436
3  0.257834  0.847664 -1.558660  1.331092
4 -0.610350 -0.029297  0.454771 -0.132607
5  0.876225 -1.838081 -0.155802 -1.690631



A    float64
B    float64
C    float64
D    float64
dtype: object

In [0]:
# Build a DataFrame from a dict of columns; the dict keys become the column
# labels. The scalar entries (A, B, F) are repeated on every row, while the
# list-like entries (C, D, E) each supply four values, giving a 4-row frame
# whose columns have different dtypes.
ash = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('19991123'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "ash", "train"]),
    'F': 'foo',
})

ash

Unnamed: 0,A,B,C,D,E,F
0,1.0,1999-11-23,1.0,3,test,foo
1,1.0,1999-11-23,1.0,3,train,foo
2,1.0,1999-11-23,1.0,3,ash,foo
3,1.0,1999-11-23,1.0,3,train,foo


# **viewing data**

In [0]:
# Preview the first rows; n=5 is the default, and this frame has only 4 rows.
ash.head(n=5)

Unnamed: 0,A,B,C,D,E,F
0,1.0,1999-11-23,1.0,3,test,foo
1,1.0,1999-11-23,1.0,3,train,foo
2,1.0,1999-11-23,1.0,3,ash,foo
3,1.0,1999-11-23,1.0,3,train,foo


In [0]:
# Show the last three rows of the frame.
ash.tail(n=3)

Unnamed: 0,A,B,C,D,E,F
1,1.0,1999-11-23,1.0,3,train,foo
2,1.0,1999-11-23,1.0,3,ash,foo
3,1.0,1999-11-23,1.0,3,train,foo


# Displaying the index, columns, and underlying NumPy data

In [0]:
# Row labels of the frame.
ash.index


Int64Index([0, 1, 2, 3], dtype='int64')

In [0]:
# Column labels of the frame.
ash.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [0]:
# Cell contents as a NumPy array; the columns have mixed types, so the
# resulting array has dtype=object.
ash.values

array([[1.0, Timestamp('1999-11-23 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('1999-11-23 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('1999-11-23 00:00:00'), 1.0, 3, 'ash', 'foo'],
       [1.0, Timestamp('1999-11-23 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [0]:
ash.describe()

# Shows a quick statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.


Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


# transposing your data

In [0]:
# Swap rows and columns (same result as the `.T` accessor).
ash.transpose()

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,1999-11-23 00:00:00,1999-11-23 00:00:00,1999-11-23 00:00:00,1999-11-23 00:00:00
C,1,1,1,1
D,3,3,3,3
E,test,train,ash,train
F,foo,foo,foo,foo


# sorting by an axis

In [0]:
# Order the columns by their labels, ascending (axis='columns' is axis 1).
ash.sort_index(axis='columns', ascending=True)

Unnamed: 0,A,B,C,D,E,F
0,1.0,1999-11-23,1.0,3,test,foo
1,1.0,1999-11-23,1.0,3,train,foo
2,1.0,1999-11-23,1.0,3,ash,foo
3,1.0,1999-11-23,1.0,3,train,foo


# sorting by values



In [0]:
# Reorder the rows by the values in column E (ascending by default).
ash.sort_values(by=['E'])

Unnamed: 0,A,B,C,D,E,F
2,1.0,1999-11-23,1.0,3,ash,foo
0,1.0,1999-11-23,1.0,3,test,foo
1,1.0,1999-11-23,1.0,3,train,foo
3,1.0,1999-11-23,1.0,3,train,foo


# ***selection***

getting



In [0]:
ash['E']

# Bracket selection returns the single column E as a Series; equivalent to attribute access, ash.E

0     test
1    train
2      ash
3    train
Name: E, dtype: category
Categories (3, object): [ash, test, train]

In [0]:
# Slicing with [] selects rows by position: here the first three rows.
ash[:3]

Unnamed: 0,A,B,C,D,E,F
0,1.0,1999-11-23,1.0,3,test,foo
1,1.0,1999-11-23,1.0,3,train,foo
2,1.0,1999-11-23,1.0,3,ash,foo


In [0]:
#selection by label


import pandas as pd

import numpy as np

import matplotlib.pyplot as plt



# A one-dimensional Series; the np.nan entry is a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

# Six consecutive dates starting 2013-01-01, used below as the row index.
dates = pd.date_range(start='20130101', periods=6)

# 6x4 frame of standard-normal draws: rows labelled by `dates`, columns A-D.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

df


Unnamed: 0,A,B,C,D
2013-01-01,-1.444269,1.21949,0.604292,0.238182
2013-01-02,-1.497831,-0.30701,0.321746,3.158982
2013-01-03,-1.384415,0.96629,1.635222,0.787217
2013-01-04,-0.708264,-0.544316,-0.007,0.180916
2013-01-05,-0.390658,0.171823,0.015738,-0.022705
2013-01-06,-2.304853,-1.152578,1.823294,-1.517201


In [0]:
# Select one row by its index label (the second date); the result is a Series.
df.loc[dates[1], :]

A   -1.497831
B   -0.307010
C    0.321746
D    3.158982
Name: 2013-01-02 00:00:00, dtype: float64

In [0]:
# Label-based selection of several columns: every row, columns A and B only.

df.loc[:, list('AB')]

Unnamed: 0,A,B
2013-01-01,-1.444269,1.21949
2013-01-02,-1.497831,-0.30701
2013-01-03,-1.384415,0.96629
2013-01-04,-0.708264,-0.544316
2013-01-05,-0.390658,0.171823
2013-01-06,-2.304853,-1.152578
