Pandas
------------
- Pandas is a built in library which is used for data analysis. Pandas are used for manipulation, visualization, building machine learning models, etc.

- Pandas implements a number of powerful data operation familiar to users of both database frameworks and spreadsheet programs.

- There are two main data structures in Pandas - Series, DataFrames. 


In [2]:
!pip install pandas



In [3]:
import pandas as pd

In [4]:
pd.__version__

'0.23.4'

### Series
- A series is similar to a 1-D numpy array, and contains values of the same type(intergers, strings,datetime,etc)
### DataFrames
- A DataFrame is simply a table where each column is a pandas series.
##### Creating Series
- List
- Tuple
- Dictionary
- Numpy
- Date_Range

In [5]:
k = [1,2,34,78,90,54,82]
p1 = pd.Series(k)
p1

0     1
1     2
2    34
3    78
4    90
5    54
6    82
dtype: int64

In [6]:
t = (12,34,56,78,90,120)
p2 = pd.Series(t)
p2

0     12
1     34
2     56
3     78
4     90
5    120
dtype: int64

In [9]:
d = {"A":67,"b":89,"c":"de"}
p3 = pd.Series(d)
p3

A    67
b    89
c    de
dtype: object

In [17]:
p2.index = ["ab",9,"34",61,23,13]

In [18]:
p2

ab     12
9      34
34     56
61     78
23     90
13    120
dtype: int64

In [12]:
p2[61]

78

In [19]:
p2[4]

90

In [21]:
p1

0     1
1     2
2    34
3    78
4    90
5    54
6    82
dtype: int64

In [22]:
p1[1:6]

1     2
2    34
3    78
4    90
5    54
dtype: int64

In [24]:
p1[:6:2]

0     1
2    34
4    90
dtype: int64

In [27]:
import numpy as np
p1.index = np.arange(30,37)
p1

30     1
31     2
32    34
33    78
34    90
35    54
36    82
dtype: int64

In [28]:
p4 = pd.Series("Python",index=[1,34,57,69])
p4

1     Python
34    Python
57    Python
69    Python
dtype: object

In [29]:
# create a series having power of index value
s = pd.Series([x**2 for x in range(11)])
s

0       0
1       1
2       4
3       9
4      16
5      25
6      36
7      49
8      64
9      81
10    100
dtype: int64

In [30]:
s1 = pd.Series(np.square(np.arange(11)))
s1

0       0
1       1
2       4
3       9
4      16
5      25
6      36
7      49
8      64
9      81
10    100
dtype: int32

In [36]:
dates = pd.date_range(start = "2021-10-5",end = "2021-10-12",closed="right")
dates

DatetimeIndex(['2021-10-06', '2021-10-07', '2021-10-08', '2021-10-09',
               '2021-10-10', '2021-10-11', '2021-10-12'],
              dtype='datetime64[ns]', freq='D')

In [32]:
help(pd.date_range)

Help on function date_range in module pandas.core.indexes.datetimes:

date_range(start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=None, **kwargs)
    Return a fixed frequency DatetimeIndex.
    
    Parameters
    ----------
    start : str or datetime-like, optional
        Left bound for generating dates.
    end : str or datetime-like, optional
        Right bound for generating dates.
    periods : integer, optional
        Number of periods to generate.
    freq : str or DateOffset, default 'D' (calendar daily)
        Frequency strings can have multiples, e.g. '5H'. See
        :ref:`here <timeseries.offset_aliases>` for a list of
        frequency aliases.
    tz : str or tzinfo, optional
        Time zone name for returning localized DatetimeIndex, for example
        'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
        timezone-naive.
    normalize : bool, default False
        Normalize start/end dates to midnight befo

In [33]:
d1 = pd.date_range(start = "2020-10-5",periods=8)

In [34]:
d1

DatetimeIndex(['2020-10-05', '2020-10-06', '2020-10-07', '2020-10-08',
               '2020-10-09', '2020-10-10', '2020-10-11', '2020-10-12'],
              dtype='datetime64[ns]', freq='D')

In [40]:
d2 = pd.date_range(start = "3/6/2021",periods=6,freq = '2M')
d2

DatetimeIndex(['2021-03-31', '2021-05-31', '2021-07-31', '2021-09-30',
               '2021-11-30', '2022-01-31'],
              dtype='datetime64[ns]', freq='2M')

In [41]:
n = np.arange(10)
p6 = pd.Series(n,index=np.arange(20,30))
p6

20    0
21    1
22    2
23    3
24    4
25    5
26    6
27    7
28    8
29    9
dtype: int32

In [42]:
p6[1:7]

21    1
22    2
23    3
24    4
25    5
26    6
dtype: int32

In [43]:
p6[[20,24,26,29]]  #fancy Indexing

20    0
24    4
26    6
29    9
dtype: int32

In [45]:
li = [13,24,35,47,68,59]
d1 = pd.DataFrame(li)
d1

Unnamed: 0,0
0,13
1,24
2,35
3,47
4,68
5,59


In [46]:
li1 = [[1,2,3],[4,5,6],[6,7,8]]
d2 = pd.DataFrame(li1)
d2

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,6,7,8


In [47]:
d2.index = [12,35,90]
d2

Unnamed: 0,0,1,2
12,1,2,3
35,4,5,6
90,6,7,8


In [48]:
d2.index = list("abc") #["a","b","c"]
d2

Unnamed: 0,0,1,2
a,1,2,3
b,4,5,6
c,6,7,8


In [49]:
d2.columns = ["x","y","z"]
d2

Unnamed: 0,x,y,z
a,1,2,3
b,4,5,6
c,6,7,8


In [59]:
sd = pd.Series([12,67,25,30],index = ["a","b","c","d"])
d3 = pd.DataFrame(sd,columns = [5])
d3

Unnamed: 0,5
a,12
b,67
c,25
d,30


In [61]:
empd1 = pd.DataFrame({"emp":pd.Series(["a","b","c","d"],index = [1,2,3,4]),
                     "dept":pd.Series(["ac","hr","tm","ntm"],index = [1,2,3,4])})
empd1

Unnamed: 0,emp,dept
1,a,ac
2,b,hr
3,c,tm
4,d,ntm


In [62]:
empd2 = pd.DataFrame({"emp":pd.Series(["a","b","c","d"],index = [1,2,3,4]),
                     "year":pd.Series([2000,2001,2002,2003],index=[1,2,3,4])})
empd2

Unnamed: 0,emp,year
1,a,2000
2,b,2001
3,c,2002
4,d,2003


In [63]:
pd.merge(empd1,empd2)

Unnamed: 0,emp,dept,year
0,a,ac,2000
1,b,hr,2001
2,c,tm,2002
3,d,ntm,2003


In [71]:
empd1 = pd.DataFrame({"emp":pd.Series(["a","b","c","d"],index = [1,2,3,4]),
                     "dept":pd.Series(["ac","hr","tm","ntm"],index = [1,2,3,4])})


empd2 = pd.DataFrame({"emp":pd.Series(["a","b","c","de"],index = [1,2,3,4]),
                     "year":pd.Series([2000,2001,2002,2003],index=[1,2,3,4])})
pd.merge(empd1,empd2,how = "left")

Unnamed: 0,emp,dept,year
0,a,ac,2000.0
1,b,hr,2001.0
2,c,tm,2002.0
3,d,ntm,


In [69]:
pd.merge(empd1,empd2,how = "outer") # right, inner

Unnamed: 0,emp,dept,year
0,a,ac,2000.0
1,b,hr,2001.0
2,c,tm,2002.0
3,d,ntm,
4,de,,2003.0


In [74]:
empd1 = pd.DataFrame({"emp":pd.Series(["a","b","c","d"],index = [10,2,3,4]),
                     "dept":pd.Series(["ac","hr","tm","ntm"],index = [1,2,3,4])})


empd2 = pd.DataFrame({"emp":pd.Series(["a","b","c","de"],index = [1,2,3,4]),
                     "year":pd.Series([2000,2001,2002,2003],index=[1,2,3,4])})
pd.merge(empd1,empd2,how = "outer")

Unnamed: 0,emp,dept,year
0,,ac,
1,b,hr,2001.0
2,c,tm,2002.0
3,d,ntm,
4,a,,2000.0
5,de,,2003.0


In [76]:
empd2.append(empd2)

Unnamed: 0,emp,year
1,a,2000
2,b,2001
3,c,2002
4,de,2003
1,a,2000
2,b,2001
3,c,2002
4,de,2003


In [78]:
pd.concat([empd1,empd2],axis = 1)

Unnamed: 0,emp,dept,emp.1,year
1,,ac,a,2000.0
2,b,hr,b,2001.0
3,c,tm,c,2002.0
4,d,ntm,de,2003.0
10,a,,,


In [79]:
pd.concat([empd1,empd2],axis = 0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,dept,emp,year
1,ac,,
2,hr,b,
3,tm,c,
4,ntm,d,
10,,a,
1,,a,2000.0
2,,b,2001.0
3,,c,2002.0
4,,de,2003.0
