## Pandas Library
Pandas is the most popular python library that is used for data analysis. It provides highly optimized performance with back-end source code is purely written in C or Python.

We can analyze data in pandas with:
1.Series
2.DataFrames

In [21]:
import pandas as pd
#Since we did not specify an index for the data, a default one
#consisting of the integers 0 through N - 1 (where N is the length of the
#data) is created.

#Series
obj0=pd.Series([])
print(obj0)  

print()
obj1=pd.Series([4, 7, -5, 3])
print(obj1)

print()
obj2=pd.Series([4.2, 7.8, -5, 3.2])
print(obj2)

print("\nValues:",obj1.values)
print("\nIndex:",obj1.index)

Series([], dtype: float64)

0    4
1    7
2   -5
3    3
dtype: int64

0    4.2
1    7.8
2   -5.0
3    3.2
dtype: float64

Values: [ 4  7 -5  3]

Index: RangeIndex(start=0, stop=4, step=1)


In [52]:
#We can define our own index.
obj2 = pd.Series([4, 7, -5, 3],index=['dd', 'bb', 1, 'c'])
print("Type:",type(obj2))
print()
print(obj2)
print("\nValues:",obj1.values)
print("\nIndex:",obj2.index)
print(obj2['bb'])
print()
print("obj2[obj2>0] :\n",obj2[obj2>0])    #return data for values greater then 0.
print()
print("obj2*2 :\n",obj2*2)
print()
print("obj2**2 :\n",obj2**2)

Type: <class 'pandas.core.series.Series'>

dd    4
bb    7
1    -5
c     3
dtype: int64

Values: [ 4  7 -5  3]

Index: Index(['dd', 'bb', 1, 'c'], dtype='object')
7

obj2[obj2>0] :
 dd    4
bb    7
c     3
dtype: int64

obj2*2 :
 dd     8
bb    14
1    -10
c      6
dtype: int64

obj2**2 :
 dd    16
bb    49
1     25
c      9
dtype: int64


In [63]:
#Series is as a fixed-length, ordered dict.

#1. When you are only passing a dict, the index in the resulting Series will have the dict’s keys in sorted order.
sdata={'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
print(type(sdata))
print()
obj3=pd.Series(sdata)
print(obj3) 


#2. You can override this by passing the dict keys in the order you want them to appear in the resulting Series.
states=['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
print()
print(obj4)

print()
#3. Membership tests
if 'Texas' in sdata:     
    print(True)
    
if 'California' not in sdata:
    print(False)

<class 'dict'>

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

True
False


In [69]:
#The isnull and notnull functions in pandas should be used to detect missing data.
print("obj3:\n",obj3)
print()
print("obj4:\n",obj4)
print()
print(pd.isnull(obj3))
print()
print(pd.isnull(obj4))

obj3:
 Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

obj4:
 California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

Ohio      False
Texas     False
Oregon    False
Utah      False
dtype: bool

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


In [87]:
#Two Series objects can be added.
print("obj3:\n",obj3)
#print(type(obj3))
print()
print("obj4:\n",obj4)
print()
print(obj3+obj4)

obj3:
 Bob      35000
Steve    71000
Jeff     16000
Ryan      5000
dtype: int64

obj4:
 state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

Bob          NaN
California   NaN
Jeff         NaN
Ohio         NaN
Oregon       NaN
Ryan         NaN
Steve        NaN
Texas        NaN
dtype: float64


In [80]:
print("obj4:\n",obj4)
print()
 
obj4.name = 'population'                #To assign name to an object
print("Name of object obj4:",obj4.name)
print("Name of object obj3:",obj3.name)
print()

obj4.index.name = 'state'
print("obj4:\n",obj4)

obj4:
 state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

Name of object obj4: population
Name of object obj3: None

obj4:
 state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


In [86]:
#A Series’s index can be altered in-place by assignment:
print("obj3:\n",obj3)
print()
obj3.index=['Bob','Steve','Jeff','Ryan']
print("Object with new indexes:\n",obj3)
     #the changes made to index are permanent

obj3:
 Bob      35000
Steve    71000
Jeff     16000
Ryan      5000
dtype: int64

Object with new indexes:
 Bob      35000
Steve    71000
Jeff     16000
Ryan      5000
dtype: int64


In [163]:
# Program to Create series with scalar values. 
Data =[1, 3, 4, 5, 6, 2, 9] # Numeric data 

# Creating series with default index values 
s = pd.Series(Data)

# predefined index values 
Index =['a', 'b', 'c', 'd', 'e', 'f', 'g'] 

# Creating series with predefined index values 
si = pd.Series(Data, Index) 
print(si)

a    1
b    3
c    4
d    5
e    6
f    2
g    9
dtype: int64


In [161]:
#Creating series using dictionary.
dict={1:'one',2:'two',3:'three',4:'four'}
data=pd.Series(dict)
print(data)

1      one
2      two
3    three
4     four
dtype: object


In [166]:
#Program to Create ndarray series.
nd_array=[[1,2,3],[4,5,6]]
nd_data=pd.Series(nd_array)
print(nd_data)

0    [1, 2, 3]
1    [4, 5, 6]
dtype: object


In [104]:
#Data Frame: A DataFrame represents a rectangular table of data and contains an ordered collection of columns.
data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year':[2000, 2001, 2002, 2001, 2002, 2003],'pop':[1.5, 1.7, 3.6, 2.4, 2.9, 3.2]} 

frame = pd.DataFrame(data)
print(type(frame))
print("\nframe:\n",frame)

#If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order.
frame1=pd.DataFrame(data,columns=['year','state','pop'])
print("\nframe1:\n",frame1)

#If you pass a column that isn’t contained in the dict, it will appear with missing values in the result.
frame2=pd.DataFrame(data,columns=['year','state','pop','debt'])
print("\nframe2:\n",frame2)

print("\nframe.head() :\n",frame.head())   #the head() method selects only the first five rows.
print("\nframe.tail() :\n",frame.tail())   #the tail() method selects only the last five rows.

<class 'pandas.core.frame.DataFrame'>

frame:
     state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2

frame1:
    year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2

frame2:
    year   state  pop debt
0  2000    Ohio  1.5  NaN
1  2001    Ohio  1.7  NaN
2  2002    Ohio  3.6  NaN
3  2001  Nevada  2.4  NaN
4  2002  Nevada  2.9  NaN
5  2003  Nevada  3.2  NaN

frame.head() :
     state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9

frame.tail() :
     state  year  pop
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [115]:
#A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute.
print(frame2.state)         #Attribute
print("\n",frame2['state']) #Dict-like notation

#Columns can be modified by assignment.
frame2['debt']=8
print("\nframe2:\n",frame2)

#frame2['debt'] = np.arange(8.)    #ValueError: Length of values does not match length of index
frame2['debt'] = np.arange(6.)
print("\nframe2:\n",frame2)

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

 0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

frame2:
    year   state  pop  debt
0  2000    Ohio  1.5     8
1  2001    Ohio  1.7     8
2  2002    Ohio  3.6     8
3  2001  Nevada  2.4     8
4  2002  Nevada  2.9     8
5  2003  Nevada  3.2     8

frame2:
    year   state  pop  debt
0  2000    Ohio  1.5   0.0
1  2001    Ohio  1.7   1.0
2  2002    Ohio  3.6   2.0
3  2001  Nevada  2.4   3.0
4  2002  Nevada  2.9   4.0
5  2003  Nevada  3.2   5.0


In [136]:
#Assignment in a DataFrame.
val=pd.Series([-1.2,-1.5,-1.7], index=[2,4,5])
frame2['debt'] = val
print("frame2:\n",frame2)

frame3=pd.DataFrame(data,index=['one','two','three','four','five','six'],columns=['state','year','pop','debt'])
print("\nframe3:\n",frame3)

frame2:
    year   state  pop  debt  eastern
0  2000    Ohio  1.5   NaN     True
1  2001    Ohio  1.7   NaN     True
2  2002    Ohio  3.6  -1.2     True
3  2001  Nevada  2.4   NaN    False
4  2002  Nevada  2.9  -1.5    False
5  2003  Nevada  3.2  -1.7    False

frame3:
         state  year  pop debt
one      Ohio  2000  1.5  NaN
two      Ohio  2001  1.7  NaN
three    Ohio  2002  3.6  NaN
four   Nevada  2001  2.4  NaN
five   Nevada  2002  2.9  NaN
six    Nevada  2003  3.2  NaN


In [150]:
#Add a new column of boolean values where the state column equals 'Ohio':

#frame3.eastern = frame3['state'] =='Ohio'        # Pandas doesn't allow columns to be created via a new attribute name.
frame2['eastern'] = frame2['state'] =='Ohio'
print("frame2:\n",frame2)

frame3['eastern'] = frame3['year']>2001
print("\nframe3:\n",frame3)

#Deleting a column
del frame2['eastern']
#del frame2.debt            AttributeError: debt
print("\nframe2:\n",frame2)

print("\nFrame 1 columns:",frame1.columns)
print("\nFrame 2 columns:",frame2.columns)
print("\nFrame 3 columns:",frame3.columns)

frame2:
    year   state  pop  debt  eastern
0  2000    Ohio  1.5   NaN     True
1  2001    Ohio  1.7   NaN     True
2  2002    Ohio  3.6  -1.2     True
3  2001  Nevada  2.4   NaN    False
4  2002  Nevada  2.9  -1.5    False
5  2003  Nevada  3.2  -1.7    False

frame3:
         state  year  pop debt  eastern
one      Ohio  2000  1.5  NaN    False
two      Ohio  2001  1.7  NaN    False
three    Ohio  2002  3.6  NaN     True
four   Nevada  2001  2.4  NaN    False
five   Nevada  2002  2.9  NaN     True
six    Nevada  2003  3.2  NaN     True

frame2:
    year   state  pop  debt
0  2000    Ohio  1.5   NaN
1  2001    Ohio  1.7   NaN
2  2002    Ohio  3.6  -1.2
3  2001  Nevada  2.4   NaN
4  2002  Nevada  2.9  -1.5
5  2003  Nevada  3.2  -1.7

Frame 1 columns: Index(['year', 'state', 'pop'], dtype='object')

Frame 2 columns: Index(['year', 'state', 'pop', 'debt'], dtype='object')

Frame 3 columns: Index(['state', 'year', 'pop', 'debt', 'eastern'], dtype='object')


In [155]:
#Transpose: You can transpose the DataFrame(swap rows and columns). 
print("frame 2:\n",frame2)
print("\nTransposed frame 2:\n",frame2.T)

frame 2:
    year   state  pop  debt
0  2000    Ohio  1.5   NaN
1  2001    Ohio  1.7   NaN
2  2002    Ohio  3.6  -1.2
3  2001  Nevada  2.4   NaN
4  2002  Nevada  2.9  -1.5
5  2003  Nevada  3.2  -1.7

Transposed frame 2:
           0     1     2       3       4       5
year   2000  2001  2002    2001    2002    2003
state  Ohio  Ohio  Ohio  Nevada  Nevada  Nevada
pop     1.5   1.7   3.6     2.4     2.9     3.2
debt    NaN   NaN  -1.2     NaN    -1.5    -1.7


## To Create DataFrame :
import pandas as pd   # Import Library 
  
a = pd.DataFrame(Data)  # Create DataFrame with Data 
Here, Data can be:

1.One or more dictionaries
2.One or more Series
3.2D-numpy Ndarray

In [15]:
# 1.Program to Create Data Frame with two dictionaries.

import pandas as pd
dict1 ={'a':1, 'b':2, 'c':3, 'd':4,'e':7,'f':11}  # Define Dictionary 1 
dict2 ={'a':5, 'b':6, 'c':7, 'd':8, 'e':9}        # Define Dictionary 2 
Data = {'first':dict1,'second':dict2}             # Define Data with dict1 and dict2 

my_series = pd.Series(Data)             # Create Series 
print("Series:\n",my_series)

df=pd.DataFrame(Data)                   # Create DataFrame
print("\nDataFrame:\n",df)
print(type(df))

Series:
 first     {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 7, 'f': 11}
second             {'a': 5, 'b': 6, 'c': 7, 'd': 8, 'e': 9}
dtype: object

DataFrame:
    first  second
a      1     5.0
b      2     6.0
c      3     7.0
d      4     8.0
e      7     9.0
f     11     NaN
<class 'pandas.core.frame.DataFrame'>


In [21]:
#2. Program to create Dataframe of three series.

import pandas as pd 
s1 = pd.Series([1, 3, 4, 5, 6, 2, 9])		   # Define series 1 
s2 = pd.Series([1.1, 3.5, 4.7, 5.8, 2.9, 9.3]) # Define series 2 
s3 = pd.Series(['a', 'b', 'c', 'd', 'e'])	   # Define series 3 

Data ={'first':s1, 'second':s2, 'third':s3}  # Define Data 

dfseries = pd.DataFrame(Data)			    # Create DataFrame 
print("DataFrame:\n",dfseries)

my_series = pd.Series(Data)			       # Create DataFrame 
print("\nSeries:\n",my_series)

DataFrame:
    first  second third
0      1     1.1     a
1      3     3.5     b
2      4     4.7     c
3      5     5.8     d
4      6     2.9     e
5      2     9.3   NaN
6      9     NaN   NaN

Series:
 first     0    1
1    3
2    4
3    5
4    6
5    2
6   ...
second    0    1.1
1    3.5
2    4.7
3    5.8
4    2.9
5...
third      0    a
1    b
2    c
3    d
4    e
dtype: object
dtype: object


In [25]:
#3. Program to create DataFrame from 2D array. 
import pandas as pd       # Import Library

d1 =[[2, 3, 4],[5, 6, 7]] # Define 2d array 1 
d2 =[[2, 4, 8],[1, 3, 9]] # Define 2d array 2 
Data ={'first': d1, 'second': d2} # Define Data 

df2d = pd.DataFrame(Data) # Create DataFrame 
print("DataFrame:\n",df2d)

my_series= pd.Series(Data) # Create Series
print("\nSeries:\n",my_series)

DataFrame:
        first     second
0  [2, 3, 4]  [2, 4, 8]
1  [5, 6, 7]  [1, 3, 9]

Series:
 first     [[2, 3, 4], [5, 6, 7]]
second    [[2, 4, 8], [1, 3, 9]]
dtype: object


In [31]:
#4. Program to create DataFrame from numpy array. 
import numpy as np
d1 =np.array([2, 3, 4,5, 6, 7]) # Define array 1 
d2 =np.array([2, 4, 8,1, 3, 9]) # Define array 2 
Data ={'first': d1, 'second': d2} # Define Data 

df2d = pd.DataFrame(Data,index=[1,2,3,4,5,6]) # Create DataFrame 
print("DataFrame:\n",df2d)

my_series= pd.Series(Data) # Create Series
print("\nSeries:\n",my_series)

DataFrame:
    first  second
1      2       2
2      3       4
3      4       8
4      5       1
5      6       3
6      7       9

Series:
 first     [2, 3, 4, 5, 6, 7]
second    [2, 4, 8, 1, 3, 9]
dtype: object


## There are two ways to store text data in Pandas:
1. Object dtype
2. String dtype

*String dtype is recommended to store text data.

In [43]:
import pandas as pd
s=pd.Series(['a','b','c'])
print(s)
#print("\n",pd.Series(['a', 'b', 'c'], dtype="string"))
#print("\n",pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype()))
#print(s.astype("String"))

0    a
1    b
2    c
dtype: object
