In [None]:
# Pandas contains high level data structures and manipulation tools to make data analysis fast and easy in Python.

In [1]:
import pandas as pd
from pandas import Series,DataFrame # Series and Data Frame are two data structures available in python

In [None]:
# Series
# Series is a one-dimensional array like object containing an array of data(any Numpy data type,
# and an associated array of data labels, called its index.

In [2]:
mjp=Series([5,4,3,2,1])
print(mjp) # A series is represented by index on the left and values on the right
print(mjp.values) # similar to dictionary. ".values" command returns values in a series 

0    5
1    4
2    3
3    2
4    1
dtype: int64
[5 4 3 2 1]


In [4]:
print(mjp.index) # returns the index values of the series

RangeIndex(start=0, stop=5, step=1)


In [5]:
jeeva = Series([5,4,3,2,1,-7,-29], index =['a','b','c','d','e','f','h']) # The index is specified
print(jeeva.values)
print(jeeva.index)

[  5   4   3   2   1  -7 -29]
Index(['a', 'b', 'c', 'd', 'e', 'f', 'h'], dtype='object')


In [6]:
print(jeeva)
print(jeeva['a'])

a     5
b     4
c     3
d     2
e     1
f    -7
h   -29
dtype: int64
5


In [10]:
jeeva['d']=10
print(jeeva)
print(jeeva[['a','b','e']])

a     5
b     4
c     3
d    10
e     1
f    -7
h   -29
dtype: int64
a    5
b    4
e    1
dtype: int64


In [11]:
print(jeeva[jeeva>0])
print(jeeva*2)

a     5
b     4
c     3
d    10
e     1
dtype: int64
a    10
b     8
c     6
d    20
e     2
f   -14
h   -58
dtype: int64


In [12]:
import numpy as np
np.mean(jeeva) # you can apply numpy functions to a Series

-1.8571428571428572

In [13]:
print('a' in jeeva) # checks whether the index is present in Series or not

True


In [14]:
player_salary ={'Rooney': 50000, 'Messi': 75000, 'Ronaldo': 85000, 'Fabregas':40000, 'Van persie': 67000}
new_ply=Series(player_salary) # converting a dictionary to a series
print(new_ply) # the series has keys of a dictionary

Fabregas      40000
Messi         75000
Ronaldo       85000
Rooney        50000
Van persie    67000
dtype: int64


In [15]:
players =['Klose', 'Messi', 'Ronaldo', 'Van persie', 'Ballack'] 
ply1=Series(player_salary,index=players)
print(ply1) # Since, no value was not found for Klose and Ballack, it appears as NAN

Klose             NaN
Messi         75000.0
Ronaldo       85000.0
Van persie    67000.0
Ballack           NaN
dtype: float64


In [16]:
pd.isnull(ply1) #checks for Null values in player_1, pd denotes a pandas dataframe

Klose          True
Messi         False
Ronaldo       False
Van persie    False
Ballack        True
dtype: bool

In [17]:
pd.notnull(ply1) # Checks for null values that are not Null

Klose         False
Messi          True
Ronaldo        True
Van persie     True
Ballack       False
dtype: bool

In [18]:
ply1.name='Footballers' # name for the Series
ply1.index.name='Player Names' #name of the index
ply1

Player Names
Klose             NaN
Messi         75000.0
Ronaldo       85000.0
Van persie    67000.0
Ballack           NaN
Name: Footballers, dtype: float64

In [19]:
ply1.index =['Neymar', 'Hulk', 'Pirlo', 'Buffon', 'Anderson'] # is used to alter the index of Series
ply1

Neymar          NaN
Hulk        75000.0
Pirlo       85000.0
Buffon      67000.0
Anderson        NaN
Name: Footballers, dtype: float64

In [None]:
# Data Frame
# Data frame is a spread sheet like structure, containing ordered collection of columns.
# Each column can have different value type. Data frame has both row index and column index.

In [20]:
states ={'State' :['Gujarat', 'Tamil Nadu', ' Andhra', 'Karnataka', 'Kerala'],
                  'Population': [36, 44, 67,89,34],
                  'Language' :['Gujarati', 'Tamil', 'Telugu', 'Kannada', 'Malayalam']}
india=DataFrame(states)
india

Unnamed: 0,Language,Population,State
0,Gujarati,36,Gujarat
1,Tamil,44,Tamil Nadu
2,Telugu,67,Andhra
3,Kannada,89,Karnataka
4,Malayalam,34,Kerala


In [21]:
DataFrame(states,columns=['State','Language','Population'])

Unnamed: 0,State,Language,Population
0,Gujarat,Gujarati,36
1,Tamil Nadu,Tamil,44
2,Andhra,Telugu,67
3,Karnataka,Kannada,89
4,Kerala,Malayalam,34


In [22]:
new_farme = DataFrame(states, columns=['State', 'Language', 'Population', 'Per Capita Income'], index =['a','b','c','d','e'])

In [23]:
print(new_farme.columns)
print(new_farme['State'])

Index(['State', 'Language', 'Population', 'Per Capita Income'], dtype='object')
a       Gujarat
b    Tamil Nadu
c        Andhra
d     Karnataka
e        Kerala
Name: State, dtype: object


In [24]:
new_farme.Population

a    36
b    44
c    67
d    89
e    34
Name: Population, dtype: int64

In [25]:
new_farme.ix[3]

State                Karnataka
Language               Kannada
Population                  89
Per Capita Income          NaN
Name: d, dtype: object

In [26]:
new_farme

Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,
b,Tamil Nadu,Tamil,44,
c,Andhra,Telugu,67,
d,Karnataka,Kannada,89,
e,Kerala,Malayalam,34,


In [27]:
new_farme['Per Capita Income']=99
new_farme

Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,99
b,Tamil Nadu,Tamil,44,99
c,Andhra,Telugu,67,99
d,Karnataka,Kannada,89,99
e,Kerala,Malayalam,34,99


In [28]:
new_farme['Per Capita Income']=np.arange(5)
new_farme

Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,0
b,Tamil Nadu,Tamil,44,1
c,Andhra,Telugu,67,2
d,Karnataka,Kannada,89,3
e,Kerala,Malayalam,34,4


In [29]:
series=Series([22,33,44],index=['b','c','d'])
new_farme['Per Capita Income']=series
new_farme

Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,
b,Tamil Nadu,Tamil,44,22.0
c,Andhra,Telugu,67,33.0
d,Karnataka,Kannada,89,44.0
e,Kerala,Malayalam,34,


In [30]:
new_farme['Develop']=new_farme.State=='Gujarat'
print(new_farme)
del(new_farme['Develop'])
new_farme

        State   Language  Population  Per Capita Income Develop
a     Gujarat   Gujarati          36                NaN    True
b  Tamil Nadu      Tamil          44               22.0   False
c      Andhra     Telugu          67               33.0   False
d   Karnataka    Kannada          89               44.0   False
e      Kerala  Malayalam          34                NaN   False


Unnamed: 0,State,Language,Population,Per Capita Income
a,Gujarat,Gujarati,36,
b,Tamil Nadu,Tamil,44,22.0
c,Andhra,Telugu,67,33.0
d,Karnataka,Kannada,89,44.0
e,Kerala,Malayalam,34,


In [31]:
new_data ={'Modi': {2010: 72, 2012: 78, 2014 : 98},'Rahul': {2010: 55, 2012: 34, 2014: 22}}
elections=DataFrame(new_data)
print(elections)
elections.T

      Modi  Rahul
2010    72     55
2012    78     34
2014    98     22


Unnamed: 0,2010,2012,2014
Modi,72,78,98
Rahul,55,34,22


In [32]:
DataFrame(new_data,index=[2012,2014,2016])

Unnamed: 0,Modi,Rahul
2012,78.0,34.0
2014,98.0,22.0
2016,,


In [33]:
ex= {'Gujarat':elections['Modi'][:-1], 'India': elections['Rahul'][:2]}
px =DataFrame(ex)
px

Unnamed: 0,Gujarat,India
2010,72,55
2012,78,34


In [34]:
px.index.name='year'
px.columns.name='poltitican'
px

poltitican,Gujarat,India
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,72,55
2012,78,34


In [35]:
px.values

array([[72, 55],
       [78, 34]])

In [36]:
jeeva = Series([5,4,3,2,1,-7,-29], index =['a','b','c','d','e','f','h'])
index=jeeva.index
print(index)
print(index[1:])
index['a']='x' # you cannot modify an index element. It will generate an error. In other words, they are immutable

Index(['a', 'b', 'c', 'd', 'e', 'f', 'h'], dtype='object')
Index(['b', 'c', 'd', 'e', 'f', 'h'], dtype='object')


TypeError: Index does not support mutable operations

In [37]:
print(px)
2013 in px.index

poltitican  Gujarat  India
year                      
2010             72     55
2012             78     34


False

In [None]:
# Reindex

In [38]:
var = Series(['Python', 'Java', 'c', 'c++', 'Php'], index =[5,4,3,2,1])
print(var)
var1=var.reindex([1,2,3,4,5])
print(var1)

5    Python
4      Java
3         c
2       c++
1       Php
dtype: object
1       Php
2       c++
3         c
4      Java
5    Python
dtype: object


In [39]:
var.reindex([1,2,3,4,5,6,7])

1       Php
2       c++
3         c
4      Java
5    Python
6       NaN
7       NaN
dtype: object

In [40]:
var.reindex([1,2,3,4,5,6,7],fill_value=1)

1       Php
2       c++
3         c
4      Java
5    Python
6         1
7         1
dtype: object

In [41]:
gh =Series(['Dhoni', 'Sachin', 'Kohli'], index =[0,2,4])
print(gh)
gh.reindex(range(6),method='ffill')

0     Dhoni
2    Sachin
4     Kohli
dtype: object


0     Dhoni
1     Dhoni
2    Sachin
3    Sachin
4     Kohli
5     Kohli
dtype: object

In [42]:
gh.reindex(range(6),method="bfill")

0     Dhoni
1    Sachin
2    Sachin
3     Kohli
4     Kohli
5       NaN
dtype: object

In [43]:
import numpy as np
fp = DataFrame(np.arange(9).reshape((3,3)),index =['a','b','c'], columns =['Gujarat','Tamil Nadu', 'Kerala'])
fp

Unnamed: 0,Gujarat,Tamil Nadu,Kerala
a,0,1,2
b,3,4,5
c,6,7,8


In [44]:
fp1=fp.reindex(['a', 'b', 'c', 'd'], columns = states)
fp1

Unnamed: 0,State,Population,Language
a,,,
b,,,
c,,,
d,,,


In [45]:
er = Series(np.arange(5), index =['a','b','c','d','e'])
print(er)
er.drop(['a','b'])

a    0
b    1
c    2
d    3
e    4
dtype: int64


c    2
d    3
e    4
dtype: int64

In [46]:
states ={'State' :['Gujarat', 'Tamil Nadu', ' Andhra', 'Karnataka', 'Kerala'],
                  'Population': [36, 44, 67,89,34],
                  'Language' :['Gujarati', 'Tamil', 'Telugu', 'Kannada', 'Malayalam']}
india = DataFrame(states, columns =['State', 'Population', 'Language'])
print(india)
india.drop([0,1])

        State  Population   Language
0     Gujarat          36   Gujarati
1  Tamil Nadu          44      Tamil
2      Andhra          67     Telugu
3   Karnataka          89    Kannada
4      Kerala          34  Malayalam


Unnamed: 0,State,Population,Language
2,Andhra,67,Telugu
3,Karnataka,89,Kannada
4,Kerala,34,Malayalam


In [47]:
india.drop(['Population','State'],axis=1)

Unnamed: 0,Language
0,Gujarati
1,Tamil
2,Telugu
3,Kannada
4,Malayalam


In [49]:
#Selection, Indexing and Filtering

In [50]:
var = Series(['Python', 'Java', 'c', 'c++', 'Php'], index =[5,4,3,2,1])
var

5    Python
4      Java
3         c
2       c++
1       Php
dtype: object

In [51]:
print(var[5])
print(var[2:4])
print(var[var=='Python'])

Python
3      c
2    c++
dtype: object
5    Python
dtype: object


In [52]:
var[[3,2,1]]

3      c
2    c++
1    Php
dtype: object

In [53]:
states ={'State' :['Gujarat', 'Tamil Nadu', ' Andhra', 'Karnataka', 'Kerala'],
                  'Population': [36, 44, 67,89,34],
                  'Language' :['Gujarati', 'Tamil', 'Telugu', 'Kannada', 'Malayalam']}
india = DataFrame(states, columns =['State', 'Population', 'Language'])
india

Unnamed: 0,State,Population,Language
0,Gujarat,36,Gujarati
1,Tamil Nadu,44,Tamil
2,Andhra,67,Telugu
3,Karnataka,89,Kannada
4,Kerala,34,Malayalam


In [54]:
india[['Population','State']]

Unnamed: 0,Population,State
0,36,Gujarat
1,44,Tamil Nadu
2,67,Andhra
3,89,Karnataka
4,34,Kerala


In [55]:
india[india['Population']>50]

Unnamed: 0,State,Population,Language
2,Andhra,67,Telugu
3,Karnataka,89,Kannada


In [56]:
india[:3] # first three rows

Unnamed: 0,State,Population,Language
0,Gujarat,36,Gujarati
1,Tamil Nadu,44,Tamil
2,Andhra,67,Telugu


In [57]:
import pandas as pd
states ={'State' :['Gujarat', 'Tamil Nadu', ' Andhra', 'Karnataka', 'Kerala'],
                  'Population': [36, 44, 67,89,34],
                  'Language' :['Gujarati', 'Tamil', 'Telugu', 'Kannada', 'Malayalam']}
india = DataFrame(states, columns =['State', 'Population', 'Language'], index =['a', 'b', 'c', 'd', 'e'])
india

Unnamed: 0,State,Population,Language
a,Gujarat,36,Gujarati
b,Tamil Nadu,44,Tamil
c,Andhra,67,Telugu
d,Karnataka,89,Kannada
e,Kerala,34,Malayalam


In [58]:
india.ix[['a','b'],['State','Population']]  # this is how you select subset of rows

Unnamed: 0,State,Population
a,Gujarat,36
b,Tamil Nadu,44
