In [1]:
# Trying out some basic pandas concepts.

# A plain dictionary like this one holds the data for a single person only.
person = dict(
    first="Andy",
    last="Zed",
    email="andyz@gmail.com",
)


In [2]:
# To represent data for multiple people,
# make each value in the dictionary a list.

people = dict(
    first=["Andy"],
    last=["Zed"],
    email=["andyz@gmail.com"],
)

In [3]:
# Add more entries to each list: values at the same position across the lists
# belong to the same person (the second value is the second person, and so on).

people = dict(
    first=["Andy", "Jane", "John"],
    last=["Zed", "Doe", "Doe"],
    email=["andyz@gmail.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)

In [4]:
# Accessing the emails is straightforward: look up the 'email' key.
people['email']

['andyz@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com']

In [5]:
# For further functionality, let's use pandas and its DataFrame.
import pandas as pd

# Construct the DataFrame from the dictionary of lists.
df = pd.DataFrame(people)

In [6]:
df

Unnamed: 0,first,last,email
0,Andy,Zed,andyz@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [7]:
# A column of the DataFrame is accessed with the same bracket syntax as a dictionary key.
# Accessing a single column:
df['first']

0    Andy
1    Jane
2    John
Name: first, dtype: object

In [8]:
# Let's check the type of a single column.
type(df['first'])


# The result is pandas.core.series.Series.
# A Series can be thought of as the rows of a single column, whereas a
# DataFrame has multiple rows and columns;
# in other words, a DataFrame is a container of multiple Series.


pandas.core.series.Series

In [9]:
# Accessing multiple columns:
# pass a list containing the column names.

df[['email', 'first']]

Unnamed: 0,email,first
0,andyz@gmail.com,Andy
1,JaneDoe@email.com,Jane
2,JohnDoe@email.com,John


In [10]:
# If the DataFrame has a lot of columns,
# their names can be listed easily:
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [11]:
# To get rows we use the
# loc and iloc indexers.

# iloc allows us to access rows by integer location.

# Here we select the first row (position 0), which is a single row.

df.iloc[0]

first               Andy
last                 Zed
email    andyz@gmail.com
Name: 0, dtype: object

In [12]:
# To access multiple rows, pass a list of the row positions.
df.iloc[[0, 1]]

Unnamed: 0,first,last,email
0,Andy,Zed,andyz@gmail.com
1,Jane,Doe,JaneDoe@email.com


In [13]:
# To get a specific column for the selected rows, pass a column position as the second argument to iloc.
# For example, 'email' is the third column, so its integer position is 2.
# iloc always takes integer positions, so pass an integer.



df.iloc[[0, 1], 2]

0      andyz@gmail.com
1    JaneDoe@email.com
Name: email, dtype: object

In [14]:
# To access rows by label we use loc.
# For this DataFrame the labels are the default integer index, but they can be changed to strings,
# and columns can be selected by their names.

df.loc[[2, 0], ['email', 'last']]

Unnamed: 0,email,last
2,JohnDoe@email.com,Doe
0,andyz@gmail.com,Zed


In [15]:
df

Unnamed: 0,first,last,email
0,Andy,Zed,andyz@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [16]:
# What if we want to identify the rows by something other than the default index?
# set_index makes the 'email' column the index of the DataFrame.

df.set_index('email')

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
andyz@gmail.com,Andy,Zed
JaneDoe@email.com,Jane,Doe
JohnDoe@email.com,John,Doe


In [17]:
# But df itself is not changed: the set_index call only returned a new DataFrame.
df

Unnamed: 0,first,last,email
0,Andy,Zed,andyz@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [18]:
# To actually change df so that email becomes its index, pass inplace=True.
# This can be useful because the email is a unique value that can be used to look up other info,
# for example as a label in loc[].

df.set_index('email', inplace=True)

In [19]:
df        # now it is changed; the index can be reset again after the manipulation

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
andyz@gmail.com,Andy,Zed
JaneDoe@email.com,Jane,Doe
JohnDoe@email.com,John,Doe


In [20]:
# Let's check the index.
df.index

Index(['andyz@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'], dtype='object', name='email')

In [21]:
df.columns  # email is no longer listed because it is now the index

Index(['first', 'last'], dtype='object')

In [22]:
# With email as the index, a row can be looked up by its email label.
df.loc['andyz@gmail.com']

first    Andy
last      Zed
Name: andyz@gmail.com, dtype: object

In [23]:
# To restore the default integer index (email becomes a regular column again):
df.reset_index(inplace=True)

In [24]:
df   # the default index is back; note that 'email' is now the first column instead of the last

Unnamed: 0,email,first,last
0,andyz@gmail.com,Andy,Zed
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [25]:
# Let's apply some filters to the DataFrame:
# get everyone whose last name is Doe.
# First create a filter,
# then pass it to the DataFrame.

filt = (df['last'] == 'Doe')

In [26]:
df[filt]

Unnamed: 0,email,first,last
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [27]:
# It is better to use the loc[] indexer:
# it selects the same rows, but also accepts a second argument to pick a column, such as 'email'.

df.loc[filt, 'email']

1    JaneDoe@email.com
2    JohnDoe@email.com
Name: email, dtype: object

In [28]:
# To combine conditions with "and" / "or" here, we have to use the symbols
# & and | instead of the keywords.

filt = (df['last'] == 'Doe') & (df['first'] == 'John') 


In [29]:
df.loc[filt, 'email']

2    JohnDoe@email.com
Name: email, dtype: object

In [30]:
filt = (df['last'] == 'Doe') | (df['first'] == 'John')

In [31]:
df.loc[filt, 'email']

1    JaneDoe@email.com
2    JohnDoe@email.com
Name: email, dtype: object

In [32]:
# To negate the filter (select everything except the rows it matches),
# invert the boolean mask with the `~` operator, which is the documented
# way to write "not" for a pandas boolean filter.

df.loc[~filt, 'email']

0    andyz@gmail.com
Name: email, dtype: object

In [33]:
df

Unnamed: 0,email,first,last
0,andyz@gmail.com,Andy,Zed
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [34]:
df.columns

Index(['email', 'first', 'last'], dtype='object')

In [35]:
# Let's alter some columns.
# To change the names of all the columns, assign a new list to df.columns.

df.columns = ['email', 'first_name', 'last_name']

In [36]:
df

Unnamed: 0,email,first_name,last_name
0,andyz@gmail.com,Andy,Zed
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [37]:
# To make all the column names uppercase,
# use a list comprehension over the current names.

df.columns = [i.upper() for i in df.columns]

In [38]:
df

Unnamed: 0,EMAIL,FIRST_NAME,LAST_NAME
0,andyz@gmail.com,Andy,Zed
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [39]:
# To replace some characters in the column names,
# e.g. replacing the underscores with spaces:

df.columns = df.columns.str.replace('_', ' ')

In [40]:
df

Unnamed: 0,EMAIL,FIRST NAME,LAST NAME
0,andyz@gmail.com,Andy,Zed
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [41]:
# How to change specific column names:
# pass a dictionary of {old name: new name} to the rename method.

df.rename(columns={'EMAIL': 'email', 'FIRST NAME': 'first', 'LAST NAME': 'last'}, inplace=True)

In [42]:
df

Unnamed: 0,email,first,last
0,andyz@gmail.com,Andy,Zed
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [43]:
# Let's change the values in a row:
# select the row label and the columns with loc, then assign the new values.

df.loc[0, ['email', 'first']] = ['bra@email.com', 'brah']

In [44]:
df

Unnamed: 0,email,first,last
0,bra@email.com,brah,Zed
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [45]:
# To change the case of the values in a column:


df['first'].str.upper()

0    BRAH
1    JANE
2    JOHN
Name: first, dtype: object

In [46]:
df  # but the data is not changed

Unnamed: 0,email,first,last
0,bra@email.com,brah,Zed
1,JaneDoe@email.com,Jane,Doe
2,JohnDoe@email.com,John,Doe


In [47]:
# To keep the change, assign the result back to the column
# (this time the values are converted to lowercase).

df['first'] = df['first'].str.lower()

In [48]:
df

Unnamed: 0,email,first,last
0,bra@email.com,brah,Zed
1,JaneDoe@email.com,jane,Doe
2,JohnDoe@email.com,john,Doe


In [49]:
# How apply() works on a Series:
# it takes a function and calls it on each value of a single column.


df['email'].apply(len)

0    13
1    17
2    17
Name: email, dtype: int64

In [50]:
# also we can pass our own functions to apply method

def revr(last):
    """Return ``last`` with its characters in reverse order.

    Args:
        last: The string to reverse (for example a last name). Any iterable
            of strings is accepted; its items are concatenated back to front.

    Returns:
        A new string with the items of ``last`` in reverse order. An empty
        input gives an empty string.
    """
    # Build the result with one join instead of prepending inside a loop,
    # which re-copies the growing string on every iteration. tuple() keeps
    # plain (non-sequence) iterables working, as the loop version did.
    return ''.join(reversed(tuple(last)))

In [51]:
# Pass the function itself to apply() -- do not call it.

df['last'].apply(revr)

# To store the result in the DataFrame, assign it back:
# df['last'] = df['last'].apply(revr)


0    deZ
1    eoD
2    eoD
Name: last, dtype: object

In [52]:
df

Unnamed: 0,email,first,last
0,bra@email.com,brah,Zed
1,JaneDoe@email.com,jane,Doe
2,JohnDoe@email.com,john,Doe


In [53]:
# Using apply() on a DataFrame.

# apply() on a DataFrame applies a function to each Series (column),
# whereas apply() on a Series applies a function to each value in the Series.

df.apply(len)      # returns how many elements are in each Series (column)


email    3
first    3
last     3
dtype: int64

In [54]:
# To apply the function to each row instead (here: the number of values in every row):
df.apply(len, axis='columns')

0    3
1    3
2    3
dtype: int64

In [55]:
# The number of elements in a single column:
len(df['email'])

3

In [56]:
# To find the minimum of each column across the whole DataFrame:
# since the values are strings rather than numbers, this gives the value
# that sorts first in each column (uppercase letters sort before lowercase
# ones, which is why an email starting with 'J' comes before 'bra@...').

df.apply(pd.Series.min)

email    JaneDoe@email.com
first                 brah
last                   Doe
dtype: object

In [58]:
df

Unnamed: 0,email,first,last
0,bra@email.com,brah,Zed
1,JaneDoe@email.com,jane,Doe
2,JohnDoe@email.com,john,Doe


In [57]:
# applymap() only works on DataFrames:
# it applies a function to every individual value in the DataFrame.
# Series objects have no applymap() method.
# NOTE(review): recent pandas releases deprecate applymap() in favour of
# DataFrame.map() -- confirm against the pandas version in use.

df.applymap(len)

Unnamed: 0,email,first,last
0,13,4,3
1,17,4,3
2,17,4,3


In [61]:
# Make all the values lowercase. The result is only displayed, not assigned,
# so df itself keeps its original values.
# NOTE(review): recent pandas releases deprecate applymap() in favour of
# DataFrame.map() -- confirm against the pandas version in use.

df.applymap(str.lower)

Unnamed: 0,email,first,last
0,bra@email.com,brah,zed
1,janedoe@email.com,jane,doe
2,johndoe@email.com,john,doe


In [75]:
# The map() method is used on a Series object
# to substitute each value with another value taken from a dictionary.
# Values that have no entry in the dictionary (here 'john') become NaN.

df['first'].map({'brah':'trevor','jane':'fluffy'})

0    trevor
1    fluffy
2       NaN
Name: first, dtype: object

In [79]:
df

Unnamed: 0,email,first,last
0,bra@email.com,brah,Zed
1,JaneDoe@email.com,jane,Doe
2,JohnDoe@email.com,john,Doe


In [80]:
# The DataFrame itself has not been changed yet;
# to store the result, assign it back to the column.
# Caution: this also overwrites 'john' with NaN, because it has no entry in
# the mapping. NOTE(review): Series.replace() with the same dictionary should
# leave unmapped values untouched -- confirm which behaviour is intended.

df['first'] = df['first'].map({'brah':'trevor','jane':'fluffy'})

In [81]:
df

Unnamed: 0,email,first,last
0,bra@email.com,trevor,Zed
1,JaneDoe@email.com,fluffy,Doe
2,JohnDoe@email.com,,Doe
