In [4]:
# Sample records in column-oriented form: each key is a column label and
# each list holds that column's values, one entry per person.
people = dict(
    first=['corey', 'Jane', 'John'],
    last=['Schafer', 'Doe', 'Doe'],
    email=[
        'CoreySchafer@gmail.com',
        'JaneDoe@gmail.com',
        'JohnDoe@gmail.com',
    ],
)

In [5]:
import pandas as pd

In [6]:
#Build a dataframe from the dictionary: its keys become the column labels and its lists the column values
df = pd.DataFrame(data=people)

In [7]:
#Display the dataframe that was just built
df

Unnamed: 0,first,last,email
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [8]:
df.columns #Returns the Index holding the labels of the columns present in the dataframe

Index(['first', 'last', 'email'], dtype='object')

## Updating the column names

In [9]:
#Rename every column at once by assigning a full list of new labels, one per column and in column order
df.columns= ['first_name', 'last_name', 'email']
df.columns

Index(['first_name', 'last_name', 'email'], dtype='object')

In [10]:
#Display the dataframe to confirm the new column names
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [11]:
#A bulk transformation of the labels (here: upper-casing every column name) can be written as a list comprehension over df.columns.
df.columns = [label.upper() for label in df.columns]
df

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [12]:
#We could also replace special characters or general words in the column names using the string replace function.
#Here every space would become an underscore; the current names contain no spaces, so the labels are unchanged.
df.columns= df.columns.str.replace(' ', '_')
df

Unnamed: 0,FIRST_NAME,LAST_NAME,EMAIL
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [13]:
#To rename specific columns, pass rename() a dictionary that maps each old name to the new name we want to see
df.rename(columns={'FIRST_NAME': 'first_name', 'LAST_NAME':'last_name', 'EMAIL': 'email'}, inplace= True) #Note: inplace=True applies the change to df itself
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


## Updating the rows in a dataframe

In [14]:
#To replace the whole record (row) at a particular index label, assign one value per column
df.loc[2]= ['John', 'Smith', 'JohnSmith@gmail.com']
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Smith,JohnSmith@gmail.com


In [15]:
#We could also name only the columns we want to update in that row, without having to write out the whole row.
df.loc[2, ['last_name', 'email']]= ['Doe', 'JohnDoe@gmail.com']
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [16]:
#A single value is updated by passing one row label and one column label to loc
df.loc[2, 'last_name']= 'Smith'
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Smith,JohnDoe@gmail.com


In [17]:
#We could also use the at indexer, another pandas accessor, to set a single value by row label and column label
df.at[2, 'last_name']= 'Doe'
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Doe,JohnDoe@gmail.com


In [18]:
#We could also update the rows selected by a filter (a boolean mask), using the loc indexer to do the assignment
filt= (df['email'] == 'JohnDoe@gmail.com')
df.loc[filt, 'last_name'] = 'Smith'
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,CoreySchafer@gmail.com
1,Jane,Doe,JaneDoe@gmail.com
2,John,Smith,JohnDoe@gmail.com


In [19]:
#Working on multiple rows: update every value of a column in one statement
df['email']= df['email'].str.lower() #To convert all the emails present in the dataframe to lowercase
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,coreyschafer@gmail.com
1,Jane,Doe,janedoe@gmail.com
2,John,Smith,johndoe@gmail.com


In [20]:
df['email'].apply(len) #To get the number of characters in each email present in the dataframe

0    22
1    17
2    17
Name: email, dtype: int64

In [21]:
#The same kind of operation can be wrapped in a named function and handed to apply()
def update_email(email):
    """Return the given email string converted to upper case."""
    uppercased = email.upper()
    return uppercased

In [24]:
#apply() calls update_email on every value of the column and returns the results as a series
df['email'].apply(update_email)

0    COREYSCHAFER@GMAIL.COM
1         JANEDOE@GMAIL.COM
2         JOHNDOE@GMAIL.COM
Name: email, dtype: object

In [25]:
#Assigning the result back to the column stores the change in our dataframe
df['email']= df['email'].apply(update_email)
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,COREYSCHAFER@GMAIL.COM
1,Jane,Doe,JANEDOE@GMAIL.COM
2,John,Smith,JOHNDOE@GMAIL.COM


In [26]:
#An anonymous (lambda) function can be passed to apply() instead of a named one
df['email'] = df['email'].apply(lambda address: address.lower())
df

Unnamed: 0,first_name,last_name,email
0,corey,Schafer,coreyschafer@gmail.com
1,Jane,Doe,janedoe@gmail.com
2,John,Smith,johndoe@gmail.com


In [27]:
#Note: when we run the apply function on a series, it runs the function on every value in that series.
#But when we run the apply function on a dataframe, it runs the function on each column (the default) or on each row of that dataframe.

In [28]:
#Running apply(len) on a series: len is called on each individual value
df['email'].apply(len) #The function returns the number of characters we have in each string (email)

0    22
1    17
2    17
Name: email, dtype: int64

In [29]:
#Running the apply len function on a dataframe
df.apply(len) #The function takes on a different behaviour: len is called on each column (a series), so it returns the number of elements present in each column

first_name    3
last_name     3
email         3
dtype: int64

In [30]:
#We could equally do this for each row, by changing the axis
df.apply(len, axis='columns') #len is now called on each row, so it returns the number of values (one per column) in each row

0    3
1    3
2    3
dtype: int64

In [None]:
#We could use the len function directly on the email column to get the total number of elements (emails) present
len(df['email'])

In [None]:
#Other functions we could use alongside the apply function
#pd.Series.min is called on each column, giving the minimum value of every column
df.apply(pd.Series.min)

In [None]:
#The same per-column minimum, written with a lambda that receives each column as a series
df.apply(lambda column: column.min())

In [None]:
#To operate on each individual element in the dataframe we could use the applymap() function
#For example, we could use the applymap(len) function to get the length of every element in the dataframe
#Also note that the applymap() function only works on dataframes and not on series
#NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of DataFrame.map - confirm against the installed version
df.applymap(len)

In [None]:
#More operations using the applymap() function: lower-case every element of the dataframe
df.applymap(str.lower)

## Map Method

In [None]:
#The map method is used for substituting each value in a series with another value; here it is called on a series.
#Values that are not keys of the dictionary (here 'John') come back as NaN.
#For example
df['first_name'].map({'corey': 'Chris', 'Jane': 'Mary'})

In [None]:
#To substitute only some values in the series while keeping every other value as it is (i.e. avoiding the NaN that map returns for values we gave no substitute for),
#we could use the replace method
df['first_name'].replace({'corey': 'Chris', 'Jane': 'Mary'})

## Application with larger datasets

In [None]:
#Load the survey results (indexed by the 'Respondent' column) and the survey schema (indexed by the 'Column' column).
#Note: this rebinds df, so the small example dataframe used so far is no longer reachable under that name.
df = pd.read_csv('data/survey_results_public.csv', index_col='Respondent')
schema_df= pd.read_csv('data/survey_results_schema.csv', index_col='Column')

In [None]:
#Raise the display limits so that up to 85 columns and 85 rows are shown before pandas truncates the output
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

In [None]:
#Preview the first few rows of the survey data
df.head()

In [None]:
#Renaming a column
df.rename(columns= {'ConvertedComp': 'SalaryUSD'}, inplace = True) #inplace=True makes the change take effect on df itself.

In [None]:
#Select the column by its new name to confirm the rename
df['SalaryUSD']

In [None]:
#We could also apply the map function to a column of this dataframe; first look at the column's current values
df['Hobbyist']

In [None]:
#Preview the mapping: 'Yes'/'No' answers become real booleans (True/False).
#Mapping to the strings 'True'/'False' would leave text in the column, and both of those strings are truthy.
df['Hobbyist'].map({'Yes': True, 'No': False})

In [None]:
#Placing the modified values into the dataframe as real booleans (not the strings 'True'/'False', which are both truthy).
#Note: map returns NaN for any value that is not a key of the dictionary, so run this cell only once on freshly loaded data.
df['Hobbyist']= df['Hobbyist'].map({'Yes': True, 'No': False})

In [None]:
#Show only the first few rows to confirm the updated values; displaying the whole survey dataframe is unnecessary
df.head()

In [None]:
#Note: use the replace method when you want to change only certain values in a series or column while keeping the current values of all the others.