In [None]:
people={
    "first":["Corey","Jane","John"],
    "last":["Schafer","Doe","Doe"],
    "email":["coreymschaefer@gmail.com","janedoe@yahoo.com","johndoe@hotmail.com"]
}

In [None]:
import pandas as pd
df=pd.DataFrame(people)

In [None]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
1,Jane,Doe,janedoe@yahoo.com
2,John,Doe,johndoe@hotmail.com


In [None]:
people["email"]

['coreymschaefer@gmail.com', 'janedoe@yahoo.com', 'johndoe@hotmail.com']

In [None]:
df['email']

0    coreymschaefer@gmail.com
1           janedoe@yahoo.com
2         johndoe@hotmail.com
Name: email, dtype: object

In [None]:
#rows of data; Series is a single column of rows
type(df['email'])

pandas.core.series.Series

In [None]:
#dataframe is a container for multiple series objects

In [None]:
df.email

0    coreymschaefer@gmail.com
1           janedoe@yahoo.com
2         johndoe@hotmail.com
Name: email, dtype: object

In [None]:
#accessing multiple columns. We will pass a list inside the list
#Since this has multiple columns now, we can no longer call this a Series
df[['last','email']]

Unnamed: 0,last,email
0,Schafer,coreymschaefer@gmail.com
1,Doe,janedoe@yahoo.com
2,Doe,johndoe@hotmail.com


In [None]:
#to see all of columns
df.columns

Index(['first', 'last', 'email'], dtype='object')

In [None]:
#iloc is integer location
#Series containing the first row of data
df.iloc[0]

first                       Corey
last                      Schafer
email    coreymschaefer@gmail.com
Name: 0, dtype: object

In [None]:
#1st and 2nd row
df.iloc[[0,1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
1,Jane,Doe,janedoe@yahoo.com


In [None]:
#specifying the columns we want
df.iloc[[0,1],2]

0    coreymschaefer@gmail.com
1           janedoe@yahoo.com
Name: email, dtype: object

In [None]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
1,Jane,Doe,janedoe@yahoo.com
2,John,Doe,johndoe@hotmail.com


In [None]:
df.loc[0]

first                       Corey
last                      Schafer
email    coreymschaefer@gmail.com
Name: 0, dtype: object

In [None]:
df.loc[[0,1]]

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
1,Jane,Doe,janedoe@yahoo.com


In [None]:
#there is a difference between loc and iloc
df.loc[[0,1],'email']

0    coreymschaefer@gmail.com
1           janedoe@yahoo.com
Name: email, dtype: object

In [None]:
df.loc[[0,1],['email','last']]

Unnamed: 0,email,last
0,coreymschaefer@gmail.com,Schafer
1,janedoe@yahoo.com,Doe


In [None]:
#setting a specific column as index values instead of automated 0,1,...n
df.set_index('email')

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
coreymschaefer@gmail.com,Corey,Schafer
janedoe@yahoo.com,Jane,Doe
johndoe@hotmail.com,John,Doe


In [None]:
#dataframe is not actually changed
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
1,Jane,Doe,janedoe@yahoo.com
2,John,Doe,johndoe@hotmail.com


In [None]:
#we want to actually set our index and have that change carry over to the actual dataframe
df.set_index('email',inplace=True)

In [None]:
df

Unnamed: 0_level_0,first,last
email,Unnamed: 1_level_1,Unnamed: 2_level_1
coreymschaefer@gmail.com,Corey,Schafer
janedoe@yahoo.com,Jane,Doe
johndoe@hotmail.com,John,Doe


In [None]:
df.index

Index(['coreymschaefer@gmail.com', 'janedoe@yahoo.com', 'johndoe@hotmail.com'], dtype='object', name='email')

In [None]:
df.loc['janedoe@yahoo.com']

first    Jane
last      Doe
Name: janedoe@yahoo.com, dtype: object

In [None]:
#what if i want a specific column's value of a specific index
df.loc['janedoe@yahoo.com','last']

'Doe'

In [None]:
#df.loc[0] will not work anymore as we do not have the default row index anymore
df.iloc[0]

first      Corey
last     Schafer
Name: coreymschaefer@gmail.com, dtype: object

In [None]:
#we can bring back our default index
df.reset_index(inplace=True)

In [None]:
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
#we want dataframe with lastname as 'Doe'
df['last']=='Doe'

0    False
1     True
2     True
Name: last, dtype: bool

In [None]:
filt= (df['last']=='Doe')

In [None]:
df[filt]

Unnamed: 0,email,first,last
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
#df.loc[...] is used to look up rows and cols by labels
#we can also passa set of booleans to filter data out
df.loc[filt]

Unnamed: 0,email,first,last
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
df.loc[filt,'email']

1      janedoe@yahoo.com
2    johndoe@hotmail.com
Name: email, dtype: object

In [None]:
filt=(df['last']=='Doe') & (df['first']=='John')
df.loc[filt,'email']

2    johndoe@hotmail.com
Name: email, dtype: object

In [None]:
filt=(df['last']=='Schafer') | (df['first']=='John')
df.loc[filt,'email']

0    coreymschaefer@gmail.com
2         johndoe@hotmail.com
Name: email, dtype: object

In [None]:
#we do not want rows that has lastname Schafer and firstname as john
df.loc[~filt,'email']

1    janedoe@yahoo.com
Name: email, dtype: object

In [None]:
df.columns

Index(['email', 'first', 'last'], dtype='object')

In [None]:
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
df.columns=['email','first_name','last_name']

In [None]:
df

Unnamed: 0,email,first_name,last_name
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
#UpperCase Comprehension
df.columns = [x.upper() for x in df.columns]
df

Unnamed: 0,EMAIL,FIRST_NAME,LAST_NAME
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
df.columns = df.columns.str.replace('_',' ')
df

Unnamed: 0,EMAIL,FIRST NAME,LAST NAME
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
df.columns = df.columns.str.replace(' ','_')
df.columns = [x.lower() for x in df.columns]
df

Unnamed: 0,email,first_name,last_name
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
df.rename(columns={"first_name":"first","last_name":"last"})
df

Unnamed: 0,email,first_name,last_name
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
df.rename(columns={"first_name":"first","last_name":"last"},inplace=True)
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@hotmail.com,John,Doe


In [None]:
#relpacing John's last name with Smith
df.loc[2]

email    johndoe@hotmail.com
first                   John
last                     Doe
Name: 2, dtype: object

In [None]:
df.loc[2]=["johnsmith@email.com",'John',"Smith"]

In [None]:
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johnsmith@email.com,John,Smith


In [None]:
#what if there are too many columns
df.loc[2,['last','email']]
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johnsmith@email.com,John,Smith


In [None]:
df.loc[2,['last','email']]=['Green','johngreen@email.com']
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johngreen@email.com,John,Green


In [None]:
#Another attribute for changing a single column value
df.at[2,'last']='Doe'
df.at[2,'email']='johndoe@email.com'
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@email.com,John,Doe


In [None]:
df['email'].str.upper()

0    COREYMSCHAEFER@GMAIL.COM
1           JANEDOE@YAHOO.COM
2           JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [None]:
df['email']=df['email'].str.upper()
df

Unnamed: 0,email,first,last
0,COREYMSCHAEFER@GMAIL.COM,Corey,Schafer
1,JANEDOE@YAHOO.COM,Jane,Doe
2,JOHNDOE@EMAIL.COM,John,Doe


In [None]:
df['email']=df['email'].str.lower()
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@email.com,John,Doe


In [None]:
#apply,applymap,map,replace
#Apply
#want to see the length of all our email IDs
df['email'].apply(len)

0    24
1    17
2    17
Name: email, dtype: int64

In [None]:
def update_email(email):
  return email.upper()

In [None]:
df['email'].apply(update_email)

0    COREYMSCHAEFER@GMAIL.COM
1           JANEDOE@YAHOO.COM
2           JOHNDOE@EMAIL.COM
Name: email, dtype: object

In [None]:
df['email']=df['email'].apply(update_email)
df

Unnamed: 0,email,first,last
0,COREYMSCHAEFER@GMAIL.COM,Corey,Schafer
1,JANEDOE@YAHOO.COM,Jane,Doe
2,JOHNDOE@EMAIL.COM,John,Doe


In [None]:
df['email']=df['email'].apply(lambda x: x.lower())
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@email.com,John,Doe


In [None]:
df['email'].apply(len)

0    24
1    17
2    17
Name: email, dtype: int64

In [None]:
df.apply(len) #number of rows in each column

email    3
first    3
last     3
dtype: int64

In [None]:
len(df['email'])

3

In [None]:
df.apply(len,axis='columns')

0    3
1    3
2    3
dtype: int64

In [None]:
df.apply(pd.Series.min)

email    coreymschaefer@gmail.com
first                       Corey
last                          Doe
dtype: object

In [None]:
df.apply(lambda x:x.min())

email    coreymschaefer@gmail.com
first                       Corey
last                          Doe
dtype: object

In [None]:
#applymap only works on dataframes
#unlike apply which works on both Series and DataFrames
#applymap runs apply function to every individual element in our dataframe
df.applymap(len)

Unnamed: 0,email,first,last
0,24,5,7
1,17,4,3
2,17,4,3


In [None]:
df.applymap(str.lower)

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,corey,schafer
1,janedoe@yahoo.com,jane,doe
2,johndoe@email.com,john,doe


In [None]:
#map
df['first'].map({"Corey":"Chris","Jane":"Mary"})
#the ones that are not replaced in the 'first' column
#will be NaN

0    Chris
1     Mary
2      NaN
Name: first, dtype: object

In [None]:
df

Unnamed: 0,email,first,last
0,coreymschaefer@gmail.com,Corey,Schafer
1,janedoe@yahoo.com,Jane,Doe
2,johndoe@email.com,John,Doe


In [None]:
df['first'].replace({"Corey":"Chris","Jane":"Mary"})

0    Chris
1     Mary
2     John
Name: first, dtype: object

In [None]:
df['first']+' '+df['last']

0    Corey Schafer
1         Jane Doe
2         John Doe
dtype: object

In [None]:
df['full_name'] = df['first']+' '+df['last']
df

Unnamed: 0,email,first,last,full_name
0,coreymschaefer@gmail.com,Corey,Schafer,Corey Schafer
1,janedoe@yahoo.com,Jane,Doe,Jane Doe
2,johndoe@email.com,John,Doe,John Doe


In [None]:
df.drop(columns=['first','last'],inplace=True)

In [None]:
 df['full_name'].str.split(' ',expand=True)

Unnamed: 0,0,1
0,Corey,Schafer
1,Jane,Doe
2,John,Doe


In [None]:
 df[['first','last']]=df['full_name'].str.split(' ',expand=True)

In [None]:
df

Unnamed: 0,email,full_name,first,last
0,coreymschaefer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@yahoo.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Doe,John,Doe


In [None]:
#add a single row
df.append({'first':'Tony'},ignore_index=True)

  df.append({'first':'Tony'},ignore_index=True)


Unnamed: 0,email,full_name,first,last
0,coreymschaefer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@yahoo.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Doe,John,Doe
3,,,Tony,


In [None]:
people={
    "first":["Tony","Steve"],
    "last":["Stark","Rogers"],
    "email":["IronMan@avenge.com","Cap@avenge.com"]
}
df2=pd.DataFrame(people)
df2

Unnamed: 0,first,last,email
0,Tony,Stark,IronMan@avenge.com
1,Steve,Rogers,Cap@avenge.com


In [None]:
#add df2 to df
df.append(df2,ignore_index=True)

  df.append(df2,ignore_index=True)


Unnamed: 0,email,full_name,first,last
0,coreymschaefer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@yahoo.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [None]:
df.append(df2,ignore_index=True,sort=False)

  df.append(df2,ignore_index=True,sort=False)


Unnamed: 0,email,full_name,first,last
0,coreymschaefer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@yahoo.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [None]:
#we have no inplace in this
df=df.append(df2,ignore_index=True,sort=False)

  df=df.append(df2,ignore_index=True,sort=False)


In [None]:
df

Unnamed: 0,email,full_name,first,last
0,coreymschaefer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@yahoo.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [None]:
#remove rows
df.drop(index=4)

Unnamed: 0,email,full_name,first,last
0,coreymschaefer@gmail.com,Corey Schafer,Corey,Schafer
1,janedoe@yahoo.com,Jane Doe,Jane,Doe
2,johndoe@email.com,John Doe,John,Doe
3,IronMan@avenge.com,,Tony,Stark


In [None]:
#remove whose last name is Doe
filt= df['last']=='Doe'
df.drop(index = df[filt].index)

Unnamed: 0,email,full_name,first,last
0,coreymschaefer@gmail.com,Corey Schafer,Corey,Schafer
3,IronMan@avenge.com,,Tony,Stark
4,Cap@avenge.com,,Steve,Rogers


In [None]:
people={
    "first":["Corey","Jane","John","Adam"],
    "last":["Schafer","Doe","Doe","Doe"],
    "email":["coreymschaefer@gmail.com","janedoe@yahoo.com","johndoe@hotmail.com","A@email.com"]
}
df=pd.DataFrame(people)

In [None]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
1,Jane,Doe,janedoe@yahoo.com
2,John,Doe,johndoe@hotmail.com
3,Adam,Doe,A@email.com


In [None]:
df.sort_values(by='last')

Unnamed: 0,first,last,email
1,Jane,Doe,janedoe@yahoo.com
2,John,Doe,johndoe@hotmail.com
3,Adam,Doe,A@email.com
0,Corey,Schafer,coreymschaefer@gmail.com


In [None]:
df.sort_values(by='last',ascending=False)

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
1,Jane,Doe,janedoe@yahoo.com
2,John,Doe,johndoe@hotmail.com
3,Adam,Doe,A@email.com


In [None]:
df.sort_values(by=['last','first'],ascending=False)

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
2,John,Doe,johndoe@hotmail.com
1,Jane,Doe,janedoe@yahoo.com
3,Adam,Doe,A@email.com


In [None]:
df.sort_values(by=['last','first'],ascending=[False,True])

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
3,Adam,Doe,A@email.com
1,Jane,Doe,janedoe@yahoo.com
2,John,Doe,johndoe@hotmail.com


In [None]:
df.sort_values(by=['last','first'],ascending=[False,True],inplace=True)

In [None]:
df

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
3,Adam,Doe,A@email.com
1,Jane,Doe,janedoe@yahoo.com
2,John,Doe,johndoe@hotmail.com


In [None]:
df.sort_index()

Unnamed: 0,first,last,email
0,Corey,Schafer,coreymschaefer@gmail.com
1,Jane,Doe,janedoe@yahoo.com
2,John,Doe,johndoe@hotmail.com
3,Adam,Doe,A@email.com


In [None]:
df['last'].sort_values()

3        Doe
1        Doe
2        Doe
0    Schafer
Name: last, dtype: object

In [4]:
import numpy as np
import pandas as pd

In [5]:
people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

In [6]:
df=pd.DataFrame(people)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [7]:
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [10]:
#default arguments
#since it is set to index, it looks at rows. We can change it to cols
#drop rows with any missing values
df.dropna(axis='index',how='any')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
6,,Missing,,Missing


In [11]:
df.dropna(axis='index',how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [12]:
df.dropna(axis='columns',how='all')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [13]:
df.dropna(axis='columns',how='any')

0
1
2
3
4
5
6


In [14]:
#we want rows that atleast their email IDs filled in
df.dropna(axis='index',how='any',subset=['email'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [16]:
#email or last should be there
df.dropna(axis='index',how='all',subset=['email','last'])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,,36
5,,,Anonymous@email.com,
6,,Missing,,Missing


In [18]:
people = {
    'first': ['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    'last': ['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    'email': ['CoreyMSchafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com', None, np.nan, 'Anonymous@email.com', 'NA'],
    'age': ['33', '55', '63', '36', None, None, 'Missing']
}

In [20]:
df=pd.DataFrame(people)

df.replace('NA',np.nan,inplace=True)
df.replace('Missing',np.nan,inplace=True)
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [21]:
df.dropna()

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [22]:
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [23]:
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Chris,Schafer,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,Anonymous@email.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [24]:
#casting datatypes
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [25]:
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [28]:
#df['age'].mean() will throw an error bcs age is object(string) datatype
#we cannot typecast age to int bcs we have NaN and NaN is treated as float
type(np.nan)

float

In [29]:
df['age'] = df['age'].astype(float)

In [30]:
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,CoreyMSchafer@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [31]:
type(df['age'])

pandas.core.series.Series

In [32]:
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [33]:
df['age'].mean()

46.75

In [34]:
#df.astype()