In [7]:
import pandas as pd
#Data type
df=pd.Series([1.,2.,3.,4.,5.]).astype(object)
df
"""astype() method is for specific type conversion (i.e. you can specify .astype(float64'),
.astype(float32), or .astype(float16))"""

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
dtype: object

In [8]:
df = pd.DataFrame({'A': [1, 2, 3], 'B': [1.0, 2.0, 3.0],
 'C': ['1.1.2010', '2.1.2011', '3.1.2011'],
 'D': ['1 days', '2 days', '3 days'],
 'E': ['1', '2', '3']})
df

Unnamed: 0,A,B,C,D,E
0,1,1.0,1.1.2010,1 days,1
1,2,2.0,2.1.2011,2 days,2
2,3,3.0,3.1.2011,3 days,3


In [9]:
pd.to_numeric(df['E'])

0    1
1    2
2    3
Name: E, dtype: int64

In [10]:
# Ignore the error, return the original input if it cannot be converted
pd.to_numeric(pd.Series(['1', '2', 'a']), errors='ignore')


0    1
1    2
2    a
dtype: object

In [11]:
# Return NaN when the input cannot be converted to a number
pd.to_numeric(pd.Series(['1', '2', 'a']), errors='coerce')


0    1.0
1    2.0
2    NaN
dtype: float64

In [14]:
#If need check all rows with input cannot be converted to numeric use boolean indexing with isnull:
df = pd.DataFrame({'A': [1, 'x', 'z'],
 'B': [1.0, 2.0, 3.0],
 'C': [True, False, True]})
df

Unnamed: 0,A,B,C
0,1,1.0,True
1,x,2.0,False
2,z,3.0,True


In [15]:
pd.to_numeric(df.A, errors='coerce').isnull()

0    False
1     True
2     True
Name: A, dtype: bool

In [16]:
#With include and exclude parameters you can specify which types you want
df.select_dtypes(include=['number'])

Unnamed: 0,B
0,1.0
1,2.0
2,3.0


In [17]:
df.select_dtypes(include=['number', 'bool'])

Unnamed: 0,B,C
0,1.0,True
1,2.0,False
2,3.0,True


In [21]:
df.dtypes.value_counts()

float64    1
object     1
bool       1
dtype: int64

In [22]:
#Dealing with categorical variables
df = pd.DataFrame({'Name':['John Smith', 'Mary Brown'],
 'Gender':['M', 'F'], 'Smoker':['Y', 'N']})
print(df)


         Name Gender Smoker
0  John Smith      M      Y
1  Mary Brown      F      N


In [23]:
df_with_dummies = pd.get_dummies(df, columns=['Gender', 'Smoker'])
print(df_with_dummies)

         Name  Gender_F  Gender_M  Smoker_N  Smoker_Y
0  John Smith         0         1         0         1
1  Mary Brown         1         0         1         0


In [29]:
#Duplicated data

df = pd.DataFrame({'A': ['foo', 'bar', 'baz', 'bar'],
                   'B': [1, 2, 3, 2]})
                   
mask=df.duplicated()  # returns a Boolean Series
filtered_df = df[mask]

print(filtered_df)

     A  B
3  bar  2


In [28]:
filtered_df = df[~mask]

print(filtered_df)

     A  B
0  foo  1
1  bar  2
2  baz  3


In [35]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,A,B
0,foo,1
1,bar,2
2,baz,3


In [51]:
#Counting and getting unique elements

id_numbers = pd.Series([111, 112, 112, 114, 115, 118, 114, 118, 112])
id_numbers.nunique()
id_numbers.unique()



array([111, 112, 114, 115, 118], dtype=int64)

In [52]:
df = pd.DataFrame({'Group': list('ABAABABAAB'),
'ID': [1, 1, 2, 3, 3, 2, 1, 2, 1, 3]})

df.groupby('Group')['ID'].nunique()
df.groupby('Group')['ID'].unique()

Group
A    [1, 2, 3]
B       [1, 3]
Name: ID, dtype: object

In [42]:
df = pd.DataFrame({"A":[1,1,2,3,1,1],"B":[5,4,3,4,6,7]})
df["A"].unique()


array([1, 2, 3], dtype=int64)

In [43]:
pd.unique(df['A']).tolist()

[1, 2, 3]

In [44]:
df

Unnamed: 0,A,B
0,1,5
1,1,4
2,2,3
3,3,4
4,1,6
5,1,7


In [48]:
#more complex example. Say we want to find the unique values from column 'B' where 'A'
#is equal to 1
df.loc['4', 'B'] = 4
df.loc['4', 'A'] = 1

df

Unnamed: 0,A,B
0,1.0,5.0
1,1.0,4.0
2,2.0,3.0
3,3.0,4.0
4,1.0,6.0
5,1.0,7.0
4,1.0,4.0


In [49]:
#with unique key word
pd.unique(df[df['A'] == 1 ]['B']).tolist()

[5.0, 4.0, 6.0, 7.0]

In [50]:
#without unique key word
df[df['A'] == 1]['B'].tolist()

[5.0, 4.0, 6.0, 7.0, 4.0]

In [54]:
#Getting information about DataFrames
df = pd.DataFrame({'integers': [1, 2, 3],
 'floats': [1.5, 2.5, 3],
 'text': ['a', 'b', 'c'],
 'ints with None': [1, None, 3]})
df

Unnamed: 0,integers,floats,text,ints with None
0,1,1.5,a,1.0
1,2,2.5,b,
2,3,3.0,c,3.0


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   integers        3 non-null      int64  
 1   floats          3 non-null      float64
 2   text            3 non-null      object 
 3   ints with None  2 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 224.0+ bytes


In [56]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})

In [57]:
list(df)


['a', 'b', 'c']

In [59]:
#Gotchas in pandas
import numpy as np
df=pd.DataFrame({'col':[1,np.nan]})
df==np.nan
"""This is because comparing missing value to anything results in a False """

Unnamed: 0,col
0,False
1,False


In [60]:
df=pd.DataFrame({'col':[1,np.nan]})
df.isnull()

Unnamed: 0,col
0,False
1,True


In [65]:
"""Pandas don't support missing in attributes of type integer. For example if you have missings in the
grade column"""
series=pd.Series([1,2])
series



0    1
1    2
dtype: int64

In [66]:
df=pd.DataFrame(index=[3,4])
df['col']=series
df

Unnamed: 0,col
3,
4,


In [68]:
df['col']=series.values
df

Unnamed: 0,col
3,1
4,2
