In [1]:
import numpy as np
import pandas as pd

In [2]:
sr = pd.Series(['A', np.nan, 7])
pd.isnull(sr)

0    False
1     True
2    False
dtype: bool

In [3]:
pd.isnull(sr).any()

True

In [4]:
pd.isnull(sr).all()

False

In [5]:
sr[sr.notnull()]

0    A
2    7
dtype: object

In [6]:
sr.fillna(0)

0    A
1    0
2    7
dtype: object

In [7]:
dfn = pd.DataFrame({"brand" : ["Ford","np.nan",np.nan], 
                    "HeadQ" : [np.nan,np.nan,"Gothenburg"]})
pd.isnull(dfn)

Unnamed: 0,brand,HeadQ
0,False,True
1,False,True
2,True,False


In [8]:
dfn.isnull()

Unnamed: 0,brand,HeadQ
0,False,True
1,False,True
2,True,False


In [9]:
dfn.isnull().sum()

brand    1
HeadQ    2
dtype: int64

In [10]:
pd.read_csv('weather_data.csv').head()

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2022,32.0,8.0,Rain
1,1/4/2022,,7.0,Sunny
2,1/5/2022,28.0,,Snow
3,1/6/2022,,9.0,Rain
4,1/7/2022,32.0,10.0,Sunny


In [11]:
df = pd.read_csv('weather_data.csv', parse_dates=["day"])
df

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,,7.0,Sunny
2,2022-01-05,28.0,,Snow
3,2022-01-06,,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny
5,2022-01-08,,,
6,2022-01-09,,7.0,Sunny
7,2022-01-10,,,
8,2022-01-11,25.0,,Sunny


__dropna__

In [12]:
df.dropna()

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
4,2022-01-07,32.0,10.0,Sunny


In [13]:
df.dropna().count()

day            2
temperature    2
windspeed      2
event          2
dtype: int64

In [14]:
df.dropna(how="all")

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,,7.0,Sunny
2,2022-01-05,28.0,,Snow
3,2022-01-06,,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny
5,2022-01-08,,,
6,2022-01-09,,7.0,Sunny
7,2022-01-10,,,
8,2022-01-11,25.0,,Sunny


In [15]:
df.dropna(thresh=2)

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,,7.0,Sunny
2,2022-01-05,28.0,,Snow
3,2022-01-06,,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny
6,2022-01-09,,7.0,Sunny
8,2022-01-11,25.0,,Sunny


__fillna__

In [16]:
df.fillna(0)

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,0.0,7.0,Sunny
2,2022-01-05,28.0,0.0,Snow
3,2022-01-06,0.0,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny
5,2022-01-08,0.0,0.0,0
6,2022-01-09,0.0,7.0,Sunny
7,2022-01-10,0.0,0.0,0
8,2022-01-11,25.0,0.0,Sunny


In [17]:
df.fillna({'temperature' : 0, 
           'windspeed' : 0, 
           'event' : 'no event'})

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,0.0,7.0,Sunny
2,2022-01-05,28.0,0.0,Snow
3,2022-01-06,0.0,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny
5,2022-01-08,0.0,0.0,no event
6,2022-01-09,0.0,7.0,Sunny
7,2022-01-10,0.0,0.0,no event
8,2022-01-11,25.0,0.0,Sunny


In [18]:
df.fillna(method="ffill")

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,32.0,7.0,Sunny
2,2022-01-05,28.0,7.0,Snow
3,2022-01-06,28.0,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny
5,2022-01-08,32.0,10.0,Sunny
6,2022-01-09,32.0,7.0,Sunny
7,2022-01-10,32.0,7.0,Sunny
8,2022-01-11,25.0,7.0,Sunny


In [19]:
df.fillna(method="ffill", limit = 1)

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,32.0,7.0,Sunny
2,2022-01-05,28.0,7.0,Snow
3,2022-01-06,28.0,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny
5,2022-01-08,32.0,10.0,Sunny
6,2022-01-09,,7.0,Sunny
7,2022-01-10,,7.0,Sunny
8,2022-01-11,25.0,,Sunny


In [20]:
df.fillna(method="ffill", axis= 1).head()

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,2022-01-04 00:00:00,7.0,Sunny
2,2022-01-05,28.0,28.0,Snow
3,2022-01-06,2022-01-06 00:00:00,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny


In [21]:
df.fillna(method="bfill").head()

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,28.0,7.0,Sunny
2,2022-01-05,28.0,9.0,Snow
3,2022-01-06,32.0,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny


In [22]:
df.set_index('day').interpolate()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,8.0,Rain
2022-01-04,30.0,7.0,Sunny
2022-01-05,28.0,8.0,Snow
2022-01-06,30.0,9.0,Rain
2022-01-07,32.0,10.0,Sunny
2022-01-08,30.25,8.5,
2022-01-09,28.5,7.0,Sunny
2022-01-10,26.75,7.0,
2022-01-11,25.0,7.0,Sunny


In [23]:
df.set_index('day').interpolate(method='time').head()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,8.0,Rain
2022-01-04,29.0,7.0,Sunny
2022-01-05,28.0,8.0,Snow
2022-01-06,30.0,9.0,Rain
2022-01-07,32.0,10.0,Sunny


__SimpleImputer__

In [24]:
from sklearn.impute import SimpleImputer

In [25]:
df

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,,7.0,Sunny
2,2022-01-05,28.0,,Snow
3,2022-01-06,,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny
5,2022-01-08,,,
6,2022-01-09,,7.0,Sunny
7,2022-01-10,,,
8,2022-01-11,25.0,,Sunny


In [26]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df.iloc[:, 1:3] = imp.fit_transform(df.iloc[:, 1:3])
df

Unnamed: 0,day,temperature,windspeed,event
0,2022-01-01,32.0,8.0,Rain
1,2022-01-04,29.25,7.0,Sunny
2,2022-01-05,28.0,8.2,Snow
3,2022-01-06,29.25,9.0,Rain
4,2022-01-07,32.0,10.0,Sunny
5,2022-01-08,29.25,8.2,
6,2022-01-09,29.25,7.0,Sunny
7,2022-01-10,29.25,8.2,
8,2022-01-11,25.0,8.2,Sunny


__Replace__

In [27]:
df2 = pd.DataFrame({ 'sex' : ['M','F','F','D','?'],
                     'age' : [29,33,24,290,39]})

df2

Unnamed: 0,sex,age
0,M,29
1,F,33
2,F,24
3,D,290
4,?,39


In [28]:
df2['sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [29]:
df2.replace(['D','?'],['F','F'], inplace=True)
df2

Unnamed: 0,sex,age
0,M,29
1,F,33
2,F,24
3,F,290
4,F,39


In [30]:
df2.loc[df2['age']>100, 'age']

3    290
Name: age, dtype: int64

In [31]:
df2.loc[df2['age']>100, 'age'] = df2.loc[df2['age']>100, 'age']/10
df2

Unnamed: 0,sex,age
0,M,29
1,F,33
2,F,24
3,F,29
4,F,39


In [32]:
grade = {'roll': [30,31,32,33,34],
         'score': ['fair', 'good', 'good','average','excellent']
        }
df3 = pd.DataFrame(grade)
df3

Unnamed: 0,roll,score
0,30,fair
1,31,good
2,32,good
3,33,average
4,34,excellent


In [33]:
df3.replace(['excellent','good','average','fair','poor'],[5,4,3,2,1])

Unnamed: 0,roll,score
0,30,2
1,31,4
2,32,4
3,33,3
4,34,5


__duplicate__

In [34]:
df4 = pd.DataFrame({'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie']})
df4.duplicated()

0    False
1     True
2    False
3     True
4     True
dtype: bool

In [35]:
df4 = pd.DataFrame({'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
                    'style': ['cup', 'cup', 'cup', 'pack', 'pack']})

df4.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [36]:
df4.duplicated(subset=['brand'])

0    False
1     True
2    False
3     True
4     True
dtype: bool

In [37]:
df4 = pd.DataFrame({'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie']})
df4.duplicated(keep='last')

0     True
1    False
2     True
3     True
4    False
dtype: bool

__Splitting Column__

In [38]:
df5 = pd.DataFrame({'Data': ['1987_M_US _1','1990?_M_UK_1','1992_F_US_2','1970?_M_   IT_1','1985_F_I  T_2']})
df5

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [39]:
df5['Data'].str.split('_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [40]:
df5['Data'].str.split('_').str.get(1)

0    M
1    M
2    F
3    M
4    F
Name: Data, dtype: object

In [41]:
df5['Data'].str.split('_', expand=True)   #return a DataFrame

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [42]:
df5 = df5['Data'].str.split('_', expand=True)
df5

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [43]:
df5.columns = ['Year', 'Sex', 'Country', 'No Children']
df5

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [44]:
df5['Year'].str.contains('\?')

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [45]:
df5['Country'].str.strip()

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [46]:
df5['Country'].str.replace(' ', '')

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

__RegEx Replace__

In [47]:
s1 = pd.Series(['Mouse2', 'dog', 'house and parrot', '23'])
s1

0              Mouse2
1                 dog
2    house and parrot
3                  23
dtype: object

In [48]:
s1.str.contains('og', case=True, regex=True)

0    False
1     True
2    False
3    False
dtype: bool

In [49]:
s1.str.contains('house|dog', regex=True)

0    False
1     True
2     True
3    False
dtype: bool

In [50]:
import re
s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)

0    False
1    False
2     True
3    False
dtype: bool

In [51]:
s1.str.contains('\d', regex=True)

0     True
1    False
2    False
3     True
dtype: bool

In [52]:
s1.str.contains('\s', regex=True)  #contains a white space character

0    False
1    False
2     True
3    False
dtype: bool

In [53]:
df6 = pd.DataFrame({'day': ['1/1/2016','1/4/2016','1/5/2016','1/6/2016','1/7/2016'],
                    'temp': ['32 F',np.nan,'28 C',np.nan,'32'],
                    'windspeed': ['8 mph','7 mph',np.nan,9,np.nan],
                    'event': ['Rain', 'Sunny', 'Snow',np.nan,'Sunny']})
df6

Unnamed: 0,day,temp,windspeed,event
0,1/1/2016,32 F,8 mph,Rain
1,1/4/2016,,7 mph,Sunny
2,1/5/2016,28 C,,Snow
3,1/6/2016,,9,
4,1/7/2016,32,,Sunny


In [54]:
df6.replace('[A-Za-z]','', regex=True)

Unnamed: 0,day,temp,windspeed,event
0,1/1/2016,32.0,8.0,
1,1/4/2016,,7.0,
2,1/5/2016,28.0,,
3,1/6/2016,,9.0,
4,1/7/2016,32.0,,


In [55]:
df6.replace({'temperature': '[A-Za-z]', 'windspeed': '[A-Za-z]'},'', regex=True)

Unnamed: 0,day,temp,windspeed,event
0,1/1/2016,32 F,8.0,Rain
1,1/4/2016,,7.0,Sunny
2,1/5/2016,28 C,,Snow
3,1/6/2016,,9.0,
4,1/7/2016,32,,Sunny


__Handling Date and Time__

In [56]:
df = pd.DataFrame(np.array(['2022-01-21 00:22:00','2021-11-21 02:30:00','2022-03-11 23:01:00']), columns = ['Date'])
df

Unnamed: 0,Date
0,2022-01-21 00:22:00
1,2021-11-21 02:30:00
2,2022-03-11 23:01:00


In [57]:
df['Date'] = pd.to_datetime(df['Date'])

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 152.0 bytes


In [59]:
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['month_name'] = df['Date'].dt.month_name()
df['month_quarter'] = df['Date'].dt.quarter
df['day'] = df['Date'].dt.day
df['day_name'] = df['Date'].dt.day_name()
df['day_of_week'] = df['Date'].dt.dayofweek
df['day_of_year'] = df['Date'].dt.dayofyear
df['day_is_weekend'] =  np.where(df['day_name'].isin(['Friday', 'Saturday']), 1, 0)
df['time_elapsed'] = pd.to_datetime("today") - df['Date']
df['time_elapsed_days'] = (pd.to_datetime("today") - df['Date']).dt.days

In [60]:
df

Unnamed: 0,Date,year,month,month_name,month_quarter,day,day_name,day_of_week,day_of_year,day_is_weekend,time_elapsed,time_elapsed_days
0,2022-01-21 00:22:00,2022,1,January,1,21,Friday,4,21,1,311 days 14:54:02.344065,311
1,2021-11-21 02:30:00,2021,11,November,4,21,Sunday,6,325,0,372 days 12:46:02.344065,372
2,2022-03-11 23:01:00,2022,3,March,1,11,Friday,4,70,1,261 days 16:15:02.344065,261


In [61]:
df['time'] = df['Date'].dt.time
df['hour'] = df['Date'].dt.hour
df['minute'] = df['Date'].dt.minute
df['second'] = df['Date'].dt.second

In [62]:
df.iloc[:, -4:]

Unnamed: 0,time,hour,minute,second
0,00:22:00,0,22,0
1,02:30:00,2,30,0
2,23:01:00,23,1,0
