In [15]:
import pandas as pd
import numpy as np

def add_noise_to_data() -> pd.DataFrame:
    """Build a deliberately dirty copy of the Delhi climate data.

    Reads ``csv_data/DailyDelhiClimate.csv``, appends four kinds of bad rows
    (strings in every column, implausible numbers, repeats of the first ten
    rows and one all-NaN row), shuffles the row order and pickles the result
    to ``csv_data/RawDailyDelhiClimate.pkl``.

    The literal noise rows hold five values each, so the source csv must have
    exactly five columns.

    Returns:
        The shuffled noisy frame with a fresh 0..n-1 index (the same frame
        that is written to the pickle).
    """
    raw_data = pd.read_csv('csv_data/DailyDelhiClimate.csv')
    columns = raw_data.columns

    def single_row(values: list) -> pd.DataFrame:
        # One-row frame whose columns line up with the source data.
        return pd.DataFrame(values, index=columns).T

    # wrong data types: strings where numbers/dates are expected
    wrong_data_types = single_row(['a', 'b', 'c', 'd', 'e'])

    # wrong data values: numbers far outside any plausible range
    wrong_data_values = single_row([9999, -999, -1000, 1000, -550])

    # duplicates: repeat the first ten source rows
    duplicates = raw_data.iloc[:10, :]

    # missing values: a row that is NaN in every column
    missing_values = single_row([np.nan] * 5)

    # ignore_index gives the combined frame a clean 0..n-1 index to permute
    noisy_data = pd.concat(
        [raw_data, wrong_data_types, wrong_data_values, duplicates, missing_values],
        ignore_index=True,
    )

    # shuffle so the noise rows are not all sitting at the bottom
    shuffled_order = np.random.permutation(noisy_data.index)
    noisy_data = noisy_data.reindex(shuffled_order).reset_index(drop=True)

    noisy_data.to_pickle('csv_data/RawDailyDelhiClimate.pkl')
    return noisy_data

# Generate the noisy pickle (csv_data/RawDailyDelhiClimate.pkl) that the exercises below load.
add_noise_to_data()

# Pandas: Data Preparation
In this section we're going to cover different data preprocessing techniques to clean the data for data science purposes.

In [16]:
# Data ingestion: load the noisy data set that add_noise_to_data() pickled.

raw_data = pd.read_pickle('csv_data/RawDailyDelhiClimate.pkl')
# Display the frame as the cell output so the inconsistencies can be inspected.
raw_data

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2017-01-28,14.863636,82.772727,9.690909,1019.636364
1,2017-02-01,15.25,78.625,5.1,1017.5
2,2017-01-05,18.388889,74.944444,3.3,1014.333333
3,2017-01-03,17.111111,81.888889,4.016667,1018.333333
4,2017-03-18,24.692308,46.307692,7.123077,1009.846154
...,...,...,...,...,...
122,2017-02-11,15.625,64.0,3.95,1016.625
123,2017-01-10,12.111111,71.944444,9.361111,1016.888889
124,2017-03-07,22.625,41.5,6.025,1007.375
125,2017-04-13,29.666667,29.111111,4.944444,1006.777778


Exercise 1: Basic cleaning
1. What kind of data inconsistencies can you spot?
2. Delete missing values from the data set
3. Delete duplicate values from the data set

In [18]:
#Write your solutions for exercise 1 here.

#1. There are missing values, duplicates, wrong data types, and wrong data values

#2 and #3: both steps return a new frame, so raw_data itself stays untouched
clean_data = (
    raw_data
    .dropna()             #2. remove rows with at least one missing value
    .drop_duplicates()    #3. remove rows that repeat an earlier row
)
clean_data

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2017-01-28,14.863636,82.772727,9.690909,1019.636364
1,2017-02-01,15.25,78.625,5.1,1017.5
2,2017-01-05,18.388889,74.944444,3.3,1014.333333
3,2017-01-03,17.111111,81.888889,4.016667,1018.333333
4,2017-03-18,24.692308,46.307692,7.123077,1009.846154
...,...,...,...,...,...
121,2017-02-07,15.125,63.75,7.6375,1016.125
122,2017-02-11,15.625,64.0,3.95,1016.625
124,2017-03-07,22.625,41.5,6.025,1007.375
125,2017-04-13,29.666667,29.111111,4.944444,1006.777778


Exercise 2: Intermediate cleaning

1. Write a function that includes the cleaning steps from the previous exercise, that takes a data frame as input and returns a cleaner data frame
2. Solve the other data inconsistencies in the data set and include this solution in the function you just wrote
3. Write the data as a csv named 'Clean{YOUR NAME}DailyDelhiClimate'

In [20]:
#Write your solutions for exercise 2 here.

#1. 
def clean_data_func_1(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without incomplete or repeated rows.

    Args:
        raw_df: The frame to clean; it is not modified.

    Returns:
        The rows of ``raw_df`` that contain no missing value, with later
        repeats of an identical row removed. Original index labels are kept.
    """
    without_missing = raw_df.dropna()
    deduplicated = without_missing.drop_duplicates()
    return deduplicated

#2. 
def clean_data_func_2(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Clean a noisy climate frame: missing, duplicate, mistyped and implausible rows.

    Steps, in order:
      1. drop rows that contain a missing value;
      2. drop rows that repeat an earlier row;
      3. drop rows whose ``meantemp`` is a string;
      4. drop rows whose ``meantemp`` is not strictly positive;
      5. re-infer column dtypes, so columns that only hold numbers after the
         bad rows are gone become numeric instead of staying ``object``.

    Only ``meantemp`` is inspected in steps 3 and 4. That is enough when the
    noise was added as whole rows; a thorough check would look at every column.

    Args:
        raw_df: The frame to clean; it is not modified. Must have a
            ``meantemp`` column.

    Returns:
        A new frame holding the surviving rows under their original index labels.
    """
    cleaned_data = raw_df.dropna().drop_duplicates()

    # Strings have to go first: comparing a str with 0 in the next step would raise.
    # astype(bool) keeps the mask boolean even when the frame is empty.
    has_valid_type = (
        cleaned_data['meantemp']
        .map(lambda value: not isinstance(value, str))
        .astype(bool)
    )
    cleaned_data = cleaned_data.loc[has_valid_type]

    # Keep only rows whose mean temperature is strictly positive.
    has_valid_value = cleaned_data['meantemp'] > 0
    cleaned_data = cleaned_data.loc[has_valid_value]

    # The mixed-type noise rows forced the columns to object dtype; restore numeric dtypes.
    return cleaned_data.infer_objects()

#3. Clean the noisy frame and save the result as a csv in the working directory.
clean_data = clean_data_func_2(raw_df=raw_data)
# NOTE: to_csv also writes the row index as an unnamed first column by default.
clean_data.to_csv('CleanSjenkieDailyDelhiClimate.csv')
clean_data

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2017-01-28,14.863636,82.772727,9.690909,1019.636364
1,2017-02-01,15.25,78.625,5.1,1017.5
2,2017-01-05,18.388889,74.944444,3.3,1014.333333
3,2017-01-03,17.111111,81.888889,4.016667,1018.333333
4,2017-03-18,24.692308,46.307692,7.123077,1009.846154
...,...,...,...,...,...
121,2017-02-07,15.125,63.75,7.6375,1016.125
122,2017-02-11,15.625,64.0,3.95,1016.625
124,2017-03-07,22.625,41.5,6.025,1007.375
125,2017-04-13,29.666667,29.111111,4.944444,1006.777778


Exercise 3: Advanced Data quality check
We've prepared a data set that is already clean. Now we use this prepped data set as a double check.

1. Read BOTH the prepped data csv file and the cleaned data set csv file that you just created. Do you notice anything different?
2. Create a function that checks if the two data sets are the same or not
3. Create a solution that solves the difference in the first question of this exercise
4. Save the cleaned data set, incorporating the answer to question 3 of this exercise.

In [None]:
#Write your solutions for exercise 3 here.

In [3]:
# Load the original csv, the same file add_noise_to_data() reads as its source.
# NOTE(review): this rebinds raw_data, which the ingestion cell above bound to the
# noisy pickle; cells that expect the noisy frame must not be re-run after this one.
raw_data = pd.read_csv('csv_data/DailyDelhiClimate.csv')
raw_data


Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2017-01-01,15.913043,85.869565,2.743478,59.000000
1,2017-01-02,18.500000,77.222222,2.894444,1018.277778
2,2017-01-03,17.111111,81.888889,4.016667,1018.333333
3,2017-01-04,18.700000,70.050000,4.545000,1015.700000
4,2017-01-05,18.388889,74.944444,3.300000,1014.333333
...,...,...,...,...,...
109,2017-04-20,34.500000,27.500000,5.562500,998.625000
110,2017-04-21,34.250000,39.375000,6.962500,999.875000
111,2017-04-22,32.900000,40.900000,8.890000,1001.600000
112,2017-04-23,32.875000,27.500000,9.962500,1002.125000


# Exercise 1.
