In [64]:
import pandas as pd
import numpy as np

def add_noise_to_data() -> pd.DataFrame:
    raw_data = pd.read_csv('csv_data/DailyDelhiClimate.csv')

    #wrong data types
    wrong_data_types = pd.DataFrame(['a', 'b', 'c', 'd', 'e'], index=raw_data.columns).T

    #wrong data values
    wrong_data_values = pd.DataFrame([9999, -999, -1000, 1000, -550], index=raw_data.columns).T

    #duplicates
    duplicates = raw_data.iloc[:10, :]

    #missing values
    missing_values = pd.DataFrame([np.nan, np.nan, np.nan, np.nan, np.nan], index=raw_data.columns).T

    noisy_data = pd.concat([raw_data, wrong_data_types, wrong_data_values, duplicates, missing_values])
    noisy_data.reset_index(inplace=True, drop=True)
    noisy_data = noisy_data.reindex(np.random.permutation(noisy_data.index))
    noisy_data.reset_index(inplace=True, drop=True)
    noisy_data.to_pickle('csv_data/RawDailyDelhiClimate.pkl')

add_noise_to_data()

# Pandas: Data Preparation
In this section we're going to cover different data pre processing techniques to clean the data for data science purposes.
For the first exercise take a look at `dropna()` and `drop_duplicates()` in the Pandas library.

In [65]:
#Data Ingestion

raw_data = pd.read_pickle('csv_data/RawDailyDelhiClimate.pkl')
raw_data

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2017-04-03,30.5,29.75,6.9375,1004.25
1,2017-01-13,13.235294,67.058824,6.435294,1017.529412
2,2017-01-14,13.2,74.28,5.276,1018.84
3,2017-03-28,29.888889,40.666667,8.844444,1009.0
4,2017-04-19,33.5,24.125,9.025,1000.875
...,...,...,...,...,...
122,2017-02-28,23.333333,51.666667,3.911111,1013.111111
123,2017-01-03,17.111111,81.888889,4.016667,1018.333333
124,2017-02-27,19.875,58.375,5.1,1014.25
125,2017-02-01,15.25,78.625,5.1,1017.5


Exercise 1: Basic cleaning
1. What kind of data inconsistencies can you spot?
2. Delete missing values from the data set.
3. Delete duplicate values from the data set.

In [66]:
#Write your solutions for exercise 1 here.

#1. There are missing values, duplicates, wrong data types, and wrong data values

#2
clean_data = raw_data.dropna().copy(deep=True)

#3.
clean_data.drop_duplicates(inplace=True)
clean_data

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2017-04-03,30.5,29.75,6.9375,1004.25
1,2017-01-13,13.235294,67.058824,6.435294,1017.529412
2,2017-01-14,13.2,74.28,5.276,1018.84
3,2017-03-28,29.888889,40.666667,8.844444,1009.0
4,2017-04-19,33.5,24.125,9.025,1000.875
...,...,...,...,...,...
120,2017-04-18,34.0,27.333333,7.811111,1003.111111
122,2017-02-28,23.333333,51.666667,3.911111,1013.111111
124,2017-02-27,19.875,58.375,5.1,1014.25
125,2017-02-01,15.25,78.625,5.1,1017.5


Exercise 2: Intermediate cleaning

1. Write a function that includes the cleaning steps from the previous exercise, that takes a data frame as input and returns a cleaner data frame.
2. Solve the other data inconsistencies in the data set and include this solution in the function you just wrote.
3. Write the data as a csv named 'Clean{YOUR NAME}DailyDelhiClimate'.

In [67]:
#Write your solutions for exercise 2 here.

#1. 
def clean_data_func_1(raw_df: pd.DataFrame) -> pd.DataFrame:
    cleaned_data = raw_df.dropna()#.copy(deep=True)
    cleaned_data.drop_duplicates(inplace=True)
    return cleaned_data

#2. 
def clean_data_func_2(raw_df: pd.DataFrame) -> pd.DataFrame:
    cleaned_data = raw_df.dropna()#.copy(deep=True)
    cleaned_data.drop_duplicates(inplace=True)

    #clean wrong data types
    wrong_data_types = cleaned_data['meantemp'].apply(lambda x: type(x) != str)         #note that if you want to do it thoroughly you have to do this for every column. But the noise data is created as entire rows, so this works
    cleaned_data = cleaned_data.loc[wrong_data_types]

    #clean wrong data values
    wrong_data_values = cleaned_data['meantemp'] > 0
    cleaned_data = cleaned_data.loc[wrong_data_values]

    return cleaned_data

#3.
clean_data = clean_data_func_2(raw_df=raw_data)
clean_data.to_csv('CleanSjenkieDailyDelhiClimate.csv')
clean_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data.drop_duplicates(inplace=True)


Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2017-04-03,30.5,29.75,6.9375,1004.25
1,2017-01-13,13.235294,67.058824,6.435294,1017.529412
2,2017-01-14,13.2,74.28,5.276,1018.84
3,2017-03-28,29.888889,40.666667,8.844444,1009.0
4,2017-04-19,33.5,24.125,9.025,1000.875
...,...,...,...,...,...
120,2017-04-18,34.0,27.333333,7.811111,1003.111111
122,2017-02-28,23.333333,51.666667,3.911111,1013.111111
124,2017-02-27,19.875,58.375,5.1,1014.25
125,2017-02-01,15.25,78.625,5.1,1017.5


Exercise 3: Advanced Data quality check
We've prepared a data set that is already clean. Now we use this prepped data set a double check.

1. Read BOTH the prepped data csv file and the cleaned data set csv file that you just created. Do you notice anything different? If so, do you understand where it comes from?
2. Create a solution that solves the difference in the first question of this exercise.
3. Create a function that checks if the two data sets are the same or not.
4. Create a nice weather function that adds a new column to the existing data set.
5. Save the resulting data set.

In [99]:
#Write your solutions for exercise 3 here.

#1. When writing dataframes as a csv file and re-reading the same csv file an extra column is introduced called Unnamed:0, this column indicates the column
prepped_data = pd.read_csv('csv_data/DailyDelhiClimate.csv')
cleaned_data = pd.read_csv('CleanSjenkieDailyDelhiClimate.csv')

print(prepped_data)
print(cleaned_data)

# #2.
def solve_difference(data: pd.DataFrame) -> bool:
    data = data.sort_values('date', ignore_index=True).copy(deep=True)
    data = data.drop('Unnamed: 0', axis=1)
    return data

solved_data = solve_difference(data=cleaned_data)
print(solved_data)       # This should return an error

#3. There are several ways to solve this. I suggest two options. 
# The first one is cleaning the re-read data by deleting the extra column. 
# The second one is writing the data as a pkl file instead of csv (This one I prefer)

def check_same(made_df: pd.DataFrame, test_df: pd.DataFrame) -> float:
    made_df = solve_difference(data=made_df)
    made_df.sort_values('date', inplace=True, ignore_index=True)
    same_df = made_df == test_df
    
    n_same_values = same_df.sum().sum()
    n_values = test_df.shape[0] * test_df.shape[1]
    
    # print(made_df.loc[(made_df != test_df).values] == test_df.loc[(made_df != test_df).values])
    # print(test_df.loc[(made_df != test_df).values]['wind_speed'].iloc[0])
    # print(made_df.loc[(made_df != test_df).values]['wind_speed'].iloc[0])
    # print(n_values)
    # print(n_same_values)

    return n_same_values/n_values

#Note that the value below is not 100%, because of a rounding error uncomment the print statements in the function above to see what I mean
print(f'Data is {check_same(made_df=cleaned_data, test_df=prepped_data) * 100}% similar') 


#4. Nice weather function and add this to the data set
def nice_weather(data: pd.DataFrame, nice_temp: float, nice_humidity: float, nice_wind_speed: float, nice_pressure: float) -> pd.DataFrame:
    temp_data = data.copy(deep=True)
    temp_data['nice_weather'] = np.nan
    for i, row in temp_data.iterrows():
        if row['meantemp'] > nice_temp and row['humidity'] > nice_humidity and row['wind_speed'] < nice_wind_speed and row['meanpressure'] < nice_pressure:
            temp_data.loc[i,'nice_weather'] = True
        else:
            temp_data.loc[i,'nice_weather'] = False
    return temp_data

enriched_data = nice_weather(data=cleaned_data,
                             nice_temp=0,
                             nice_humidity=0,
                             nice_wind_speed=1000,
                             nice_pressure=1010)
print('ENRICHED DATA')
print(enriched_data)

#5.
enriched_data.to_csv('enriched_data.csv')


           date   meantemp   humidity  wind_speed  meanpressure
0    2017-01-01  15.913043  85.869565    2.743478     59.000000
1    2017-01-02  18.500000  77.222222    2.894444   1018.277778
2    2017-01-03  17.111111  81.888889    4.016667   1018.333333
3    2017-01-04  18.700000  70.050000    4.545000   1015.700000
4    2017-01-05  18.388889  74.944444    3.300000   1014.333333
..          ...        ...        ...         ...           ...
109  2017-04-20  34.500000  27.500000    5.562500    998.625000
110  2017-04-21  34.250000  39.375000    6.962500    999.875000
111  2017-04-22  32.900000  40.900000    8.890000   1001.600000
112  2017-04-23  32.875000  27.500000    9.962500   1002.125000
113  2017-04-24  32.000000  27.142857   12.157143   1004.142857

[114 rows x 5 columns]
     Unnamed: 0        date   meantemp   humidity  wind_speed  meanpressure
0             0  2017-04-03  30.500000  29.750000    6.937500   1004.250000
1             1  2017-01-13  13.235294  67.058824    6.4