# Processing Temperature Data

This notebook is used to fill the missing temperature data

## Import some libraries

In [1]:
import pandas as pd
import numpy as np
from pandas import Timestamp
from pandas.tseries.offsets import DateOffset
import unittest
from pathlib import Path

## Load the un-preprocessed data

In [None]:
states =["VIC","NSW","NT","QLD","SA","WA","TAS"]

In [2]:
weather_data = pd.read_csv('temperature_data/VIC/output_final.csv')

Also load the table containing the starting date and ending date of each station

In [3]:
selected_station=pd.read_csv('Combined_temperature_data/VIC/selected_station.csv')
#Convert the string of date to datetime type
selected_station['Selected starting date'] = pd.to_datetime(selected_station['Selected starting date'])
selected_station['Selected ending date'] = pd.to_datetime(selected_station['Selected ending date'])

### Change and modify the loaded data frame

Drop unused columns and rename some columns for convention

In [4]:
weather_data=weather_data.drop(columns=['Unnamed: 0','Unnamed: 0.1'])

In [5]:
weather_data=weather_data.rename(columns={'Bureau of Meteorology station number':'Station Number'})

We only choose the data from selected stations list file

In [6]:
weather_data=weather_data[weather_data['Station Number'].isin(selected_station['Station Number'])]

In [7]:
weather_data

Unnamed: 0,Product code_x,Station Number,Year,Month,Day,Maximum temperature (Degree C),Days of accumulation of maximum temperature,Quality_x,Product code_y,Minimum temperature (Degree C),Days of accumulation of minimum temperature,Quality_y,stationname,localgovernmentarea,latitude,longtitude
0,IDCJAC0010,76031,1946,1,1,,,,IDCJAC0011,,,,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867
1,IDCJAC0010,76031,1946,1,2,,,,IDCJAC0011,,,,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867
2,IDCJAC0010,76031,1946,1,3,,,,IDCJAC0011,,,,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867
3,IDCJAC0010,76031,1946,1,4,,,,IDCJAC0011,,,,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867
4,IDCJAC0010,76031,1946,1,5,,,,IDCJAC0011,,,,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1217508,IDCJAC0010,90186,2020,8,2,15.0,1.0,N,IDCJAC0011,5.8,1.0,N,WARRNAMBOOL AIRPORT NDB,Warrnambool,-38.2867,142.4522
1217509,IDCJAC0010,90186,2020,8,3,12.9,1.0,N,IDCJAC0011,5.9,1.0,N,WARRNAMBOOL AIRPORT NDB,Warrnambool,-38.2867,142.4522
1217510,IDCJAC0010,90186,2020,8,4,10.6,1.0,N,IDCJAC0011,2.5,1.0,N,WARRNAMBOOL AIRPORT NDB,Warrnambool,-38.2867,142.4522
1217511,IDCJAC0010,90186,2020,8,5,11.0,1.0,N,IDCJAC0011,5.6,1.0,N,WARRNAMBOOL AIRPORT NDB,Warrnambool,-38.2867,142.4522


Let see how many NaN Maximum temperature row

In [8]:
len(weather_data[np.isnan(weather_data['Minimum temperature (Degree C)'])==True])

71269

### Remove temperature data out of range

We will use the data from 2000

In [9]:
weather_data=weather_data[weather_data['Year']>=2000]

Finding how many missing temperature we need to fill (for maximum temperature data)

In [17]:
len(weather_data[np.isnan(weather_data['Maximum temperature (Degree C)'])==True])

12871

Finding how many missing temperature we need to fill (for minimum temperature data)

In [18]:
len(weather_data[np.isnan(weather_data['Minimum temperature (Degree C)'])==True])

13951

In [19]:
weather_data['Date Time']=pd.to_datetime(weather_data['Year'].astype(str)+'-'+weather_data['Month'].astype(str)+'-'+weather_data['Day'].astype(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Calculate the mean min and max temperature based on station and month

In [20]:
mean_max_temp=weather_data.groupby(['Station Number','Month'])['Maximum temperature (Degree C)'].mean().reset_index()
mean_min_temp=weather_data.groupby(['Station Number','Month'])['Minimum temperature (Degree C)'].mean().reset_index()

In [21]:
for index,row in selected_station.iterrows():
    station_no=row['Station Number']
    end_date = row['Selected ending date']
    weather_data=weather_data.drop(weather_data[(weather_data['Station Number']==station_no) & (pd.to_datetime(weather_data['Date Time'])>pd.to_datetime(end_date))].index)
    

## Generate missing date 

Since some stations may start recording the data after 2000 so we need to generate date to fill the gap and treat min/max temperature data as missing value

In [22]:
temp_data = pd.DataFrame()
for station_number in selected_station['Station Number']:
    # Declare output dataframe
    station_temp = weather_data[weather_data['Station Number']==station_number]
    if selected_station[selected_station['Station Number']==station_number]['Selected starting date'].values[0] < pd.to_datetime(selected_station[selected_station['Station Number']==station_number]['Starting Date recording min'].values[0]):
        #Use date_range to generate date for a given range
        idx = pd.date_range(pd.to_datetime( selected_station[selected_station['Station Number']==station_number]['Selected starting date'].values[0]),pd.to_datetime( selected_station[selected_station['Station Number']==station_number]['Selected ending date'].values[0]))
        station_temp=station_temp.set_index('Date Time').reindex(idx).reset_index().rename(columns={'index':'Date'})
        #File the missing Station number, product code,... (except max and min temperature)
        station_temp['Station Number']=station_temp['Station Number'].fillna(station_number)
        station_temp['Product code_x']=station_temp['Product code_x'].fillna(weather_data[weather_data['Station Number']==station_number]['Product code_x'].values[0])
        station_temp['stationname']=station_temp['stationname'].fillna(weather_data[weather_data['Station Number']==station_number]['stationname'].values[0])
        station_temp['latitude']=station_temp['latitude'].fillna(weather_data[weather_data['Station Number']==station_number]['latitude'].values[0])
        station_temp['longtitude']=station_temp['longtitude'].fillna(weather_data[weather_data['Station Number']==station_number]['longtitude'].values[0])
        station_temp['localgovernmentarea']=station_temp['localgovernmentarea'].fillna(weather_data[weather_data['Station Number']==station_number]['localgovernmentarea'].values[0])
    temp_data=pd.concat([temp_data,station_temp])

In [23]:
temp_data['Date Time']=temp_data['Date Time'].fillna(temp_data['Date'])
temp_data=temp_data.drop(columns=['Date'])
temp_data['Month'] = temp_data['Date Time'].dt.month

In [25]:
temp_data

Unnamed: 0,Product code_x,Station Number,Year,Month,Day,Maximum temperature (Degree C),Days of accumulation of maximum temperature,Quality_x,Product code_y,Minimum temperature (Degree C),Days of accumulation of minimum temperature,Quality_y,stationname,localgovernmentarea,latitude,longtitude,Date Time
19723,IDCJAC0010,76031.0,2000.0,1,1.0,24.9,1.0,Y,IDCJAC0011,10.7,1.0,Y,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867,2000-01-01
19724,IDCJAC0010,76031.0,2000.0,1,2.0,30.9,1.0,Y,IDCJAC0011,12.5,1.0,Y,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867,2000-01-02
19725,IDCJAC0010,76031.0,2000.0,1,3.0,34.8,1.0,Y,IDCJAC0011,17.7,1.0,Y,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867,2000-01-03
19726,IDCJAC0010,76031.0,2000.0,1,4.0,24.3,1.0,Y,IDCJAC0011,13.0,1.0,Y,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867,2000-01-04
19727,IDCJAC0010,76031.0,2000.0,1,5.0,25.6,1.0,Y,IDCJAC0011,9.7,1.0,Y,MILDURA AIRPORT,Mildura Shire,-34.2358,142.0867,2000-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1217508,IDCJAC0010,90186.0,2020.0,8,2.0,15.0,1.0,N,IDCJAC0011,5.8,1.0,N,WARRNAMBOOL AIRPORT NDB,Warrnambool,-38.2867,142.4522,2020-08-02
1217509,IDCJAC0010,90186.0,2020.0,8,3.0,12.9,1.0,N,IDCJAC0011,5.9,1.0,N,WARRNAMBOOL AIRPORT NDB,Warrnambool,-38.2867,142.4522,2020-08-03
1217510,IDCJAC0010,90186.0,2020.0,8,4.0,10.6,1.0,N,IDCJAC0011,2.5,1.0,N,WARRNAMBOOL AIRPORT NDB,Warrnambool,-38.2867,142.4522,2020-08-04
1217511,IDCJAC0010,90186.0,2020.0,8,5.0,11.0,1.0,N,IDCJAC0011,5.6,1.0,N,WARRNAMBOOL AIRPORT NDB,Warrnambool,-38.2867,142.4522,2020-08-05


### Find the missing after generating missing date

Find out missing maximum temperature and number of missing values in maximum temperature

In [26]:
missing_max_data = temp_data[np.isnan(temp_data['Maximum temperature (Degree C)'])==True]

In [27]:
len(missing_max_data)

34060

Find out missing minimum temperature and number of missing values in minimum temperature

In [28]:
missing_min_data= temp_data[np.isnan(temp_data['Minimum temperature (Degree C)'])==True]

In [29]:
len(missing_min_data)

35140

## Fill the missing temperature value

In [10]:
def filling_missing_temp(row,col_name,mean_temp):
    """
    Fill the missing min and max temperature  (based on column name). This function will be apply to each row of the original 
    data and fill the value if there is missing data.
    
    Parameters:
    -----------
        row( Row of DataFrame): Use apply function to feed each row of the dataframe into function
        col_name (string): Column we want to check and fill the missing value (Maximum Temperature or Minimum temperature)
        mean_temp (DataFrame): The mean temperature table based on station number and month that we calculate above
    
    Returns:
    --------
        (int):temperature data for a missing temperature value
    
    """
    if isinstance(col_name,str) == False:
        raise TypeError("Column name must be string")
    ## If the current row contain missing temperature value
    if np.isnan(row[col_name]) == True:
        #Get the current stationanumber
        station_no= row['Station Number']
        date = pd.to_datetime(row['Date Time'])
        month=row['Month']
        # Find the starting date and ending date of the station
        starting_date =  pd.to_datetime(selected_station[selected_station['Station Number'] == station_no]['Selected starting date'].values[0])
        ending_date = pd.to_datetime(selected_station[selected_station['Station Number'] == station_no]['Selected ending date'].values[0])
        #if the year of the current missing date == recording starting date of the station
        if date.strftime('%Y') ==  starting_date.strftime('%Y'):
            ## We find the next year temperature value on the same day
            date_offset=date + DateOffset(years=1)
            ## while loop until we find there is a temperature value
            while date_offset <= ending_date:
                nextyear_temp = temp_data[(temp_data['Station Number']==station_no) & (temp_data["Date Time"]==date_offset)][col_name].values[0]
                if np.isnan(nextyear_temp) == False:
                    return nextyear_temp
                date_offset=date_offset + DateOffset(years=1)
            # If we cannot find temperature on the same day we fill by the mean tempeture on the same month    
            return mean_temp.loc[(mean_temp['Station Number']==station_no) & (mean_temp['Month']==month)][col_name].values[0]
        
        #if the year of the current missing date == recording ending date of the station
        elif date.strftime('%Y') ==  ending_date.strftime('%Y'):
            ## We find the previous year temperature value on the same day
            date_offset=date - DateOffset(years=1)
            ## while loop until we find there is a temperature value
            while date_offset >= starting_date:
                prevyear_temp = temp_data[(temp_data['Station Number']==station_no) & (temp_data["Date Time"]==date_offset)][col_name].values[0]
                if np.isnan(prevyear_temp) == False:
                    return prevyear_temp
                date_offset=date_offset - DateOffset(years=1)
            # If we cannot find temperature on the same day we fill by the mean tempeture on the same month  
            return mean_temp.loc[(mean_temp['Station Number']==station_no) & (mean_temp['Month']==month)][col_name].values[0]
        
        #If the year is between starting year and ending year
        else:
            #find the next year temperature value on the same day
            date_offset=date + DateOffset(years=1)
            nextyear_null=True
            nextyear_temp=-1
            while date_offset <= ending_date:
                nextyear_temp = temp_data[(temp_data['Station Number']==station_no) & (temp_data["Date Time"]==date_offset)][col_name].values[0]
                if np.isnan(nextyear_temp) == False:
                    nextyear_null=False
                    break
                date_offset=date_offset + DateOffset(years=1)
            #find the previous year temperature value on the same day
            prevyear_temp=-1
            prevyear_null=True
            date_offset=date - DateOffset(years=1)
            while date_offset >= starting_date:
                prevyear_temp = temp_data[(temp_data['Station Number']==station_no) & (temp_data["Date Time"]==date_offset)][col_name].values[0]
                if np.isnan(prevyear_temp) == False:
                    prevyear_null=False
                    break
                date_offset=date_offset - DateOffset(years=1)
            #Check if these values are null
            #if prev value and next value are null then fill the missing value by the mean temperature
            if prevyear_null== True and nextyear_null== True:
                return mean_temp.loc[(mean_temp['Station Number']==station_no) & (mean_temp['Month']==month)][col_name].values[0]
            #if one of them is not null
            elif prevyear_null ==True and nextyear_null == False:
                return nextyear_temp
            elif prevyear_null ==False and nextyear_null == True:
                return prevyear_temp
            #if both of them are not null, we take the mean of 2 values 
            else:
                retun_tem=(nextyear_temp+prevyear_temp)/2
                return retun_tem
    # if there is no missing data, return the current value
    else:
        return row[col_name]
        
    

Instead of using for loop, we can use apply function from pandas (slightly increase performance and reduce the time complexity) <br />
Each row of the data frame will be passed into filling_missing_temp to check and fill the missing data (if there is)

In [265]:
temp_data['Maximum temperature (Degree C)']=temp_data.apply(lambda row : filling_missing_temp(row,'Maximum temperature (Degree C)',mean_max_temp),axis=1)

In [266]:
temp_data['Minimum temperature (Degree C)']=temp_data.apply(lambda row : filling_missing_temp(row,'Minimum temperature (Degree C)',mean_min_temp),axis=1)

## Unit testing 

In [83]:
class ProcessingDataTesting(unittest.TestCase):
    """
    
    Testing class to check the data after preprocessing data
    
    :param data_df: A dataframe contains temperature data after adding date

    
    """
    def __init__(self,test_name,data_df,mean_min_temp,mean_max_temp):
        """
        Parameters:
        -----------
            test_name: name of each test function
            data_df: dataframe contains the temperature data and corresponding date
            mean_min_temp/mean_max_temp: dataframe contains the mean min/max temperature based on the station number and month
        """
        super(ProcessingDataTesting, self).__init__(test_name)
        self.data_df = data_df
        self.mean_min_temp = mean_min_temp
        self.mean_max_temp = mean_max_temp
        
    def test_range_of_date(self):
        """
        Check starting date of all station (make sure all station start recording from 2000
        
        """
        station_number = self.data_df["Station Number"].unique()
        starting_from_2000 = True
        error_station =[]
        for each_station in station_number:
            if pd.to_datetime(self.data_df[self.data_df["Station Number"]==each_station]["Date Time"].values[0]).strftime('%Y') != "2000":
                starting_from_2000 = False
                error_station.append(each_station)
        message="Stations: "
        for station in error_station:
            message += str(station) +" "
            
        message += " not start from 2000"
                
        
        self.assertEqual(starting_from_2000,True,message)
        
    def test_n_missing_data_after_processing(self):
        """
        Testing the filling missing function with one station
        
        """
        station_number = self.data_df["Station Number"].unique()[0]
        
        one_station_data = self.data_df[self.data_df["Station Number"]==station_number]
        one_station_data['Minimum temperature (Degree C)']=one_station_data.apply(lambda row : filling_missing_temp(row,'Minimum temperature (Degree C)',self.mean_min_temp),axis=1)
        one_station_data['Maximum temperature (Degree C)']=one_station_data.apply(lambda row : filling_missing_temp(row,'Maximum temperature (Degree C)',self.mean_max_temp),axis=1)
        
        missing_max_data = one_station_data[np.isnan(one_station_data['Maximum temperature (Degree C)'])==True]
        missing_min_data = one_station_data[np.isnan(one_station_data['Minimum temperature (Degree C)'])==True]
        
        n_min_missing_message="There are " + str(missing_min_data) + " missing min temp rows"
        n_max_missing_message="There are " + str(missing_max_data) + " missing max temp rows"
        
        self.assertEqual(len(missing_max_data),0,n_max_missing_message)
        self.assertEqual(len(missing_min_data),0,n_min_missing_message)

test_loader = unittest.TestLoader()
test_names = test_loader.getTestCaseNames(ProcessingDataTesting)
test_suite= unittest.TestSuite()
for test_name in test_names:
    test_suite.addTest(ProcessingDataTesting(test_name,temp_data,mean_min_temp,mean_max_temp))

    

In [84]:
result = unittest.TextTestRunner().run(test_suite)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
..
----------------------------------------------------------------------
Ran 2 tests in 0.823s

OK


### Write the preprocessed data into a csv file

In [241]:
temp_data.to_csv('Maximum _ Minimum Temperature Data/VIC/wa_temperature_data.csv',index=False)

## Preprocess data for other states

In [None]:
# Preprocessing 

for state in states:
    temperature_file = "temperature_data/"+state+"/output_final.csv"
    information_file ="Combined_temperature_data/" + state + "/selected_station.csv"
    weather_data = pd.read_csv(temperature_file)
    selected_station=pd.read_csv(information_file)
    #Convert the string of date to datetime type
    selected_station['Selected starting date'] = pd.to_datetime(selected_station['Selected starting date'])
    selected_station['Selected ending date'] = pd.to_datetime(selected_station['Selected ending date'])
    weather_data=weather_data.drop(columns=['Unnamed: 0','Unnamed: 0.1'])
    weather_data=weather_data.rename(columns={'Bureau of Meteorology station number':'Station Number'})
    weather_data=weather_data[weather_data['Station Number'].isin(selected_station['Station Number'])]
    weather_data=weather_data[weather_data['Year']>=2000]
    weather_data['Date Time']=pd.to_datetime(weather_data['Year'].astype(str)+'-'+weather_data['Month'].astype(str)+'-'+weather_data['Day'].astype(str))
    #Calculate Mean temperature
    mean_max_temp=weather_data.groupby(['Station Number','Month'])['Maximum temperature (Degree C)'].mean().reset_index()
    mean_min_temp=weather_data.groupby(['Station Number','Month'])['Minimum temperature (Degree C)'].mean().reset_index()
    
    for index,row in selected_station.iterrows():
    station_no=row['Station Number']
    end_date = row['Selected ending date']
    weather_data=weather_data.drop(weather_data[(weather_data['Station Number']==station_no) & (pd.to_datetime(weather_data['Date Time'])>pd.to_datetime(end_date))].index)
    
    #Generate missing for temperature data
    temp_data = pd.DataFrame()
    for station_number in selected_station['Station Number']:
    # Declare output dataframe
    station_temp = weather_data[weather_data['Station Number']==station_number]
    if selected_station[selected_station['Station Number']==station_number]['Selected starting date'].values[0] < pd.to_datetime(selected_station[selected_station['Station Number']==station_number]['Starting Date recording min'].values[0]):
        #Use date_range to generate date for a given range
        idx = pd.date_range(pd.to_datetime( selected_station[selected_station['Station Number']==station_number]['Selected starting date'].values[0]),pd.to_datetime( selected_station[selected_station['Station Number']==station_number]['Selected ending date'].values[0]))
        station_temp=station_temp.set_index('Date Time').reindex(idx).reset_index().rename(columns={'index':'Date'})
        #File the missing Station number, product code,... (except max and min temperature)
        station_temp['Station Number']=station_temp['Station Number'].fillna(station_number)
        station_temp['Product code_x']=station_temp['Product code_x'].fillna(weather_data[weather_data['Station Number']==station_number]['Product code_x'].values[0])
        station_temp['stationname']=station_temp['stationname'].fillna(weather_data[weather_data['Station Number']==station_number]['stationname'].values[0])
        station_temp['latitude']=station_temp['latitude'].fillna(weather_data[weather_data['Station Number']==station_number]['latitude'].values[0])
        station_temp['longtitude']=station_temp['longtitude'].fillna(weather_data[weather_data['Station Number']==station_number]['longtitude'].values[0])
        station_temp['localgovernmentarea']=station_temp['localgovernmentarea'].fillna(weather_data[weather_data['Station Number']==station_number]['localgovernmentarea'].values[0])
    temp_data=pd.concat([temp_data,station_temp])
    
    #Change column name
    temp_data['Date Time']=temp_data['Date Time'].fillna(temp_data['Date'])
    temp_data=temp_data.drop(columns=['Date'])
    temp_data['Month'] = temp_data['Date Time'].dt.month
    
    #Find the missing data
    missing_max_data = temp_data[np.isnan(temp_data['Maximum temperature (Degree C)'])==True]
    missing_min_data= temp_data[np.isnan(temp_data['Minimum temperature (Degree C)'])==True]
    
    temp_data['Minimum temperature (Degree C)']=temp_data.apply(lambda row : filling_missing_temp(row,'Minimum temperature (Degree C)',mean_min_temp),axis=1)
    temp_data['Maximum temperature (Degree C)']=temp_data.apply(lambda row : filling_missing_temp(row,'Maximum temperature (Degree C)',mean_max_temp),axis=1)
    
    ouput_file = "Maximum _ Minimum Temperature Data/"+state+"/wa_temperature_data.csv"
    temp_data.to_csv(ouput_file,index=False)