In [1]:
import pandas as pd
import numpy as np

In [9]:
# Location of the 2022 daily weather CSV on the local machine
weather_data_path = 'D:\\File_auto\\0_UCL_CASA\\OneDrive - University College London\\Xiaoyi_dissertation\\Analysis\\Data\\meteostat\\meteostat2022.csv'

# Decode with 'utf-8-sig' so that a leading byte-order mark, if present,
# is stripped instead of being glued onto the first column name
weather_df = pd.read_csv(weather_data_path, encoding='utf-8-sig')

# Sanity check: preview the first rows, then the parsed column labels
print(weather_df.head(), weather_df.columns, sep='\n')


         date  tavg  tmin  tmax  prcp  snow   wdir  wspd  wpgt    pres  tsun
0  2022-01-01  11.8  10.5  13.1   NaN   NaN  202.0  17.7  35.2  1017.5   NaN
1  2022-01-02  10.5   8.5  11.5   NaN   NaN  229.0  22.2  40.8  1010.3   NaN
2  2022-01-03   8.3   7.1   9.7   NaN   NaN  232.0  19.0  38.9  1007.8   NaN
3  2022-01-04   3.8   0.2   7.0   NaN   NaN  295.0  13.5  31.5  1001.7   NaN
4  2022-01-05   1.6  -0.4   4.6   NaN   NaN  296.0  17.5  37.0  1013.4   NaN
Index(['date', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt',
       'pres', 'tsun'],
      dtype='object')


In [11]:
def polynomial_interpolation(df, date_col, value_cols, max_degree=3):
    """Fill gaps in daily series with values from a fitted polynomial trend.

    For every column in ``value_cols`` a single least-squares polynomial
    (``np.polyfit``) is fitted to all observed (date, value) pairs, with the
    date expressed as whole days since the earliest date. The polynomial is
    evaluated on every calendar day between the earliest and latest date;
    those estimates are used only where the original value is missing, so
    observed values are never overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. It is not modified; the function works on a copy.
    date_col : str
        Name of the column holding dates in ``YYYY-MM-DD`` form.
    value_cols : list of str
        Columns whose missing entries should be filled.
    max_degree : int, default 3
        Upper bound on the polynomial degree. The degree actually used is
        ``min(max_degree, number_of_observations - 1)``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``date_col`` parsed to datetimes, one row for
        every day in the observed date span (days absent from ``df`` are
        added by an outer merge, with NaN in columns that were not
        interpolated), and gaps in ``value_cols`` filled. A column with
        fewer than two observations, or whose fit fails, is left unfilled.
        If ``df`` has no valid dates the parsed copy is returned as is.

    Notes
    -----
    Estimates are clamped from below at the smallest observed value of the
    column (there is no upper clamp) and rounded to 2 decimal places.
    """
    # Work on a copy so the caller's DataFrame keeps its original date column
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col], format='%Y-%m-%d')  # Adjust the date format
    min_date = df[date_col].min()
    max_date = df[date_col].max()

    # Without a single valid date there is no range to build; pd.date_range
    # would raise on NaT bounds, so hand back the (parsed) copy unchanged.
    if pd.isna(min_date):
        return df

    # One row per calendar day across the observed span
    full_df = pd.DataFrame({date_col: pd.date_range(start=min_date, end=max_date, freq='D')})

    # Day offsets for the whole span are the same for every column: compute once
    all_day_offsets = (full_df[date_col] - min_date).dt.days.values

    for col in value_cols:
        # Fit only on rows that have both a value and a usable date;
        # a NaT date would turn into a NaN abscissa and break the fit.
        non_null_data = df.dropna(subset=[date_col, col])
        x = (non_null_data[date_col] - min_date).dt.days.values
        y = non_null_data[col].values

        # A degree-d polynomial needs at least d + 1 points
        degree = min(max_degree, len(x) - 1)
        if degree < 1:
            full_df[col] = np.nan
            continue

        try:
            polynomial = np.poly1d(np.polyfit(x, y, degree))
        except np.linalg.LinAlgError:
            print(f"Failed to fit polynomial for {col} due to insufficient data.")
            full_df[col] = np.nan
            continue

        # Evaluate the trend on every day of the span
        fitted = polynomial(all_day_offsets)

        # Ensure estimates do not fall below the minimum observed value
        min_value = non_null_data[col].min()
        fitted = np.where(fitted < min_value, min_value, fitted)

        # Round values to 2 decimal places
        full_df[col] = np.round(fitted, 2)

    # Outer merge adds rows for days missing from the input; the estimate
    # columns arrive with the '_interpolated' suffix.
    df = pd.merge(df, full_df, on=date_col, how='outer', suffixes=('', '_interpolated'))

    # Keep observed values, fall back to the estimate only where they are missing
    for col in value_cols:
        df[col] = df[col].combine_first(df[f'{col}_interpolated'])

    return df.drop(columns=[f'{col}_interpolated' for col in value_cols])

# Columns whose gaps should be filled.
# NOTE(review): 'wdir' looks like a compass bearing in degrees; a polynomial
# fit does not account for wrap-around at 360 -> 0 — confirm this is acceptable.
value_columns = ['tavg', 'tmin', 'tmax', 'wdir', 'wspd', 'wpgt', 'pres']

# Perform polynomial interpolation on the weather data
interpolated_weather_df = polynomial_interpolation(weather_df, 'date', value_columns, max_degree=3)

# Build the output path by swapping only a trailing '.csv' extension.
# str.replace would rewrite every '.csv' occurrence in the path, and if none
# matched it would return the input path, so the save would overwrite the
# source data.
path_stem = weather_data_path[:-4] if weather_data_path.lower().endswith('.csv') else weather_data_path
interpolated_weather_file_path = f"{path_stem}_interpolated.csv"

# Save the interpolated data to the new CSV file (without the index column)
interpolated_weather_df.to_csv(interpolated_weather_file_path, index=False)
print(f"Interpolated weather data saved to {interpolated_weather_file_path}")


Interpolated weather data saved to D:\File_auto\0_UCL_CASA\OneDrive - University College London\Xiaoyi_dissertation\Analysis\Data\meteostat\meteostat2022_interpolated.csv
