In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyarrow import parquet

In [148]:
df_situ_avg = pd.read_parquet('/kaggle/input/preprocessed-new/situ_avg.parquet')
df_8days_1km_mean = pd.read_parquet('/kaggle/input/preprocessed-new/modis_8days_1km_mean.parquet')

In [3]:
df_situ_original = pd.read_parquet('/kaggle/input/thesis-parquet/Combined_Scandinavia.parquet')

In [4]:
# Extract hours, dates and years
df_situ_original['Hour'] = df_situ_original['Datetime'].dt.hour  # Extract the hour from the 'Datetime'
df_situ_original['Date'] = df_situ_original['Datetime'].dt.date  # Extract the date
df_situ_original['Year'] = df_situ_original['Datetime'].dt.year  # Extract the year for filtering

# Define day and night hour ranges
day_hours = [11, 12, 13, 14]
night_hours = [19, 20, 21, 22]

# Filter data to include only years from 2000 to 2022
df_situ_original = df_situ_original[(df_situ_original['Year'] >= 2000) & (df_situ_original['Year'] <= 2022)]

In [13]:
df_situ_original.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23340301 entries, 49964 to 412773
Data columns (total 43 columns):
 #   Column              Dtype         
---  ------              -----         
 0   Latitude            float64       
 1   Longitude           float64       
 2   Elevation           float64       
 3   Snow_depth          float64       
 4   T_air               float64       
 5   Precipitation       float64       
 6   RH                  float64       
 7   Datetime            datetime64[us]
 8   LWin                float64       
 9   LWOut               float64       
 10  SWin                float64       
 11  SWout               float64       
 12  T_surf              float64       
 13  T_soil_5            float64       
 14  T_soil_10           float64       
 15  T_soil_15           float64       
 16  T_soil_20           float64       
 17  T_soil_25           float64       
 18  T_soil_30           float64       
 19  T_soil_40           float64       
 20  T_s

In [5]:
# Filter data for day and night
day_data = df_situ_original[df_situ_original['Hour'].isin(day_hours)]
night_data = df_situ_original[df_situ_original['Hour'].isin(night_hours)]

# Group by 'Point' and 'Date' to calculate mean for each numeric column, ignoring NaNs
day_avg = (
    day_data.groupby(['Latitude', 'Longitude', 'Date'], as_index=False)
    .agg({'T_air': 'mean', 'Elevation': 'first'})  # Replace 'Temperature' with the actual column name for temperature
    .rename(columns={'T_air': 'T_air_day'})  # Rename the column
)

night_avg = (
    night_data.groupby(['Latitude', 'Longitude', 'Date'], as_index=False)
    .agg({'T_air': 'mean', 'Elevation': 'first'})  # Replace 'Temperature' with the actual column name for temperature
    .rename(columns={'T_air': 'T_air_night'})  # Rename the column
)

In [6]:
# Merge day and night averages on 'Point' and 'Date'
df_situ_avg = pd.merge(day_avg, night_avg, on=['Latitude', 'Longitude', 'Date'], how='outer')

In [8]:
df_situ_avg = df_situ_avg.drop(columns=['Elevation_y']).rename(columns={'Elevation_x': 'Elevation'})

In [11]:
df_situ_avg.to_parquet("situ_avg.parquet", index=False)

In [150]:
# Extract the actual date part using regex
df_8days_1km_mean['date'] = df_8days_1km_mean['system:index'].str.extract(r'(\d{4}_\d{2}_\d{2})')

In [151]:
# Step 1: Convert 'date' column to datetime.date format
df_8days_1km_mean['date'] = pd.to_datetime(df_8days_1km_mean['date'], format='%Y_%m_%d').dt.date

In [152]:
# Step 2: Round latitude and longitude for fuzzy matching
df_8days_1km_mean['Latitude'] = df_8days_1km_mean['Latitude'].round(5)
df_8days_1km_mean['Longitude'] = df_8days_1km_mean['Longitude'].round(5)

In [153]:
# Step 3: Convert specified columns in df2
columns_to_convert = ['LST_Day_1km', 'LST_Night_1km']  # Replace with the actual column names in df2
df_8days_1km_mean[columns_to_convert] = (df_8days_1km_mean[columns_to_convert] * 0.02) - 273.15

In [159]:
# Step 4: Perform the join
# Match on rounded lat/lon and Date
joined_df = pd.merge(
    eight_day_avg, df_8days_1km_mean,
    left_on=['Latitude', 'Longitude', 'Date'],
    right_on=['Latitude', 'Longitude', 'date'],
    how='inner'
)

In [161]:
joined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91525 entries, 0 to 91524
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Latitude       91525 non-null  float64       
 1   Longitude      91525 non-null  float64       
 2   8_day_period   91525 non-null  int32         
 3   T_air_day      83254 non-null  float64       
 4   T_air_night    79566 non-null  float64       
 5   Elevation      88891 non-null  float64       
 6   Date           91525 non-null  datetime64[ns]
 7   Data period    10394 non-null  object        
 8   LST_Day_1km    82870 non-null  float64       
 9   LST_Night_1km  83713 non-null  float64       
 10  Name           91525 non-null  object        
 11  QC_Day         91213 non-null  float64       
 12  QC_Night       90975 non-null  float64       
 13  .geo           91525 non-null  object        
dtypes: datetime64[ns](1), float64(9), int32(1), object(3)
memory usage: 9.

In [160]:
# Step 5: Drop redundant lat/lon columns from df2 if needed
joined_df.drop(columns=['date', 'system:index'], inplace=True)

In [21]:
joined_df[['Longitude', 'Latitude']].drop_duplicates().shape[0]

123

In [163]:
joined_df.to_parquet("8_joined_8day_1km_mean.parquet", index=False)

In [90]:
df_situ_avg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 821907 entries, 0 to 821906
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Latitude     821907 non-null  float64
 1   Longitude    821907 non-null  float64
 2   Date         821907 non-null  object 
 3   T_air_day    738083 non-null  float64
 4   Elevation    783331 non-null  float64
 5   T_air_night  701192 non-null  float64
dtypes: float64(5), object(1)
memory usage: 37.6+ MB


In [97]:
df_situ_avg['Date'] = pd.to_datetime(df_situ_avg['Date'])

# Step 1: Assign each date to an 8-day period within the year
df_situ_avg['Year'] = df_situ_avg['Date'].dt.year
df_situ_avg['Day_of_Year'] = df_situ_avg['Date'].dt.dayofyear

# Compute the 8-day period (1-based index)
df_situ_avg['8_day_period'] = ((df_situ_avg['Day_of_Year'] - 1) // 8) + 1

# Step 2: Handle edge cases for the last period of the year (remaining 5-6 days)
# Identify the number of 8-day periods in each year
df_situ_avg['Total_8_day_periods'] = df_situ_avg['Year'].apply(lambda year: (pd.Timestamp(f'{year}-12-31').day_of_year - 1) // 8 + 1)

# Step 3: Group by Latitude, Longitude, Year, and 8-day period, then calculate the average temperature
eight_day_avg = (
    df_situ_avg.groupby(['Latitude', 'Longitude', 'Year', '8_day_period'])
    .agg({'T_air_day': 'mean', 'T_air_night': 'mean', 'Elevation':'first', 'Date':'min'})  # Replace 'Temperature' with the actual column name
    .reset_index()
)

eight_day_avg.drop(columns=['Year'], inplace=True)

In [158]:
df_8days_1km_mean['date'] = pd.to_datetime(df_8days_1km_mean['date'])

In [124]:
eight_day_avg.sample(20)

Unnamed: 0,Latitude,Longitude,8_day_period,T_air_day,T_air_night,Elevation,Date
51080,68.0833,27.1833,14,4.25,-0.2,247.0,2011-04-15
101923,77.0,15.5,34,1.7,-1.0,10.0,2020-09-22
5059,65.4167,24.1333,21,11.719792,11.896875,13.0,2007-06-10
23215,66.7167,27.1667,46,0.12,-0.02,208.0,2013-12-27
22426,66.5833,26.0167,2,-9.3125,-9.115625,106.0,2012-01-09
45524,67.8167,27.75,19,13.403125,10.015625,243.0,2022-05-25
28102,67.15,20.65,24,14.278125,11.98125,359.0,2019-07-04
61249,68.4667,17.5,1,-6.196875,-6.665625,17.0,2016-01-01
81919,69.6833,18.9167,18,7.85625,5.534375,1.0,2012-05-16
36192,67.36218,26.63755,33,8.957187,7.155156,,2022-09-14


In [None]:
# Assuming 'day_temperature' and 'night_temperature' columns are in both dataframes
joined_df['day_diff'] = joined_df['LST_Day_1km'] - joined_df['T_air_day']
joined_df['night_diff'] = joined_df['LST_Night_1km'] - joined_df['T_air_night']

# Prepare data for scatter plot
scatter_data = joined_df[['day_diff', 'night_diff']]

# Create the scatter plot
plt.figure(figsize=(12, 9))
sns.scatterplot(data=scatter_data, x='day_diff', y='night_diff', alpha=0.7, edgecolor=None)

# Add labels and title
plt.xlabel('Day Temperature Difference (°C)', fontsize=16)
plt.ylabel('Night Temperature Difference (°C)', fontsize=16)
plt.title('Scatter Plot of Day vs Night Temperature Differences', fontsize=20)

# Show plot
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('dayvsnight.png')
plt.show()

In [None]:
# Prepare data for scatter plot
scatter_data = joined_df[['T_air_day', 'day_diff']].sample(6000)

# Create the scatter plot
plt.figure(figsize=(12, 9))
sns.regplot(
    data=scatter_data,
    x='T_air_day',
    y='day_diff',
    order=3,
    scatter_kws={'alpha': 0.7},  # Transparency for scatter points
    line_kws={'color': 'red'},  # Line color for the trend curve
    ci=None  # Removes confidence intervals
)

# Add labels and title
plt.axhline(y=0, color='blue', linestyle='--', linewidth=2, label='y = 0')
plt.xlabel('Day Temperature in SITU (°C)', fontsize=16)
plt.ylabel('Day Temperature Difference (°C)', fontsize=16)
plt.title('Scatter Plot of Day Temperature Difference vs Day Temperature in SITU (6000 Samples)', fontsize=20)

# Show plot
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('dayvsdiff.png')
plt.show()

In [None]:
# Prepare data for scatter plot
scatter_data = joined_df[['T_air_night', 'night_diff']].sample(6000)

# Create the scatter plot
plt.figure(figsize=(12, 9))
sns.regplot(
    data=scatter_data,
    x='T_air_night',
    y='night_diff',
    order=3,
    scatter_kws={'alpha': 0.7},  # Transparency for scatter points
    line_kws={'color': 'red'},  # Line color for the trend curve
    ci=None  # Removes confidence intervals
)

# Add labels and title
plt.axhline(y=0, color='blue', linestyle='--', linewidth=2, label='y = 0')
plt.xlabel('Night Temperature in SITU (°C)', fontsize=16)
plt.ylabel('Night Temperature Difference (°C)', fontsize=16)
plt.title('Scatter Plot of Night Temperature Difference vs Night Temperature in SITU (6000 Samples)', fontsize=20)

# Show plot
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('nightvsdiff.png')
plt.show()

In [None]:
df_8_no = pd.read_parquet('/kaggle/input/preprocessed-parquet/joined_8days_no.parquet')

In [None]:
df_8_no['day_diff'] = df_8_no['value_day'] - df_8_no['T_air_day']
df_8_no['night_diff'] = df_8_no['value_night'] - df_8_no['T_air_night']

In [None]:
df_8_no['qc_night'] = df_8_no['qc_night'].replace('').fillna(0)