In [7]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.impute import KNNImputer
import requests
import os
import datetime

In [67]:
def download_synops(year_start: int, year_end: int, start_month: int, end_month: int, path: str):
        """
            Download synop data from meteofrance
            :param year_start: start year
            :param year_end: end year
            :param start_month: start month
            :param end_month: end month
            :param path: path to save the csv file
            :return: list of downloaded files
        """
        
        # Base URL
        base_url = "https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop."
        
        # Create the directory if it doesn't exist
        os.makedirs(path, exist_ok=True)
        
        downloaded_files = []
        
        # Generate all months between start and end dates
        current_date = datetime.datetime(year_start, start_month, 1)
        end_date = datetime.datetime(year_end, end_month, 1)
        
        while current_date <= end_date:
            year_month = current_date.strftime("%Y%m")
            url = f"{base_url}{year_month}.csv.gz"
            output_file = os.path.join(path, f"synop.{year_month}.csv.gz")
            
            try:
                print(f"Downloading {url}...")
                response = requests.get(url)
                response.raise_for_status()
                
                with open(output_file, 'wb') as f:
                    f.write(response.content)
                
                downloaded_files.append(output_file)
                print(f"Downloaded to {output_file}")
                
            except requests.exceptions.RequestException as e:
                print(f"Error downloading {url}: {e}")
            
            # Move to next month
            if current_date.month == 12:
                current_date = datetime.datetime(current_date.year + 1, 1, 1)
            else:
                current_date = datetime.datetime(current_date.year, current_date.month + 1, 1)
        
        print(f"Downloaded {len(downloaded_files)} files")
        return downloaded_files

# Download synop data
download_synops(2018, 2024, 1, 12, "synop_data")

def unzip(file_path: str):
    """
        Unzip the gz file
        :param file_path: path to the gz file
        :return: None
    """
    import gzip
    import shutil

    with gzip.open(file_path, 'rb') as f_in:
        with open(file_path[:-3], 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(file_path)  # Remove the gz file after extraction
# Unzip all downloaded files
for file in os.listdir("synop_data"):
    if file.endswith(".gz"):
        unzip(os.path.join("synop_data", file))

Downloading https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.201801.csv.gz...
Downloaded to synop_data/synop.201801.csv.gz
Downloading https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.201802.csv.gz...
Downloaded to synop_data/synop.201801.csv.gz
Downloading https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.201802.csv.gz...
Downloaded to synop_data/synop.201802.csv.gz
Downloading https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.201803.csv.gz...
Downloaded to synop_data/synop.201802.csv.gz
Downloading https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.201803.csv.gz...
Downloaded to synop_data/synop.201803.csv.gz
Downloading https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archive/synop.201804.csv.gz...
Downloaded to synop_data/synop.201803.csv.gz
Downloading https://donneespubliques.meteofrance.fr/donnees_libres/Txt/Synop/Archi

In [139]:
class synop:
    def __init__(self):
        """
            This class will handle the synop data
        """
        self.data = None

    def convert_date(self):
        """
            This function will convert the date column to datetime format
            Date format imput: 20240601000000
            Date format output: 2024-06-01_00:00:00
        """
        self.data['date'] = pd.to_datetime(self.data['date'], format='%Y%m%d%H%M%S')
        self.data['date'] = self.data['date'].dt.strftime('%Y-%m-%d_%H:%M:%S')
        return self.data
    
    def column_to_keep(self, path:str, columns:list):
        """
            This function will keep only the columns that are needed for the analysis
            :param path: path to the csv file
            :param columns: list of columns to keep
            :return: dataframe with only the columns that are needed for the analysis
        """
        self.data = pd.read_csv(path, usecols=columns)
        return self.data
    
    def knn_imputer(self, path, columns):
        """
            This function will impute the missing values using KNN imputer of sklearn
            
            :param path: path to the csv file
            :param columns: list of column names to apply imputation to (if None, imputes all numeric columns)
            :return: dataframe with imputed values
        """
        
        try:
            print(f"Reading data from {path}")
            self.data = pd.read_csv(path)
            
            if columns is None:
                # If no specific columns are specified, use all numeric columns except 'date'
                numeric_cols = self.data.select_dtypes(include=np.number).columns.tolist()
                if 'date' in numeric_cols:
                    numeric_cols.remove('date')
                columns_to_impute = numeric_cols
            else:
                # Use the specified columns, ensuring they exist in the dataframe
                columns_to_impute = [col for col in columns if col in self.data.columns]
                if len(columns_to_impute) < len(columns):
                    missing = set(columns) - set(columns_to_impute)
                    print(f"Warning: The following columns were not found in the dataframe: {missing}")
            
            # Check if there are any columns left to impute
            if not columns_to_impute:
                print("No valid numeric columns to impute")
                return self.data
                
            print(f"Applying KNN imputation to columns: {columns_to_impute}")
            
            # Convert columns to numeric type if they aren't already
            for col in columns_to_impute:
                self.data[col] = pd.to_numeric(self.data[col], errors='coerce')
            
            # Create a copy of the data for imputation
            data_to_impute = self.data[columns_to_impute].copy()
            
            # Apply KNN imputation
            imputer = KNNImputer(n_neighbors=5)
            imputed_data = imputer.fit_transform(data_to_impute)
            
            # Update only the specified columns with imputed values
            self.data[columns_to_impute] = imputed_data
            
            print(f"Successfully imputed missing values in {len(columns_to_impute)} columns")
            return self.data
            
        except Exception as e:
            print(f"Error during KNN imputation: {e}")
            import traceback
            traceback.print_exc()
            return self.data

    def data_merger(self, paths:list[str], sep=None, chunk_size=10):
        """
            Merge multiple csv files into one dataframe with improved error handling
            :param paths: list of paths to csv files
            :param sep: separator used in csv files (auto-detect if None)
            :param chunk_size: number of files to process at once to save memory
            :return: merged dataframe
        """
        if not paths:
            print("No files to merge")
            return None
        
        # Initialize empty dataframe
        self.data = pd.DataFrame()
        total_files = len(paths)
        processed_files = 0
        
        # Process files in chunks to avoid memory issues
        for i in range(0, total_files, chunk_size):
            chunk_paths = paths[i:i+chunk_size]
            chunk_dfs = []
            
            for path in chunk_paths:
                try:
                    # Auto-detect separator if not specified
                    if sep is None:
                        with open(path, 'r') as f:
                            first_line = f.readline().strip()
                            if ';' in first_line:
                                file_sep = ';'
                            else:
                                file_sep = ','
                    else:
                        file_sep = sep
                        
                    # Try reading with detected separator
                    df = pd.read_csv(path, low_memory=False, sep=file_sep)
                    chunk_dfs.append(df)
                    processed_files += 1
                    
                    # Print progress
                    if processed_files % 5 == 0 or processed_files == total_files:
                        print(f"Processed {processed_files}/{total_files} files ({processed_files/total_files*100:.1f}%)")
                        
                except Exception as e:
                    print(f"Error reading {path}: {e}")
            
            # Merge chunk of dataframes
            if chunk_dfs:
                chunk_merged = pd.concat(chunk_dfs, ignore_index=True)
                self.data = pd.concat([self.data, chunk_merged], ignore_index=True)
        
        print(f"Successfully merged {processed_files} out of {total_files} files")
        print(f"Resulting dataframe has {self.data.shape[0]} rows and {self.data.shape[1]} columns")
        return self.data

    def to_csv(self, path:str):
        """
            Save the dataframe to csv file
        """
        self.data.to_csv(path, index=False)
        return self.data
    
    def change_separator(self, path:str, separator:str = ";"):
        """
            Change the separator of the csv file
        """
        self.data = pd.read_csv(path, sep=separator)
        self.data.to_csv(path, sep=",", index=False)
        return self.data
    
    def filter_stations(self, path:str, stations:list):
        """
            Keep only the stations id that are in the list
            :param path: path to the csv file
            :param stations: list of stations to keep
            :return: dataframe with only the stations that are needed for the analysis
        """
        try:
            # Read the CSV file
            print(f"Reading data from {path}")
            data = pd.read_csv(path)
            # Ensure stations are strings and strip any whitespace
            stations = [str(s).strip() for s in stations]
            # Ensure 'numer_sta' is string and strip whitespace
            data['numer_sta'] = data['numer_sta'].astype(str).str.strip()
            # Filter the DataFrame using isin
            filtered_data = data[data['numer_sta'].isin(stations)]
            # If the filtered data is empty, print a warning
            if filtered_data.empty:
                print("Warning: No matching stations found!")
            self.data = filtered_data
            return self.data
        except Exception as e:
            print(f"Error filtering stations: {e}")
            import traceback
            traceback.print_exc()
            self.data = pd.DataFrame()
            return self.data

    def filter_time(self, path:str, hours:list):
        """
            Keep only the hours that are in the list
            :param path: path to the csv file
            :param hours: list of hours to keep
            :return: dataframe with only the hours that are needed for the analysis
        """
        try:
            # Read the csv file
            print(f"Reading data from {path}")
            data = pd.read_csv(path)
            # Convert hours to string
            hours = [str(h) for h in hours]
            # Convert 'date' column to datetime objects with specified format
            data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d_%H:%M:%S')
            # Extract hour from the 'date' column
            data['hour'] = data['date'].dt.hour.astype(str)
            # Filter the DataFrame using isin
            filtered_data = data[data['hour'].isin(hours)]
            # If the filtered data is empty, print a warning
            if filtered_data.empty:
                print("Warning: No matching hours found!")
            # Drop the temporary 'hour' column
            filtered_data = filtered_data.drop('hour', axis=1)
            self.data = filtered_data
            return self.data
        except Exception as e:
            print(f"Error filtering hours: {e}")
            import traceback
            traceback.print_exc()
            self.data = pd.DataFrame()
            return self.data

    def fill_missing_values(self, path:str):
        """
          If the nbas column is = 0 then replace the value of hbas, nnuage1, hnuage1 with 0 
        """
        try:
            # Read the CSV file
            print(f"Reading data from {path}")
            self.data = pd.read_csv(path)
            
            # Convert nbas to numeric (in case it's stored as string)
            self.data['nbas'] = pd.to_numeric(self.data['nbas'], errors='coerce')
            
            # Where nbas = 0, replace hbas, nnuage1, hnuage1 with 0
            mask = self.data['nbas'] == 0
            self.data.loc[mask, ['hbas', 'nnuage1', 'hnuage1']] = 0
            
            print(f"Filled missing values where nbas=0 for {mask.sum()} rows")
            return self.data
            
        except Exception as e:
            print(f"Error filling missing values: {e}")
            import traceback
            traceback.print_exc()
            return self.data
    
    def replace_mq(self, path:str):
        """
            Replace every 'mq' with NaN value
            :param path: path to the csv file
            :return: dataframe with 'mq' replaced by NaN
        """
        try:
            # Read the CSV file
            print(f"Reading data from {path}")
            self.data = pd.read_csv(path)
            
            # Replace 'mq' with NaN
            self.data = self.data.replace('mq', np.nan)
            
            # Convert numeric columns to appropriate types
            numeric_cols = ['ff', 't', 'td', 'u', 'nbas', 'hbas', 'pres', 'tminsol', 'rr12', 'nnuage1', 'hnuage1']
            for col in numeric_cols:
                if col in self.data.columns:
                    self.data[col] = pd.to_numeric(self.data[col], errors='coerce')
            
            print(f"Replaced 'mq' with NaN in dataframe with {self.data.shape[0]} rows")
            return self.data
            
        except Exception as e:
            print(f"Error replacing 'mq': {e}")
            import traceback
        
        
synop = synop()

In [69]:
# List of columns to keep
list_of_files = [os.path.join("synop_data", file) for file in os.listdir("synop_data") if file.endswith(".csv") and file != "synop.csv"]
print(f"Found {len(list_of_files)} CSV files to process")
synop.data_merger(list_of_files)
synop.to_csv("synop_data/synop.csv")

Found 84 CSV files to process
Processed 5/84 files (6.0%)
Processed 5/84 files (6.0%)
Processed 10/84 files (11.9%)
Processed 10/84 files (11.9%)
Processed 15/84 files (17.9%)
Processed 15/84 files (17.9%)
Processed 20/84 files (23.8%)
Processed 20/84 files (23.8%)
Processed 25/84 files (29.8%)
Processed 25/84 files (29.8%)
Processed 30/84 files (35.7%)
Processed 30/84 files (35.7%)
Processed 35/84 files (41.7%)
Processed 35/84 files (41.7%)
Processed 40/84 files (47.6%)
Processed 40/84 files (47.6%)
Processed 45/84 files (53.6%)
Processed 45/84 files (53.6%)
Processed 50/84 files (59.5%)
Processed 50/84 files (59.5%)
Processed 55/84 files (65.5%)
Processed 55/84 files (65.5%)
Processed 60/84 files (71.4%)
Processed 60/84 files (71.4%)
Processed 65/84 files (77.4%)
Processed 65/84 files (77.4%)
Processed 70/84 files (83.3%)
Processed 70/84 files (83.3%)
Processed 75/84 files (89.3%)
Processed 75/84 files (89.3%)
Processed 80/84 files (95.2%)
Processed 80/84 files (95.2%)
Processed 84/8

Unnamed: 0,numer_sta,date,pmer,tend,cod_tend,dd,ff,t,td,u,...,nnuage2,ctype2,hnuage2,nnuage3,ctype3,hnuage3,nnuage4,ctype4,hnuage4,Unnamed: 59
0,7005,20240601000000,102010,80,1,340,4.600000,285.150000,284.050000,93,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
1,7015,20240601000000,101870,90,1,340,4.600000,286.350000,285.250000,93,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
2,7020,20240601000000,102330,50,2,40,2.900000,286.650000,284.650000,88,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
3,7027,20240601000000,102210,60,1,10,5.500000,285.750000,283.650000,87,...,8,mq,690,mq,mq,mq,mq,mq,mq,
4,7037,20240601000000,102000,100,1,360,6.400000,285.250000,284.150000,93,...,5,mq,510,8,mq,720,mq,mq,mq,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208350,81401,20240831210000,101020,70,3,260,3.400000,298.450000,295.750000,85,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
1208351,81405,20240831210000,100990,-60,5,90,4.900000,303.350000,298.450000,75,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
1208352,81408,20240831210000,101000,60,5,110,1.500000,303.150000,297.750000,73,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,
1208353,81415,20240831210000,100990,110,6,350,1.100000,300.850000,296.450000,77,...,mq,mq,mq,mq,mq,mq,mq,mq,mq,


In [70]:
synop.column_to_keep("synop_data/synop.csv", ['date', 'numer_sta', 'pres', 'ff', 't', 'td', 'u' ,'nbas', 'hbas', 'tminsol', 'nnuage1', 'hnuage1', 'rr12'])
synop.convert_date()
synop.to_csv("synop_data/synop.csv")

Unnamed: 0,numer_sta,date,ff,t,td,u,nbas,hbas,pres,tminsol,rr12,nnuage1,hnuage1
0,7005,2024-06-01_00:00:00,4.600000,285.150000,284.050000,93,mq,mq,101140,286.150000,3.300000,mq,mq
1,7015,2024-06-01_00:00:00,4.600000,286.350000,285.250000,93,8,250,101300,287.650000,7.600000,8,240
2,7020,2024-06-01_00:00:00,2.900000,286.650000,284.650000,88,mq,mq,102220,mq,0.000000,mq,mq
3,7027,2024-06-01_00:00:00,5.500000,285.750000,283.650000,87,8,450,101400,287.750000,1.400000,2,450
4,7037,2024-06-01_00:00:00,6.400000,285.250000,284.150000,93,8,450,100110,286.350000,2.600000,3,330
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208350,81401,2024-08-31_21:00:00,3.400000,298.450000,295.750000,85,mq,mq,100960,mq,3.200000,mq,mq
1208351,81405,2024-08-31_21:00:00,4.900000,303.350000,298.450000,75,0,mq,100900,299.750000,-0.100000,mq,mq
1208352,81408,2024-08-31_21:00:00,1.500000,303.150000,297.750000,73,mq,mq,100920,mq,0.000000,mq,mq
1208353,81415,2024-08-31_21:00:00,1.100000,300.850000,296.450000,77,mq,mq,99790,mq,0.200000,mq,mq


In [92]:
synop.filter_stations("synop_data/synop.csv",["7110", "7130", "7027", "7222"])
synop.to_csv("synop_data/synop.csv")

Reading data from synop_data/synop.csv


Unnamed: 0,numer_sta,date,ff,t,td,u,nbas,hbas,pres,tminsol,rr12,nnuage1,hnuage1
3,7027,2024-06-01_00:00:00,5.500000,285.750000,283.650000,87,8,450,101400,287.750000,1.400000,2,450
6,7110,2024-06-01_00:00:00,2.100000,284.950000,284.350000,96,6,1250,101240,282.550000,0.000000,6,1320
8,7130,2024-06-01_00:00:00,3.900000,287.350000,283.550000,78,6,800,101760,287.650000,0.000000,2,660
14,7222,2024-06-01_00:00:00,2.600000,286.350000,283.550000,83,1,800,101800,288.150000,0.000000,1,780
64,7027,2024-06-01_03:00:00,4.100000,285.150000,282.750000,85,8,450,101440,287.450000,1.200000,8,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1208247,7222,2024-08-31_18:00:00,2.000000,298.950000,293.050000,70,7,1250,101080,290.250000,0.000000,1,1050
1208296,7027,2024-08-31_21:00:00,2.000000,292.550000,291.050000,91,0,mq,100880,291.050000,0.000000,mq,mq
1208299,7110,2024-08-31_21:00:00,2.900000,289.550000,288.550000,94,2,7440,100390,290.150000,0.000000,2,7440
1208301,7130,2024-08-31_21:00:00,2.400000,294.950000,290.950000,78,5,6420,101040,291.550000,0.000000,2,6420


In [113]:
synop.filter_time("synop_data/synop.csv", ["9", "12", "15"])
synop.replace_mq("synop_data/synop.csv")
synop.to_csv("synop_data/synop.csv")

Reading data from synop_data/synop.csv
Error filtering hours: time data "2024-06-01 09:00:00" doesn't match format "%Y-%m-%d_%H:%M:%S", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
Reading data from synop_data/synop.csv
Replaced 'mq' with NaN in dataframe with 30578 rows


Traceback (most recent call last):
  File "/var/folders/bn/r4hyss1n513d7wzwklr31yqh0000gn/T/ipykernel_2462/2023093969.py", line 152, in filter_time
    data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d_%H:%M:%S')
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/baptistecaillerie/Documents/Soaring AI/.venv/lib/python3.11/site-packages/pandas/core/tools/datetimes.py", line 1063, in to_datetime
    cache_array = _maybe_cache(arg, format, cache, convert_listlike)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/baptistecaillerie/Documents/Soaring AI/.venv/lib/python3.11/site-packages/pandas/core/tools/datetimes.py", line 247, in _maybe_cache
    cache_dates = convert_listlike(unique_dates, format)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/baptistecaillerie/Documents/Soaring AI/.venv/lib/python3.11/site-packages/pandas/core/tools/datetimes.py", line 433, in _convert_listlike

Unnamed: 0,numer_sta,date,ff,t,td,u,nbas,hbas,pres,tminsol,rr12,nnuage1,hnuage1
0,7027,2024-06-01 09:00:00,5.0,285.95,281.25,73.0,8.0,800.0,101620.0,287.25,-0.1,8.0,600.0
1,7110,2024-06-01 09:00:00,7.7,288.65,281.15,61.0,0.0,0.0,101390.0,280.85,0.0,0.0,0.0
2,7130,2024-06-01 09:00:00,4.0,286.85,282.85,77.0,8.0,800.0,101970.0,286.95,0.0,7.0,600.0
3,7222,2024-06-01 09:00:00,4.4,288.25,282.85,70.0,8.0,800.0,102010.0,287.95,0.0,8.0,900.0
4,7027,2024-06-01 12:00:00,5.2,285.85,282.65,81.0,8.0,450.0,101710.0,287.25,-0.1,8.0,510.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30573,7222,2024-08-31 12:00:00,1.7,297.65,292.25,72.0,2.0,800.0,101310.0,289.95,-0.1,2.0,870.0
30574,7027,2024-08-31 15:00:00,3.1,295.25,289.75,71.0,7.0,800.0,100990.0,290.65,0.0,7.0,600.0
30575,7110,2024-08-31 15:00:00,4.3,296.05,290.75,72.0,1.0,800.0,100450.0,288.95,6.1,1.0,750.0
30576,7130,2024-08-31 15:00:00,1.9,297.45,290.95,67.0,6.0,1750.0,101170.0,288.55,0.0,6.0,1560.0


In [109]:
synop.fill_missing_values("synop_data/synop.csv")
synop.to_csv("synop_data/synop.csv")

Reading data from synop_data/synop.csv
Filled missing values where nbas=0 for 6087 rows


Unnamed: 0,numer_sta,date,ff,t,td,u,nbas,hbas,pres,tminsol,rr12,nnuage1,hnuage1
0,7027,2024-06-01 09:00:00,5.000000,285.950000,281.250000,73,8.0,800,101620,287.250000,-0.100000,8,600
1,7110,2024-06-01 09:00:00,7.700000,288.650000,281.150000,61,0.0,0,101390,280.850000,0.000000,0,0
2,7130,2024-06-01 09:00:00,4.000000,286.850000,282.850000,77,8.0,800,101970,286.950000,0.000000,7,600
3,7222,2024-06-01 09:00:00,4.400000,288.250000,282.850000,70,8.0,800,102010,287.950000,0.000000,8,900
4,7027,2024-06-01 12:00:00,5.200000,285.850000,282.650000,81,8.0,450,101710,287.250000,-0.100000,8,510
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30573,7222,2024-08-31 12:00:00,1.700000,297.650000,292.250000,72,2.0,800,101310,289.950000,-0.100000,2,870
30574,7027,2024-08-31 15:00:00,3.100000,295.250000,289.750000,71,7.0,800,100990,290.650000,0.000000,7,600
30575,7110,2024-08-31 15:00:00,4.300000,296.050000,290.750000,72,1.0,800,100450,288.950000,6.100000,1,750
30576,7130,2024-08-31 15:00:00,1.900000,297.450000,290.950000,67,6.0,1750,101170,288.550000,0.000000,6,1560


In [115]:
#count the number of missing values
synop.data.isnull().sum()



numer_sta       0
date            0
ff             18
t               8
td             14
u              14
nbas          194
hbas          104
pres           11
tminsol       201
rr12          484
nnuage1       600
hnuage1      1336
dtype: int64

In [143]:
# Now call the knn_imputer with the list of columns
synop.knn_imputer("synop_data/synop.csv", ['ff', 't', 'td', 'u', 'pres', 'rr12', 'tminsol', 'hbas', 'nnuage1', 'hnuage1', 'nbas'])
synop.data.isnull().sum()
synop.to_csv("synop_data/synop.csv")

Reading data from synop_data/synop.csv
Applying KNN imputation to columns: ['ff', 't', 'td', 'u', 'pres', 'rr12', 'tminsol', 'hbas', 'nnuage1', 'hnuage1', 'nbas']
Successfully imputed missing values in 11 columns


Unnamed: 0,numer_sta,date,ff,t,td,u,nbas,hbas,pres,tminsol,rr12,nnuage1,hnuage1
0,7027,2024-06-01 09:00:00,5.0,285.95,281.25,73.0,8.0,800.0,101620.0,287.25,-0.1,8.0,600.0
1,7110,2024-06-01 09:00:00,7.7,288.65,281.15,61.0,0.0,0.0,101390.0,280.85,0.0,0.0,0.0
2,7130,2024-06-01 09:00:00,4.0,286.85,282.85,77.0,8.0,800.0,101970.0,286.95,0.0,7.0,600.0
3,7222,2024-06-01 09:00:00,4.4,288.25,282.85,70.0,8.0,800.0,102010.0,287.95,0.0,8.0,900.0
4,7027,2024-06-01 12:00:00,5.2,285.85,282.65,81.0,8.0,450.0,101710.0,287.25,-0.1,8.0,510.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
30573,7222,2024-08-31 12:00:00,1.7,297.65,292.25,72.0,2.0,800.0,101310.0,289.95,-0.1,2.0,870.0
30574,7027,2024-08-31 15:00:00,3.1,295.25,289.75,71.0,7.0,800.0,100990.0,290.65,0.0,7.0,600.0
30575,7110,2024-08-31 15:00:00,4.3,296.05,290.75,72.0,1.0,800.0,100450.0,288.95,6.1,1.0,750.0
30576,7130,2024-08-31 15:00:00,1.9,297.45,290.95,67.0,6.0,1750.0,101170.0,288.55,0.0,6.0,1560.0


In [144]:
# Don't include numer_sta and the date columns in the final file
synop.data.drop(columns=['numer_sta', 'date'], inplace=True)
synop.to_csv("synop_data/synop_final.csv")

Unnamed: 0,ff,t,td,u,nbas,hbas,pres,tminsol,rr12,nnuage1,hnuage1
0,5.0,285.95,281.25,73.0,8.0,800.0,101620.0,287.25,-0.1,8.0,600.0
1,7.7,288.65,281.15,61.0,0.0,0.0,101390.0,280.85,0.0,0.0,0.0
2,4.0,286.85,282.85,77.0,8.0,800.0,101970.0,286.95,0.0,7.0,600.0
3,4.4,288.25,282.85,70.0,8.0,800.0,102010.0,287.95,0.0,8.0,900.0
4,5.2,285.85,282.65,81.0,8.0,450.0,101710.0,287.25,-0.1,8.0,510.0
...,...,...,...,...,...,...,...,...,...,...,...
30573,1.7,297.65,292.25,72.0,2.0,800.0,101310.0,289.95,-0.1,2.0,870.0
30574,3.1,295.25,289.75,71.0,7.0,800.0,100990.0,290.65,0.0,7.0,600.0
30575,4.3,296.05,290.75,72.0,1.0,800.0,100450.0,288.95,6.1,1.0,750.0
30576,1.9,297.45,290.95,67.0,6.0,1750.0,101170.0,288.55,0.0,6.0,1560.0
