In [1]:
import os
import pandas as pd

In [2]:
# Accumulator for the loaded data; the loading cell fills it with one
# DataFrame per CSV file, keyed by the file name without its extension.
dataframes = dict()

# Folder that is scanned for the raw input CSV files.
# NOTE(review): this is an absolute, machine-specific path — the notebook only
# runs unchanged on a machine where this exact folder exists.
directory_path = "/Users/mattzavala/Documents/Data Science Bootcamp/Project 4/CARE_To_Compare/Wind Farm A/datasets"

In [3]:
# Load every '.csv' file found in directory_path into the dataframes dict,
# keyed by the file name without its extension.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the load order (and the order of every later loop over dataframes)
# reproducible from one run or machine to the next.
for filename in sorted(os.listdir(directory_path)):
    if not filename.endswith('.csv'):
        continue  # skip anything that is not a CSV file

    file_path = os.path.join(directory_path, filename)
    try:
        # The input files are parsed with ';' as the field separator.
        df = pd.read_csv(file_path, sep=";")
    except pd.errors.EmptyDataError:
        print(f"Warning: {file_path} is empty and will be skipped.")
    except pd.errors.ParserError:
        print(f"Error: {file_path} does not contain any columns to parse or is malformed.")
    except Exception as e:
        # Best-effort load: report the failure and carry on with the other files.
        print(f"An unexpected error occurred while reading {file_path}: {e}")
    else:
        # Only reached when the file was read successfully.
        key = os.path.splitext(filename)[0]
        dataframes[key] = df


# Show the first rows of each loaded DataFrame as a quick sanity check.
for key in dataframes:
    df = dataframes[key]
    print(f"DataFrame for {key}:")
    print(df.head())


DataFrame for 68:
            time_stamp  asset_id  id train_test  status_type_id  sensor_0_avg  \
0  2014-07-29 13:20:00        11   0      train               5          31.0   
1  2014-07-29 13:30:00        11   1      train               5          31.0   
2  2014-07-29 13:40:00        11   2      train               5          31.0   
3  2014-07-29 13:50:00        11   3      train               0          32.0   
4  2014-07-29 14:00:00        11   4      train               0          32.0   

   sensor_1_avg  sensor_2_avg  wind_speed_3_avg  wind_speed_4_avg  ...  \
0         152.0          48.7               3.9               3.9  ...   
1          86.1         150.9               6.0               6.0  ...   
2         115.2          69.6               6.3               6.3  ...   
3         129.3         -29.1               6.0               5.9  ...   
4         137.7          26.4               7.1               6.9  ...   

   sensor_47  sensor_48  sensor_49  sensor_50  sen

In [4]:
# For each DataFrame, report the rows that have a missing value in any column.
for key, df in dataframes.items():
    # Boolean mask: True for rows with at least one null cell.
    has_null = df.isnull().any(axis=1)
    null_rows = df[has_null]

    if null_rows.empty:
        print(f"DataFrame for {key} does not contain any null values.")
        continue

    print(f"DataFrame for {key} contains null values in the following rows:")
    print(null_rows)

DataFrame for 68 does not contain any null values.
DataFrame for 40 contains null values in the following rows:
                time_stamp  asset_id     id  train_test  status_type_id  \
55632  2023-01-22 19:00:00        10  55632  prediction               0   
55633  2023-01-22 19:40:00        10  55633  prediction               0   

       sensor_0_avg  sensor_1_avg  sensor_2_avg  wind_speed_3_avg  \
55632          12.0         301.1           2.6               3.5   
55633          11.0         283.9          -9.0               4.0   

       wind_speed_4_avg  ...  sensor_47  sensor_48  sensor_49  sensor_50  \
55632               3.5  ...     -635.0     -270.0        0.0    -1043.0   
55633               3.7  ...     -510.0        3.0        0.0     -180.0   

       sensor_51  sensor_52_avg  sensor_52_max  sensor_52_min  sensor_52_std  \
55632     -905.0            5.4           13.0            0.0            4.6   
55633     -507.0            5.1           12.7            0.0    

In [5]:
# Replace every DataFrame with a version that has all rows containing a null removed.
for key in dataframes:
    # Only values of existing keys are reassigned, so the dict does not change
    # size while it is being iterated.
    dataframes[key] = dataframes[key].dropna()
    print(f"Dropped null rows from DataFrame for {key}. New shape: {dataframes[key].shape}")

Dropped null rows from DataFrame for 68. New shape: (54358, 86)
Dropped null rows from DataFrame for 40. New shape: (56156, 86)
Dropped null rows from DataFrame for 69. New shape: (54812, 86)
Dropped null rows from DataFrame for 42. New shape: (53883, 86)
Dropped null rows from DataFrame for 0. New shape: (54985, 86)
Dropped null rows from DataFrame for 84. New shape: (53771, 86)
Dropped null rows from DataFrame for 92. New shape: (54067, 86)
Dropped null rows from DataFrame for 3. New shape: (55485, 86)
Dropped null rows from DataFrame for 45. New shape: (53739, 86)
Dropped null rows from DataFrame for 22. New shape: (53035, 86)
Dropped null rows from DataFrame for 25. New shape: (54712, 86)
Dropped null rows from DataFrame for 24. New shape: (55003, 86)
Dropped null rows from DataFrame for 26. New shape: (53701, 86)
Dropped null rows from DataFrame for 17. New shape: (55087, 86)
Dropped null rows from DataFrame for 14. New shape: (54197, 86)
Dropped null rows from DataFrame for 10. N

In [6]:
# Remove the "train_test" column from every DataFrame that has it.
for key, df in dataframes.items():
    if "train_test" not in df.columns:
        # Nothing to drop for this DataFrame; just report it.
        print(f"DataFrame for {key} does not contain a column named 'train_test'.")
        continue

    # Store the reduced DataFrame back under the same key and report its new shape.
    dataframes[key] = df.drop(columns=["train_test"])
    print(f"Dropped column 'train_test' from DataFrame for {key}. New shape: {dataframes[key].shape}")

Dropped column 'train_test' from DataFrame for 68. New shape: (54358, 85)
Dropped column 'train_test' from DataFrame for 40. New shape: (56156, 85)
Dropped column 'train_test' from DataFrame for 69. New shape: (54812, 85)
Dropped column 'train_test' from DataFrame for 42. New shape: (53883, 85)
Dropped column 'train_test' from DataFrame for 0. New shape: (54985, 85)
Dropped column 'train_test' from DataFrame for 84. New shape: (53771, 85)
Dropped column 'train_test' from DataFrame for 92. New shape: (54067, 85)
Dropped column 'train_test' from DataFrame for 3. New shape: (55485, 85)
Dropped column 'train_test' from DataFrame for 45. New shape: (53739, 85)
Dropped column 'train_test' from DataFrame for 22. New shape: (53035, 85)
Dropped column 'train_test' from DataFrame for 25. New shape: (54712, 85)
Dropped column 'train_test' from DataFrame for 24. New shape: (55003, 85)
Dropped column 'train_test' from DataFrame for 26. New shape: (53701, 85)
Dropped column 'train_test' from DataFra

In [10]:
# Remove the "id" column from every DataFrame that has it.
for key, df in dataframes.items():
    if "id" not in df.columns:
        # Nothing to drop for this DataFrame; just report it.
        print(f"DataFrame for {key} does not contain a column named 'id'.")
        continue

    # Store the reduced DataFrame back under the same key and report its new shape.
    dataframes[key] = df.drop(columns=["id"])
    print(f"Dropped column 'id' from DataFrame for {key}. New shape: {dataframes[key].shape}")

Dropped column 'id' from DataFrame for 68. New shape: (54358, 84)
Dropped column 'id' from DataFrame for 40. New shape: (56156, 84)
Dropped column 'id' from DataFrame for 69. New shape: (54812, 84)
Dropped column 'id' from DataFrame for 42. New shape: (53883, 84)
Dropped column 'id' from DataFrame for 0. New shape: (54985, 84)
Dropped column 'id' from DataFrame for 84. New shape: (53771, 84)
Dropped column 'id' from DataFrame for 92. New shape: (54067, 84)
Dropped column 'id' from DataFrame for 3. New shape: (55485, 84)
Dropped column 'id' from DataFrame for 45. New shape: (53739, 84)
Dropped column 'id' from DataFrame for 22. New shape: (53035, 84)
Dropped column 'id' from DataFrame for 25. New shape: (54712, 84)
Dropped column 'id' from DataFrame for 24. New shape: (55003, 84)
Dropped column 'id' from DataFrame for 26. New shape: (53701, 84)
Dropped column 'id' from DataFrame for 17. New shape: (55087, 84)
Dropped column 'id' from DataFrame for 14. New shape: (54197, 84)
Dropped colu

In [11]:
# Folder (relative to the current working directory) that receives the cleaned CSV files.
# NOTE(review): the input path configured earlier in this notebook points at
# "Wind Farm A" while this folder is named "Wind_Farm_B_Clean" — confirm which
# wind farm is intended before relying on the exported files.
output_directory = "Wind_Farm_B_Clean"

# DataFrame.to_csv() does not create missing parent folders, so make sure the
# target folder exists; exist_ok=True keeps re-runs of this cell from failing.
os.makedirs(output_directory, exist_ok=True)

# Write each DataFrame to "<output_directory>/<key>.csv".
for key, df in dataframes.items():
    # Create the full file path
    file_path = os.path.join(output_directory, f'{key}.csv')

    # Export without the index column, using ',' as the field separator
    df.to_csv(file_path, index=False, sep=',')

    # Report the file that was written
    print(f'Exported {key} to {file_path}')

Exported 68 to Wind_Farm_B_Clean/68.csv
Exported 40 to Wind_Farm_B_Clean/40.csv
Exported 69 to Wind_Farm_B_Clean/69.csv
Exported 42 to Wind_Farm_B_Clean/42.csv
Exported 0 to Wind_Farm_B_Clean/0.csv
Exported 84 to Wind_Farm_B_Clean/84.csv
Exported 92 to Wind_Farm_B_Clean/92.csv
Exported 3 to Wind_Farm_B_Clean/3.csv
Exported 45 to Wind_Farm_B_Clean/45.csv
Exported 22 to Wind_Farm_B_Clean/22.csv
Exported 25 to Wind_Farm_B_Clean/25.csv
Exported 24 to Wind_Farm_B_Clean/24.csv
Exported 26 to Wind_Farm_B_Clean/26.csv
Exported 17 to Wind_Farm_B_Clean/17.csv
Exported 14 to Wind_Farm_B_Clean/14.csv
Exported 10 to Wind_Farm_B_Clean/10.csv
Exported 38 to Wind_Farm_B_Clean/38.csv
Exported 13 to Wind_Farm_B_Clean/13.csv
Exported 73 to Wind_Farm_B_Clean/73.csv
Exported 72 to Wind_Farm_B_Clean/72.csv
Exported 71 to Wind_Farm_B_Clean/71.csv
