In [40]:
import os
import pandas as pd

# Function to extract user number from filename
def user_number_to_sort(filename):
    """Return an integer sort key for file names such as 'U123.csv'.

    The key is the text between the first character and the '.csv'
    suffix, converted with int(). Names that do not end in '.csv', or
    whose middle part is not a valid integer, get the key 0.
    """
    # Guard clause: anything that is not a CSV is keyed as 0.
    if not filename.endswith('.csv'):
        return 0

    # Drop the one-character prefix and the four-character '.csv' suffix.
    number_text = filename[1:-4]
    try:
        number = int(number_text)
    except ValueError:
        # Unexpected naming scheme: fall back to the neutral key.
        number = 0
    return number

In [53]:


# Extract file structure
# Expected layout: <root_path>/<section>/<division>/<user>.csv
root_path = 'C:/Users/dmarc/Desktop/EWELD_Project/EWELD/Electricity Consumption/'

# Initialize metadata container (one list entry per CSV file found)
metadata_ec = {
    'Section': [],
    'Division': [],
    'User': [],
    'Filename': [],
    'Row Count': []
}

# Process directories; every level is sorted so the resulting row order is reproducible
for section in sorted(os.listdir(root_path)):
    section_path = os.path.join(root_path, section)

    if not os.path.isdir(section_path):
        continue  # Skip files

    for division in sorted(os.listdir(section_path)):
        division_path = os.path.join(section_path, division)

        if not os.path.isdir(division_path):
            continue  # Skip files

        # Keep only the CSV files and order them by user number (U2 before U10)
        all_files = os.listdir(division_path)
        sorted_files = sorted(
            (name for name in all_files if name.endswith('.csv')),
            key=user_number_to_sort,
        )

        # Process each CSV file
        for filename in sorted_files:
            file_path = os.path.join(division_path, filename)

            # Count lines without loading the data. errors='replace' keeps a byte
            # that is invalid in the platform default encoding from aborting the
            # whole scan; it does not change the number of lines.
            with open(file_path, 'r', errors='replace') as f:
                line_count = sum(1 for _ in f)

            # Subtract 1 for the header line; clamp so an empty file reports 0, not -1
            row_count = max(line_count - 1, 0)

            # Append metadata
            metadata_ec['Section'].append(section)
            metadata_ec['Division'].append(division)
            metadata_ec['User'].append(filename[:-4])  # Remove .csv
            metadata_ec['Filename'].append(filename)
            metadata_ec['Row Count'].append(row_count)

# Create DataFrame
metadata_df_ec = pd.DataFrame(metadata_ec)

# Show results
print("First 20 entries:")
print(metadata_df_ec.head(20))
print("\nTotal files found:", len(metadata_df_ec))


First 20 entries:
   Section                                           Division User Filename  \
0        A  A01 Crop and animal production, hunting and re...   U1   U1.csv   
1        A  A01 Crop and animal production, hunting and re...   U2   U2.csv   
2        A  A01 Crop and animal production, hunting and re...   U3   U3.csv   
3        A  A01 Crop and animal production, hunting and re...   U4   U4.csv   
4        A                           A02 Forestry and logging   U5   U5.csv   
5        A                           A02 Forestry and logging   U6   U6.csv   
6        A                           A02 Forestry and logging   U7   U7.csv   
7        A                           A02 Forestry and logging   U8   U8.csv   
8        A                           A02 Forestry and logging   U9   U9.csv   
9        A                           A02 Forestry and logging  U10  U10.csv   
10       C                   C10 Manufacture of food products  U11  U11.csv   
11       C                   C10 M

Create a DataFrame of division folder names, one row per unique Div_Sec, from metadata_df_ec

In [None]:
# Work on a copy that holds only the folder label, so metadata_df_ec is not modified
metadata_df_ec_name = metadata_df_ec[['Division']].copy()
# Division Name (text after the first space). The vectorised split yields NaN for a
# label that contains no space instead of raising IndexError like x.split(' ', 1)[1].
metadata_df_ec_name['Division Name'] = metadata_df_ec_name['Division'].str.split(' ', n=1).str[1]
# Div_Sec: first 3 characters
metadata_df_ec_name['Div_Sec'] = metadata_df_ec_name['Division'].str[:3]
# Division code: Div_Sec without its first character
metadata_df_ec_name['Division code'] = metadata_df_ec_name['Div_Sec'].str[1:]

# Unique values by 'Div_Sec' (the first row of each Div_Sec is kept)
metadata_df_ec_unique = metadata_df_ec_name.drop_duplicates(subset=['Div_Sec'])
print(metadata_df_ec_unique.head())

print("\nmetadata_df_ec rows no:", len(metadata_df_ec_name))

unique_Div_Sec = metadata_df_ec_name['Div_Sec'].unique()

print("\nUnique Div_Sec rows no:", len(unique_Div_Sec))

                                             Division  \
0   A01 Crop and animal production, hunting and re...   
4                            A02 Forestry and logging   
10                   C10 Manufacture of food products   
18                       C11 Manufacture of beverages   
20                        C13 Manufacture of textiles   

                                        Division Name Div_Sec Division code  
0   Crop and animal production, hunting and relate...     A01            01  
4                                Forestry and logging     A02            02  
10                       Manufacture of food products     C10            10  
18                           Manufacture of beverages     C11            11  
20                            Manufacture of textiles     C13            13  

metadata_df_ec rows no: 386

Unique Div_Sec rows no: 45


In [45]:
# Distinct division folder labels found in the metadata table
unique_divisions = metadata_df_ec['Division'].unique()

# Keep only the descriptive part of each label: everything after the first space
division_unique_ec = [label.split(' ', 1)[1] for label in unique_divisions]

# Alphabetically ordered copy for display; division_unique_ec keeps its own order
division_unique_ec_sorted = list(division_unique_ec)
division_unique_ec_sorted.sort()

print(division_unique_ec_sorted)
print ("\nUnique Division:", len(division_unique_ec_sorted))

['Accommodation', 'Activities auxiliary to financial service and insurance activities', 'Architectural and engineering activities; technical testing and analysis', 'Civil engineering', 'Computer programming, consultancy and related activities', 'Construction of buildings', 'Crop and animal production, hunting and related service activities', 'Education', 'Electricity, gas, steam and air conditioning supply', 'Financial service activities, except insurance and pension funding', 'Food and beverage service activities', 'Forestry and logging', 'Human health activities', 'Manufacture of basic metals', 'Manufacture of beverages', 'Manufacture of chemicals and chemical products', 'Manufacture of computer, electronic and optical products', 'Manufacture of electrical equipment', 'Manufacture of fabricated metal products, except machinery and equipment', 'Manufacture of food products', 'Manufacture of furniture', 'Manufacture of leather and related products', 'Manufacture of machinery and equipm

Files appended

In [46]:
import os

# Root folder of the electricity-consumption data; its sub-folders are the sections
filepath = 'C:/Users/dmarc/Desktop/EWELD_Project/EWELD/Electricity Consumption/'
# os.listdir returns entries in arbitrary order, so sort them to walk the
# sections in the same order on every run
sections = sorted(os.listdir(filepath))

### Generate dataframe with data for electricity consumption

In [47]:
import pandas as pd

dataframes = []

# Walk <filepath>/<section>/<division>/<user>.csv and load every CSV file.
# os.listdir gives no ordering guarantee, so each level is sorted explicitly;
# this makes the row order of combined_df reproducible between runs.
for section in sorted(sections):
    section_path = os.path.join(filepath, section)
    if not os.path.isdir(section_path):
        continue  # Skip stray files next to the section folders

    divisions = sorted(os.listdir(section_path))
    for division in divisions:
        division_path = os.path.join(section_path, division)
        if not os.path.isdir(division_path):
            continue  # Skip stray files next to the division folders

        files = os.listdir(division_path)
        # Numeric user order (U2 before U10), using the sort key defined at the
        # top of the notebook
        csv_files = sorted(
            (f for f in files if f.endswith('.csv')),
            key=user_number_to_sort,
        )
        for csv_file in csv_files:
            file_path = os.path.join(division_path, csv_file)
            df = pd.read_csv(file_path)
            # Tag every row with where it came from
            df['User'] = os.path.splitext(csv_file)[0]      # e.g., 'U1'
            df['Division'] = division                       # e.g., 'A02 Forestry and logging'
            df['Section'] = section                         # e.g., 'A'
            dataframes.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the folder that was searched instead.
if not dataframes:
    raise ValueError(f"No CSV files found under {filepath!r}")

# Combine all into one DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)
print(combined_df.head())

                  Time    Value User  \
0  2016-07-12 10:30:00  20.3325   U1   
1  2016-07-12 10:45:00  16.8611   U1   
2  2016-07-12 11:00:00  16.2412   U1   
3  2016-07-12 11:15:00  18.8448   U1   
4  2016-07-12 11:30:00  17.3570   U1   

                                            Division Section  
0  A01 Crop and animal production, hunting and re...       A  
1  A01 Crop and animal production, hunting and re...       A  
2  A01 Crop and animal production, hunting and re...       A  
3  A01 Crop and animal production, hunting and re...       A  
4  A01 Crop and animal production, hunting and re...       A  


In [None]:
'''działa ale za skomplikowane do przeznaczenia'''
# English: "works, but too complicated for the intended purpose".
# Everything below is a disabled, regex-based way of deriving 'Division Name',
# 'Div_Sec' and 'Division code'; none of it executes.
# import pandas as pd
# import re

# # Example data
# # combined_df = pd.DataFrame({'Division': ['A01 Crop production', 'C28 Manufacture of machinery', 'B15 Food products']})

# # Step 1: Extract Division Name (text after the first space)
# combined_df['Division Name'] = combined_df['Division'].apply(lambda x: x.split(' ', 1)[1])

# # Step 2: Extract Div_Sec (letter + digits part from the start, e.g., 'A01')
# combined_df['Div_Sec'] = combined_df['Division'].apply(
#     lambda x: re.match(r'[A-Z]\d+', x).group() if re.match(r'[A-Z]\d+', x) else ''
# )

# # Step 3: Extract Division code (only digits part from Div_Sec, e.g., '01')
# combined_df['Division code'] = combined_df['Div_Sec'].apply(
#     lambda x: re.search(r'\d+', x).group() if re.search(r'\d+', x) else ''
# )

# print(combined_df[['Division', 'Division Name', 'Div_Sec', 'Division code']].head())


                                            Division  \
0  A01 Crop and animal production, hunting and re...   
1  A01 Crop and animal production, hunting and re...   
2  A01 Crop and animal production, hunting and re...   
3  A01 Crop and animal production, hunting and re...   
4  A01 Crop and animal production, hunting and re...   

                                       Division Name Div_Sec Division code  
0  Crop and animal production, hunting and relate...     A01            01  
1  Crop and animal production, hunting and relate...     A01            01  
2  Crop and animal production, hunting and relate...     A01            01  
3  Crop and animal production, hunting and relate...     A01            01  
4  Crop and animal production, hunting and relate...     A01            01  


## Add Division Name, Div_Sec (section letter + division code, e.g. A01) and Division code

In [48]:
import pandas as pd


# Every row of a division carries the same folder label, so the derived values are
# computed once per distinct label and then mapped back onto the full frame. This
# avoids calling a Python lambda for every row and lets rows share the derived strings.
division_lookup = pd.DataFrame({'Division': combined_df['Division'].unique()})

# Division Name (text after the first space); NaN when a label has no space,
# where x.split(' ', 1)[1] would raise IndexError
division_lookup['Division Name'] = division_lookup['Division'].str.split(' ', n=1).str[1]

# Div_Sec: first 3 characters
division_lookup['Div_Sec'] = division_lookup['Division'].str[:3]

# Division code: Div_Sec without its first character
division_lookup['Division code'] = division_lookup['Div_Sec'].str[1:]

# Index by the full label so each derived column can be used as a mapping
division_lookup = division_lookup.set_index('Division')
for derived in ['Division Name', 'Div_Sec', 'Division code']:
    combined_df[derived] = combined_df['Division'].map(division_lookup[derived])

# Rename the column
combined_df = combined_df.rename(columns={'Value': 'Energy_cons [kWh]'})

# Now select and order the columns as you want
combined_ec_df = combined_df[['Division', 'Division Name', 'Div_Sec', 'Section', 'Division code', 'User', 'Time', 'Energy_cons [kWh]']]


print(combined_ec_df.head(3))

                                            Division  \
0  A01 Crop and animal production, hunting and re...   
1  A01 Crop and animal production, hunting and re...   
2  A01 Crop and animal production, hunting and re...   

                                       Division Name Div_Sec Section  \
0  Crop and animal production, hunting and relate...     A01       A   
1  Crop and animal production, hunting and relate...     A01       A   
2  Crop and animal production, hunting and relate...     A01       A   

  Division code User                 Time  Energy_cons [kWh]  
0            01   U1  2016-07-12 10:30:00            20.3325  
1            01   U1  2016-07-12 10:45:00            16.8611  
2            01   U1  2016-07-12 11:00:00            16.2412  


In [50]:
# Column labels of the final frame as a plain Python list
columns_list = list(combined_ec_df.columns)
print(columns_list)

['Division', 'Division Name', 'Div_Sec', 'Section', 'Division code', 'User', 'Time', 'Energy_cons [kWh]']


In [51]:
# Show the dtype of every column. print() puts a space between its two arguments,
# which is why the first dtype row in the output is indented by one character.
print("Data types:\n", combined_ec_df.dtypes)

Data types:
 Division              object
Division Name         object
Div_Sec               object
Section               object
Division code         object
User                  object
Time                  object
Energy_cons [kWh]    float64
dtype: object


In [38]:
# Row count of the combined frame (first entry of its shape)
num_rows = combined_ec_df.shape[0]
print(f"Number of rows: {num_rows}")

Number of rows: 55277891


### Save DataFrame combined_ec_df as CSV

In [39]:
directory = r'C:/Users/dmarc/Desktop/EWELD_Project/EWELD'
file_path = os.path.join(directory, 'appended_Electricity_Consumption.csv')

# Create the directory if it does not exist (same guard as the pickle export cell);
# to_csv does not create missing folders itself
os.makedirs(directory, exist_ok=True)

# Save the DataFrame to a CSV file; index=True writes the row index as the first column
combined_ec_df.to_csv(file_path, index=True)

### Save combined_ec_df as pickle

In [52]:
import os

directory = r'C:/Users/dmarc/Desktop/EWELD_Project/EWELD'
file_name = 'combined_ec_df.pkl'

# Make sure the target folder exists before the output path is used
os.makedirs(directory, exist_ok=True)
file_path = os.path.join(directory, file_name)

# Serialise the DataFrame to the pickle file
combined_ec_df.to_pickle(file_path)

