# All

In [1]:
import pandas as pd
import numpy as np

# Read the raw dataset from disk
file_path = 'dataset.csv'  # Point this at your own dataset
data = pd.read_csv(file_path, low_memory=False)

# Show the header so the column names used below can be checked by eye
print("Column names in the dataset:")
print(data.columns.tolist())

# Customer identifier column. It is reset to None when the file lacks it,
# which makes the `if id_column:` check further down window the whole table.
id_column = 'Customer No'
if id_column not in data.columns:
    print(f"'{id_column}' column not found in data.")
    id_column = None
else:
    print(f"'{id_column}' column found in data.")

# Remove the comment column; the misspelled 'cmment' variant is tried as well
columns_to_delete = ['comment', 'cmment']
columns_present = [name for name in columns_to_delete if name in data.columns]
if not columns_present:
    data_cleaned = data
    print("No columns dropped.")
else:
    data_cleaned = data.drop(columns=columns_present)
    print(f"Columns dropped: {columns_present}")

# Persist the cleaned table
data_cleaned.to_csv('original.csv', index=False)

# Energy columns for the "All" run: every column whose name contains 'unit'
energy_columns = [name for name in data_cleaned.columns if 'unit' in name]
print(f"Energy columns selected: {energy_columns}")

# Stop early when the name filter matched nothing
if len(energy_columns) == 0:
    raise ValueError("No energy columns found. Please adjust the column selection criteria.")

window_size = 4  # Number of consecutive rows combined into one window

# Define the sliding window function
def sliding_window_with_padding(df, window_size):
    """Turn each row of ``df`` into one wide row holding it and the rows before it.

    Row ``i`` of the result is rows ``i - window_size + 1 .. i`` of ``df`` laid
    side by side, oldest first. Positions before the first row are filled with
    NaN, so the result always has as many rows as ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order they should be windowed; the index is ignored.
    window_size : int
        Number of consecutive rows per window. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        ``len(df)`` rows and ``window_size * len(df.columns)`` columns named
        ``"<col>_t-<lag>"``. Lag ``window_size - 1`` is the oldest row of the
        window and lag 0 is the current row.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    # A window of zero or fewer rows has no meaning; fail loudly instead of
    # returning a frame without columns or ragged rows.
    if window_size < 1:
        raise ValueError("window_size must be at least 1.")

    # Oldest lag first, matching the left-to-right order of a flattened window
    column_names = [
        f"{col}_t-{lag}"
        for lag in range(window_size - 1, -1, -1)
        for col in df.columns
    ]

    # Pad the DataFrame with NaNs at the beginning
    padded_df = pd.DataFrame(np.nan, index=range(window_size - 1), columns=df.columns)
    df_padded = pd.concat([padded_df, df], ignore_index=True)

    # Convert to an array once, then place `window_size` shifted slices side by
    # side: row i of the result is padded rows i .. i + window_size - 1 flattened.
    padded_values = df_padded.values
    n_rows = len(df)
    windows = np.hstack(
        [padded_values[offset:offset + n_rows] for offset in range(window_size)]
    )

    # Hand pandas a list of rows (not the 2-D array) so the DataFrame is built
    # from the same kind of input as before: a list of 1-D window arrays.
    sliding_df = pd.DataFrame(list(windows), columns=column_names)

    return sliding_df

# Build the sliding-window table
if not id_column:
    # No customer identifier available: window over the whole table at once
    df = data_cleaned[energy_columns]
    sliding_df = sliding_window_with_padding(df, window_size)
else:
    # Window each customer's rows on their own so no window mixes customers.
    # NOTE(review): with pandas defaults, groupby orders groups by key and
    # skips rows whose key is missing — confirm that is intended here.
    sliding_dfs = []
    grouped = data_cleaned.groupby(id_column)
    for customer_id, group in grouped:
        group = group.reset_index(drop=True)
        # Window only the energy columns
        df = group[energy_columns]
        sliding_df = sliding_window_with_padding(df, window_size)
        # Tag every window row with the customer it belongs to
        sliding_dfs.append(sliding_df.assign(**{id_column: customer_id}))
    # Stack the per-customer frames into one table
    sliding_df = pd.concat(sliding_dfs, ignore_index=True)

# Write the windowed table next to the notebook
output_file = 'modi_data.csv'
sliding_df.to_csv(output_file, index=False)

# Tell the user where both files went
print(f"Sliding window data has been saved to {output_file}")
print("Cleaned original data has been saved to 'original.csv'")


Column names in the dataset:
['Customer No', 'post_21_july_unit', 'post_21_august_unit', 'post_21_sep_unit', 'post_21_oct_unit', 'post_21_nov_unit', 'post_21_dec_unit', 'post_22_jan_unit', 'post_22_feb_unit', 'post_22_mar_unit', 'post_22_april_unit', 'post_22_may_unit', 'pre_22_june_unit', 'pre_22_july_unit', 'pre_22_aug_unit', 'pre_22_sep_unit', 'pre_22_oct_unit', 'pre_22_nov_unit', 'pre_22_dec_unit', 'pre_23_jan_unit', 'pre_23_feb_unit', 'pre_23_mar_unit', 'pre_23_apr_unit', 'pre_23_may_unit', 'pre_23_june_unit', 'pre_23_jul_unit', 'pre_23_aug_unit', 'pre_23_sep_unit', 'pre_23_oct_unit', 'pre_23_nov_unit', 'pre_23_dec_unit', 'cmment']
'Customer No' column found in data.
Columns dropped: ['cmment']
Energy columns selected: ['post_21_july_unit', 'post_21_august_unit', 'post_21_sep_unit', 'post_21_oct_unit', 'post_21_nov_unit', 'post_21_dec_unit', 'post_22_jan_unit', 'post_22_feb_unit', 'post_22_mar_unit', 'post_22_april_unit', 'post_22_may_unit', 'pre_22_june_unit', 'pre_22_july_unit',

## Postpaid

In [2]:
import pandas as pd
import numpy as np

# Read the raw dataset from disk
file_path = 'dataset.csv'  # Point this at your own dataset
data = pd.read_csv(file_path, low_memory=False)

# Show the header so the column names used below can be checked by eye
print("Column names in the dataset:")
print(data.columns.tolist())

# Customer identifier column. It is reset to None when the file lacks it,
# which makes the `if id_column:` check further down window the whole table.
id_column = 'Customer No'
if id_column not in data.columns:
    print(f"'{id_column}' column not found in data.")
    id_column = None
else:
    print(f"'{id_column}' column found in data.")

# Remove the comment column; the misspelled 'cmment' variant is tried as well
columns_to_delete = ['comment', 'cmment']
columns_present = [name for name in columns_to_delete if name in data.columns]
if not columns_present:
    data_cleaned = data
    print("No columns dropped.")
else:
    data_cleaned = data.drop(columns=columns_present)
    print(f"Columns dropped: {columns_present}")

# Persist the cleaned table
data_cleaned.to_csv('original.csv', index=False)

# Energy columns for the postpaid run: names containing both 'unit' and 'post'
energy_columns = [
    name for name in data_cleaned.columns
    if 'unit' in name and 'post' in name
]
print(f"Energy columns selected: {energy_columns}")

# Stop early when the name filter matched nothing
if len(energy_columns) == 0:
    raise ValueError("No energy columns found. Please adjust the column selection criteria.")

window_size = 4  # Number of consecutive rows combined into one window

# Define the sliding window function
def sliding_window_with_padding(df, window_size):
    """Turn each row of ``df`` into one wide row holding it and the rows before it.

    Row ``i`` of the result is rows ``i - window_size + 1 .. i`` of ``df`` laid
    side by side, oldest first. Positions before the first row are filled with
    NaN, so the result always has as many rows as ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order they should be windowed; the index is ignored.
    window_size : int
        Number of consecutive rows per window. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        ``len(df)`` rows and ``window_size * len(df.columns)`` columns named
        ``"<col>_t-<lag>"``. Lag ``window_size - 1`` is the oldest row of the
        window and lag 0 is the current row.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    # A window of zero or fewer rows has no meaning; fail loudly instead of
    # returning a frame without columns or ragged rows.
    if window_size < 1:
        raise ValueError("window_size must be at least 1.")

    # Oldest lag first, matching the left-to-right order of a flattened window
    column_names = [
        f"{col}_t-{lag}"
        for lag in range(window_size - 1, -1, -1)
        for col in df.columns
    ]

    # Pad the DataFrame with NaNs at the beginning
    padded_df = pd.DataFrame(np.nan, index=range(window_size - 1), columns=df.columns)
    df_padded = pd.concat([padded_df, df], ignore_index=True)

    # Convert to an array once, then place `window_size` shifted slices side by
    # side: row i of the result is padded rows i .. i + window_size - 1 flattened.
    padded_values = df_padded.values
    n_rows = len(df)
    windows = np.hstack(
        [padded_values[offset:offset + n_rows] for offset in range(window_size)]
    )

    # Hand pandas a list of rows (not the 2-D array) so the DataFrame is built
    # from the same kind of input as before: a list of 1-D window arrays.
    sliding_df = pd.DataFrame(list(windows), columns=column_names)

    return sliding_df

# Build the sliding-window table
if not id_column:
    # No customer identifier available: window over the whole table at once
    df = data_cleaned[energy_columns]
    sliding_df = sliding_window_with_padding(df, window_size)
else:
    # Window each customer's rows on their own so no window mixes customers.
    # NOTE(review): with pandas defaults, groupby orders groups by key and
    # skips rows whose key is missing — confirm that is intended here.
    sliding_dfs = []
    grouped = data_cleaned.groupby(id_column)
    for customer_id, group in grouped:
        group = group.reset_index(drop=True)
        # Window only the energy columns
        df = group[energy_columns]
        sliding_df = sliding_window_with_padding(df, window_size)
        # Tag every window row with the customer it belongs to
        sliding_dfs.append(sliding_df.assign(**{id_column: customer_id}))
    # Stack the per-customer frames into one table
    sliding_df = pd.concat(sliding_dfs, ignore_index=True)

# Write the windowed table next to the notebook
output_file = 'post_modi_data.csv'
sliding_df.to_csv(output_file, index=False)

# Tell the user where both files went
print(f"Sliding window data has been saved to {output_file}")
print("Cleaned original data has been saved to 'original.csv'")


Column names in the dataset:
['Customer No', 'post_21_july_unit', 'post_21_august_unit', 'post_21_sep_unit', 'post_21_oct_unit', 'post_21_nov_unit', 'post_21_dec_unit', 'post_22_jan_unit', 'post_22_feb_unit', 'post_22_mar_unit', 'post_22_april_unit', 'post_22_may_unit', 'pre_22_june_unit', 'pre_22_july_unit', 'pre_22_aug_unit', 'pre_22_sep_unit', 'pre_22_oct_unit', 'pre_22_nov_unit', 'pre_22_dec_unit', 'pre_23_jan_unit', 'pre_23_feb_unit', 'pre_23_mar_unit', 'pre_23_apr_unit', 'pre_23_may_unit', 'pre_23_june_unit', 'pre_23_jul_unit', 'pre_23_aug_unit', 'pre_23_sep_unit', 'pre_23_oct_unit', 'pre_23_nov_unit', 'pre_23_dec_unit', 'cmment']
'Customer No' column found in data.
Columns dropped: ['cmment']
Energy columns selected: ['post_21_july_unit', 'post_21_august_unit', 'post_21_sep_unit', 'post_21_oct_unit', 'post_21_nov_unit', 'post_21_dec_unit', 'post_22_jan_unit', 'post_22_feb_unit', 'post_22_mar_unit', 'post_22_april_unit', 'post_22_may_unit']
Sliding window data has been saved to p

## Prepaid

In [3]:
import pandas as pd
import numpy as np

# Read the raw dataset from disk
file_path = 'dataset.csv'  # Point this at your own dataset
data = pd.read_csv(file_path, low_memory=False)

# Show the header so the column names used below can be checked by eye
print("Column names in the dataset:")
print(data.columns.tolist())

# Customer identifier column. It is reset to None when the file lacks it,
# which makes the `if id_column:` check further down window the whole table.
id_column = 'Customer No'
if id_column not in data.columns:
    print(f"'{id_column}' column not found in data.")
    id_column = None
else:
    print(f"'{id_column}' column found in data.")

# Remove the comment column; the misspelled 'cmment' variant is tried as well
columns_to_delete = ['comment', 'cmment']
columns_present = [name for name in columns_to_delete if name in data.columns]
if not columns_present:
    data_cleaned = data
    print("No columns dropped.")
else:
    data_cleaned = data.drop(columns=columns_present)
    print(f"Columns dropped: {columns_present}")

# Persist the cleaned table
data_cleaned.to_csv('original.csv', index=False)

# Energy columns for the prepaid run: names containing both 'unit' and 'pre'
energy_columns = [
    name for name in data_cleaned.columns
    if 'unit' in name and 'pre' in name
]
print(f"Energy columns selected: {energy_columns}")

# Stop early when the name filter matched nothing
if len(energy_columns) == 0:
    raise ValueError("No energy columns found. Please adjust the column selection criteria.")

window_size = 4  # Number of consecutive rows combined into one window

# Define the sliding window function
def sliding_window_with_padding(df, window_size):
    """Turn each row of ``df`` into one wide row holding it and the rows before it.

    Row ``i`` of the result is rows ``i - window_size + 1 .. i`` of ``df`` laid
    side by side, oldest first. Positions before the first row are filled with
    NaN, so the result always has as many rows as ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order they should be windowed; the index is ignored.
    window_size : int
        Number of consecutive rows per window. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        ``len(df)`` rows and ``window_size * len(df.columns)`` columns named
        ``"<col>_t-<lag>"``. Lag ``window_size - 1`` is the oldest row of the
        window and lag 0 is the current row.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    # A window of zero or fewer rows has no meaning; fail loudly instead of
    # returning a frame without columns or ragged rows.
    if window_size < 1:
        raise ValueError("window_size must be at least 1.")

    # Oldest lag first, matching the left-to-right order of a flattened window
    column_names = [
        f"{col}_t-{lag}"
        for lag in range(window_size - 1, -1, -1)
        for col in df.columns
    ]

    # Pad the DataFrame with NaNs at the beginning
    padded_df = pd.DataFrame(np.nan, index=range(window_size - 1), columns=df.columns)
    df_padded = pd.concat([padded_df, df], ignore_index=True)

    # Convert to an array once, then place `window_size` shifted slices side by
    # side: row i of the result is padded rows i .. i + window_size - 1 flattened.
    padded_values = df_padded.values
    n_rows = len(df)
    windows = np.hstack(
        [padded_values[offset:offset + n_rows] for offset in range(window_size)]
    )

    # Hand pandas a list of rows (not the 2-D array) so the DataFrame is built
    # from the same kind of input as before: a list of 1-D window arrays.
    sliding_df = pd.DataFrame(list(windows), columns=column_names)

    return sliding_df

# Build the sliding-window table
if not id_column:
    # No customer identifier available: window over the whole table at once
    df = data_cleaned[energy_columns]
    sliding_df = sliding_window_with_padding(df, window_size)
else:
    # Window each customer's rows on their own so no window mixes customers.
    # NOTE(review): with pandas defaults, groupby orders groups by key and
    # skips rows whose key is missing — confirm that is intended here.
    sliding_dfs = []
    grouped = data_cleaned.groupby(id_column)
    for customer_id, group in grouped:
        group = group.reset_index(drop=True)
        # Window only the energy columns
        df = group[energy_columns]
        sliding_df = sliding_window_with_padding(df, window_size)
        # Tag every window row with the customer it belongs to
        sliding_dfs.append(sliding_df.assign(**{id_column: customer_id}))
    # Stack the per-customer frames into one table
    sliding_df = pd.concat(sliding_dfs, ignore_index=True)

# Write the windowed table next to the notebook
output_file = 'pre_modi_data.csv'
sliding_df.to_csv(output_file, index=False)

# Tell the user where both files went
print(f"Sliding window data has been saved to {output_file}")
print("Cleaned original data has been saved to 'original.csv'")


Column names in the dataset:
['Customer No', 'post_21_july_unit', 'post_21_august_unit', 'post_21_sep_unit', 'post_21_oct_unit', 'post_21_nov_unit', 'post_21_dec_unit', 'post_22_jan_unit', 'post_22_feb_unit', 'post_22_mar_unit', 'post_22_april_unit', 'post_22_may_unit', 'pre_22_june_unit', 'pre_22_july_unit', 'pre_22_aug_unit', 'pre_22_sep_unit', 'pre_22_oct_unit', 'pre_22_nov_unit', 'pre_22_dec_unit', 'pre_23_jan_unit', 'pre_23_feb_unit', 'pre_23_mar_unit', 'pre_23_apr_unit', 'pre_23_may_unit', 'pre_23_june_unit', 'pre_23_jul_unit', 'pre_23_aug_unit', 'pre_23_sep_unit', 'pre_23_oct_unit', 'pre_23_nov_unit', 'pre_23_dec_unit', 'cmment']
'Customer No' column found in data.
Columns dropped: ['cmment']
Energy columns selected: ['pre_22_june_unit', 'pre_22_july_unit', 'pre_22_aug_unit', 'pre_22_sep_unit', 'pre_22_oct_unit', 'pre_22_nov_unit', 'pre_22_dec_unit', 'pre_23_jan_unit', 'pre_23_feb_unit', 'pre_23_mar_unit', 'pre_23_apr_unit', 'pre_23_may_unit', 'pre_23_june_unit', 'pre_23_jul_uni