In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

In [None]:
def load_calendar_data(file_path, sheet_name):
    return pd.read_excel(file_path, sheet_name=sheet_name)


In [None]:
def split_data(calendar_df, clndr_data_col):
    cal_data_columns = ["clndr_id", "Data", "type", "Date", "Hours", "Key"]
    cal_data_df = pd.DataFrame(columns=cal_data_columns)

    for i in range(len(calendar_df)):
        split_string = calendar_df.iloc[i, clndr_data_col]
        split_target = split_string.split('(0||')
        
        new_rows = pd.DataFrame({
            "clndr_id": [calendar_df.iloc[i, 1]] * len(split_target),  # Note the change to column index 1
            "Data": split_target
        })
        
        cal_data_df = pd.concat([cal_data_df, new_rows], ignore_index=True)

    cal_data_df.reset_index(drop=True, inplace=True)
    return cal_data_df


In [None]:
def classify_data(cal_data_df):
    def classify(data):
        if "DaysOfWeek" in data:
            return "DaysOfWeek"
        elif "Exceptions" in data:
            return "Exceptions"
        else:
            return np.nan

    cal_data_df['type'] = cal_data_df['Data'].apply(classify)
    return cal_data_df


In [93]:
def extract_date_and_hours(row):
    data = row['Data']
    row_type = row['type']
    
    date = np.nan
    hours = np.nan

    if row_type == "DaysOfWeek" and data.startswith("()"):
        date = data[1]
    elif row_type == "Exceptions" and "(d|)" in data:
        date = data.split("(d|")[1][:5]

    if "s|" in data and "f|" in data:
        s_index = data.index("s|") + 2
        f_index = data.index("f|") + 2
        start_time_str = data[s_index:s_index + 5].replace(":", "").zfill(4)
        end_time_str = data[f_index:f_index + 5].replace(":", "").zfill(4)

        # Ensure the extracted strings are valid time formats
        if len(start_time_str) == 4 and len(end_time_str) == 4:
            try:
                start_time = datetime.strptime(start_time_str, '%H%M')
                end_time = datetime.strptime(end_time_str, '%H%M')
                if start_time == end_time:
                    hours = 24
                else:
                    hours = (end_time - start_time).seconds / 3600  # Convert to hours
                    if start_time > end_time:
                        hours = 24 - hours
            except ValueError:
                # Handle the case where time conversion fails
                pass

    return pd.Series([date, hours])


In [None]:
def populate_date_and_hours(cal_data_df):
    cal_data_df[['Date', 'Hours']] = cal_data_df.apply(extract_date_and_hours, axis=1)

    cal_data_df['type'].fillna(method='ffill', inplace=True)

    for i in range(1, len(cal_data_df)):
        if pd.isna(cal_data_df.at[i, 'Date']) and cal_data_df.at[i, 'type'] == cal_data_df.at[i-1, 'type'] and \
           cal_data_df.at[i, 'clndr_id'] == cal_data_df.at[i-1, 'clndr_id'] and "VIEW(ShowTotal|Y)" not in cal_data_df.at[i, 'Data']:
            cal_data_df.at[i, 'Date'] = cal_data_df.at[i-1, 'Date']

    return cal_data_df


In [100]:
def main():
    file_path = 'TST00.xlsx'
    sheet_name = 'CALENDAR'
    
    calendar_df = load_calendar_data(file_path, sheet_name)
    clndr_data_col = calendar_df.columns.get_loc('clndr_data')
    
    cal_data_df = split_data(calendar_df, clndr_data_col)
    cal_data_df = classify_data(cal_data_df)
    cal_data_df = populate_date_and_hours(cal_data_df)


    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    
    
    print(cal_data_df)

if __name__ == "__main__":
    main()


   clndr_id                           Data        type  Date  Hours  Key
0       993                                        NaN   NaN    NaN  NaN
1       993            CalendarData()(           NaN   NaN    NaN  NaN
2       993            DaysOfWeek()(      DaysOfWeek   NaN    NaN  NaN
3       993                   1()(        DaysOfWeek   NaN    NaN  NaN
4       993  0(s|07:00|f|17:00)())))      DaysOfWeek   NaN   10.0  NaN
5       993                   2()(        DaysOfWeek   NaN    NaN  NaN
6       993  0(s|07:00|f|17:00)())))      DaysOfWeek   NaN   10.0  NaN
7       993                   3()(        DaysOfWeek   NaN    NaN  NaN
8       993  0(s|07:00|f|17:00)())))      DaysOfWeek   NaN   10.0  NaN
9       993                   4()(        DaysOfWeek   NaN    NaN  NaN
10      993  0(s|07:00|f|17:00)())))      DaysOfWeek   NaN   10.0  NaN
11      993                   5()(        DaysOfWeek   NaN    NaN  NaN
12      993  0(s|07:00|f|17:00)())))      DaysOfW

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cal_data_df['type'].fillna(method='ffill', inplace=True)
  cal_data_df['type'].fillna(method='ffill', inplace=True)


In [96]:
cal_data_df

NameError: name 'cal_data_df' is not defined