# Import necessary libraries

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
import warnings
# Blanket filter: every warning raised after this point is suppressed
warnings.filterwarnings('ignore')

# Plot defaults, applied in this order: base style sheet, seaborn colour
# palette, then the figure-size and font-size overrides
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("deep")
plt.rcParams.update({
    'figure.figsize': [14, 8],
    'font.size': 12,
})

# Load datasets from Google Drive

In [None]:

# Folder holding the CSV files (presumably a Colab Google Drive mount -- confirm
# the drive is mounted before running this cell)
base_path = "/content/drive/MyDrive/Design_Project/"

# Read each CSV into its own DataFrame; the variable name mirrors the file name
lfs_month = pd.read_csv(f"{base_path}lfs_month.csv")
lfs_month_sa = pd.read_csv(f"{base_path}lfs_month_sa.csv")
lfs_month_youth = pd.read_csv(f"{base_path}lfs_month_youth.csv")
lfs_month_duration = pd.read_csv(f"{base_path}lfs_month_duration.csv")
lfs_month_status = pd.read_csv(f"{base_path}lfs_month_status.csv")

# Only reached when none of the reads above raised
print("Datasets loaded successfully!")

Datasets loaded successfully!


# Check the structure of each dataset

In [None]:

def examine_dataset(df, name):
    """
    Print a plain-text structural summary of a dataframe.

    The report is written to stdout in this order: a banner with the dataset
    name, the shape, the first five rows, the column dtypes, the output of
    ``describe()``, and the per-column count of missing values.

    Parameters
    ----------
    df : the dataframe to summarise
    name : label shown in the banner

    Returns
    -------
    None
    """
    rule = '-' * 40

    # Banner and shape
    print(f"\n{rule}\nDataset: {name}\n{rule}")
    print(f"Shape: {df.shape}")

    # Each section is (heading, producer). Producers are called lazily so every
    # value is computed only after its heading has been printed.
    sections = (
        ("\nFirst 5 rows:", lambda: df.head()),
        ("\nData types:", lambda: df.dtypes),
        ("\nBasic statistics:", lambda: df.describe()),
        ("\nMissing values:", lambda: df.isnull().sum()),
    )
    for heading, produce in sections:
        print(heading)
        print(produce())

    # Closing rule followed by a blank line
    print(f"{rule}\n")

# Print the structural summary for every loaded table, in load order
examine_dataset(df=lfs_month, name="Labor Force Survey Monthly")
examine_dataset(df=lfs_month_sa, name="Labor Force Survey Monthly (Seasonally Adjusted)")
examine_dataset(df=lfs_month_youth, name="Labor Force Survey Monthly (Youth)")
examine_dataset(df=lfs_month_duration, name="Labor Force Survey Monthly (Unemployment Duration)")
examine_dataset(df=lfs_month_status, name="Labor Force Survey Monthly (Employment Status)")


----------------------------------------
Dataset: Labor Force Survey Monthly
----------------------------------------
Shape: (182, 8)

First 5 rows:
         date       lf  lf_employed  lf_unemployed  lf_outside  p_rate  \
0  2010-01-01  12367.9      11931.2          436.7      6927.5    64.1   
1  2010-02-01  12059.8      11632.3          427.4      7225.1    62.5   
2  2010-03-01  12324.2      11895.9          428.3      6993.0    63.8   
3  2010-04-01  12512.0      12133.5          378.5      6894.5    64.5   
4  2010-05-01  12197.6      11798.9          398.7      7186.0    62.9   

   ep_ratio  u_rate  
0      61.8     3.5  
1      60.3     3.5  
2      61.6     3.5  
3      62.5     3.0  
4      60.9     3.3  

Data types:
date              object
lf               float64
lf_employed      float64
lf_unemployed    float64
lf_outside       float64
p_rate           float64
ep_ratio         float64
u_rate           float64
dtype: object

Basic statistics:
                 lf   lf_em

# Data Preprocessing for Time Series Analysis

In [None]:

def preprocess_time_series(df, date_col='date'):
    """
    Return a date-indexed copy of ``df`` ready for time series analysis.

    Steps:
    - work on a copy, so the caller's dataframe is left untouched
    - parse ``date_col`` into datetime values
    - move ``date_col`` from the columns into the index
    - sort the rows by that index

    Parameters
    ----------
    df : dataframe containing a column named ``date_col``
    date_col : name of the column holding the dates (default ``'date'``)

    Returns
    -------
    A new dataframe indexed by the parsed dates and sorted by that index.
    """
    # Independent copy: the column assignment below must not leak to the input
    frame = df.copy()

    # Parse the date strings into datetime values
    frame[date_col] = pd.to_datetime(frame[date_col])

    # Promote the dates to the index and order the rows by it
    return frame.set_index(date_col).sort_index()

# Build the date-indexed version of each of the four tables; the targets on the
# left line up one-to-one, in order, with the raw frames on the right
(
    lfs_month_ts,
    lfs_month_sa_ts,
    lfs_month_youth_ts,
    lfs_month_duration_ts,
) = (
    preprocess_time_series(raw_frame)
    for raw_frame in (lfs_month, lfs_month_sa, lfs_month_youth, lfs_month_duration)
)

# The status table may carry a 'variable' column tagging each row with the
# measure it belongs to (the recorded run shows 'persons' and 'share').
# Result shape of lfs_month_status_ts:
#   - no 'variable' column, or exactly one distinct value -> a single dataframe
#   - several distinct values -> dict mapping each value to its own dataframe
if 'variable' in lfs_month_status.columns:

    # Distinct tags, computed once and reused below
    unique_vars = lfs_month_status['variable'].unique()
    print("Unique values in variable column:", unique_vars)

    if len(unique_vars) == 1:
        # A single tag carries no information, so drop the column before
        # indexing. The raw lfs_month_status frame is deliberately NOT
        # rebound here: keeping it intact means re-running this cell takes
        # the same branch and yields the same result.
        lfs_month_status_ts = preprocess_time_series(
            lfs_month_status.drop(columns='variable')
        )
    else:
        # One date-indexed frame per tag, keyed by the tag value
        lfs_month_status_ts = {}

        for var in unique_vars:
            subset = lfs_month_status[lfs_month_status['variable'] == var].drop(columns='variable')
            lfs_month_status_ts[var] = preprocess_time_series(subset)

        print(f"Created {len(unique_vars)} separate dataframes for status data")
else:
    lfs_month_status_ts = preprocess_time_series(lfs_month_status)

# Report the shape and covered date range of each date-indexed frame
# (the status data is not included in this summary)
print("Time series datasets created with datetime index")
for name, df in (
    ('lfs_month', lfs_month_ts),
    ('lfs_month_sa', lfs_month_sa_ts),
    ('lfs_month_youth', lfs_month_youth_ts),
    ('lfs_month_duration', lfs_month_duration_ts),
):
    print(f"{name}: {df.shape} rows from {df.index.min()} to {df.index.max()}")

Unique values in variable column: ['persons' 'share']
Created 2 separate dataframes for status data
Time series datasets created with datetime index
lfs_month: (182, 7) rows from 2010-01-01 00:00:00 to 2025-02-01 00:00:00
lfs_month_sa: (182, 5) rows from 2010-01-01 00:00:00 to 2025-02-01 00:00:00
lfs_month_youth: (110, 4) rows from 2016-01-01 00:00:00 to 2025-02-01 00:00:00
lfs_month_duration: (110, 7) rows from 2016-01-01 00:00:00 to 2025-02-01 00:00:00
