In [None]:
import pandas as pd

# Input workbook and destination for the cleaned result
file_path = 'Inflation-data.xlsx'
output_file_path = 'Cleaned_Dataset.xlsx'


def _year_columns(frame):
    """Return the labels of *frame*'s columns whose text form is all digits."""
    return [col for col in frame.columns if str(col).isdigit()]


def _interpolate_over_years(row, year_values):
    """Fill the gaps of one row with PCHIP, using the years as x-coordinates.

    Args:
        row: One row of the year columns, as a float Series.
        year_values: Integer year for each entry of ``row``, in column order.

    Returns:
        A Series with the same index as ``row``. Rows with fewer than two
        known values are returned unchanged.
    """
    series = pd.Series(row.to_numpy(dtype=float), index=year_values)
    if series.notna().sum() < 2:
        # Nothing to interpolate between; keep the row as it is.
        return row
    # Sort by year for the interpolation, then restore the original
    # column order.
    filled = series.sort_index().interpolate(method='pchip', limit_direction='both')
    return pd.Series(filled.reindex(year_values).to_numpy(), index=row.index)


def clean_inflation_data(sheet_data, min_valid_fraction=0.3):
    """Return a cleaned copy of *sheet_data*; the input frame is not modified.

    Args:
        sheet_data: DataFrame with one column per year (digit-only labels)
            plus any other descriptive columns.
        min_valid_fraction: Rows and columns whose share of non-null cells
            is at or below this fraction are removed.

    Returns:
        A new DataFrame with the note columns dropped, sparse rows and
        columns removed, and the year columns converted to floats,
        gap-filled along each row and clipped at zero.
    """
    # Step 1: Remove unnecessary columns
    cleaned = sheet_data.drop(columns=["Data source", "Note", "Note.1"], errors="ignore")

    # Step 2: Remove rows with 30% (min_valid_fraction) or less valid data
    row_threshold = int(min_valid_fraction * cleaned.shape[1])
    cleaned = cleaned[cleaned.notnull().sum(axis=1) > row_threshold]

    # Step 3: Remove columns with 30% (min_valid_fraction) or less valid data.
    # The explicit copy makes the assignments below write to an independent
    # frame instead of a slice of the filtered one (no SettingWithCopyWarning).
    col_threshold = int(min_valid_fraction * cleaned.shape[0])
    cleaned = cleaned.loc[:, cleaned.notnull().sum(axis=0) > col_threshold].copy()

    year_columns = _year_columns(cleaned)
    if not year_columns or cleaned.empty:
        return cleaned

    # Step 4: Ensure year-related data is numeric (unparseable cells become NaN)
    years = cleaned[year_columns].apply(pd.to_numeric, errors='coerce').astype(float)

    # Step 5: Fill missing values along each row's own time axis, so a gap is
    # estimated from the same row's neighbouring years rather than from the
    # unrelated rows above and below it. limit_direction='both' also fills
    # leading and trailing gaps.
    year_values = [int(str(col)) for col in year_columns]
    years = years.apply(_interpolate_over_years, axis=1, args=(year_values,))

    # Ensure no negative values.
    # NOTE(review): this also zeroes genuinely negative readings, not only
    # interpolation artefacts - confirm that is intended.
    cleaned[year_columns] = years.clip(lower=0)
    return cleaned


if __name__ == "__main__":
    # Load the dataset
    sheet_data = pd.read_excel(file_path, sheet_name='Sheet1')

    cleaned_data = clean_inflation_data(sheet_data)
    year_columns = _year_columns(cleaned_data)

    # Save the cleaned data to a new Excel file
    cleaned_data.to_excel(output_file_path, index=False)
