# Preprocessing A1 worksheet

In [1]:
import pandas as pd

In [None]:
# Location of the raw A1 worksheet export
file_path = '../data/a1_worksheet.csv'

# The only series carried forward into the cleaned dataset
columns_to_keep = ["Year", "Real GDP at market prices", "Unemployment rate", "Consumer price inflation"]

# Load and clean in a single pipeline:
#   1. skip the first three lines of the file so the next line becomes the header
#      (tune `skiprows` if the worksheet layout changes)
#   2. drop the first three data rows below the header
#      (presumably units/notes rows -- confirm against the raw file)
#   3. relabel the 'Description' column as 'Year'
#   4. keep only the columns listed above
#   5. swap every ',' for '.' in string cells
#   6. swap the Unicode minus sign "−" for the ASCII hyphen-minus "-"
#   7. cast every column to a numeric dtype; any unparseable cell raises
df = (
    pd.read_csv(file_path, skiprows=3)
    .iloc[3:]
    .rename(columns={'Description': 'Year'})
    .loc[:, columns_to_keep]
    .replace(',', '.', regex=True)
    .replace({'−': '-'}, regex=True)
    .apply(pd.to_numeric, errors='raise')
)

# Keep the years 1900 through 2000 (both ends included) and renumber rows from zero
in_period = df['Year'].between(1900, 2000)
df = df[in_period].reset_index(drop=True)


In [5]:
# Sanity check: (number of rows, number of columns) of the cleaned frame
df.shape

(101, 4)

In [3]:
# Preview the first five rows of the cleaned frame
df.head()

Unnamed: 0,Year,Real GDP at market prices,Unemployment rate,Consumer price inflation
0,1900,181160.0,3.68,4.06
1,1901,184975.0,4.88,-0.3
2,1902,187757.0,5.15,0.0
3,1903,186016.0,5.6,1.1
4,1904,188156.0,6.91,-0.4


In [4]:
# Preview the last five rows of the cleaned frame
df.tail()

Unnamed: 0,Year,Real GDP at market prices,Unemployment rate,Consumer price inflation
96,1996,1243709.0,8.1,2.4
97,1997,1282602.0,6.97,1.82
98,1998,1323527.0,6.26,1.56
99,1999,1366983.0,5.98,1.33
100,2000,1418176.0,5.46,0.8


In [6]:
# Destination for the cleaned worksheet
output_file_path = '../data/a1_worksheet_cleaned.csv'

# Write the frame as CSV, leaving out the row index
df.to_csv(path_or_buf=output_file_path, index=False)