In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Input CSV for the ETL run; the relative path is resolved against the
# current working directory.
file_path = 'data.csv.csv'

# Parse the whole file into a DataFrame using pandas' default CSV settings.
df = pd.read_csv(filepath_or_buffer=file_path)


def summarize_data(df: pd.DataFrame) -> None:
    """Print a quick textual overview of *df* to standard output.

    The report has three parts: the column/dtype summary produced by
    ``DataFrame.info()``, the number of missing values per column, and the
    first rows of the frame.

    Args:
        df: The DataFrame to describe. It is not modified.
    """
    print("Data Summary:")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() -- doing so emits a stray "None" line.
    df.info()
    print("\nMissing Values:\n", df.isnull().sum())
    print("\nFirst Few Rows:\n", df.head())

# Print an overview of the raw data before any transformation.
summarize_data(df)

# Columns without a single observed value (Series_title_4 / Series_title_5 in
# this dataset) carry no information and have no mean or mode to impute, so
# they are excluded from the feature lists up front rather than being handed
# to the imputers. ColumnTransformer drops unlisted columns by default.
non_empty_columns = df.columns[df.notna().any()]
candidate_df = df[non_empty_columns]

num_features = candidate_df.select_dtypes(include=['int64', 'float64']).columns
cat_features = candidate_df.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the column mean, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

# transformation
df_transformed = preprocessor.fit_transform(df)

# transformed data to DataFrame
# Keep the generated feature names so the saved file has meaningful headers
# instead of positional integers.
feature_names = preprocessor.get_feature_names_out()
if hasattr(df_transformed, 'toarray'):
    # One-hot encoding can make ColumnTransformer return a SciPy sparse
    # matrix. Passing that straight to pd.DataFrame() does not expand it into
    # numeric columns, so build a sparse-backed frame instead.
    processed_df = pd.DataFrame.sparse.from_spmatrix(
        df_transformed, columns=feature_names
    )
else:
    processed_df = pd.DataFrame(df_transformed, columns=feature_names)

# Save data
processed_df.to_csv('processed_data.csv', index=False)
print("ETL Process Completed and saved.")


Data Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23737 entries, 0 to 23736
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Series_reference  23737 non-null  object 
 1   Period            23737 non-null  float64
 2   Data_value        20898 non-null  float64
 3   Suppressed        2839 non-null   object 
 4   STATUS            23737 non-null  object 
 5   UNITS             23737 non-null  object 
 6   Magnitude         23737 non-null  int64  
 7   Subject           23737 non-null  object 
 8   Group             23737 non-null  object 
 9   Series_title_1    23737 non-null  object 
 10  Series_title_2    23737 non-null  object 
 11  Series_title_3    23737 non-null  object 
 12  Series_title_4    0 non-null      float64
 13  Series_title_5    0 non-null      float64
dtypes: float64(4), int64(1), object(9)
memory usage: 2.5+ MB
None

Missing Values:
 Series_reference        0
Period        



ETL Process Completed and saved.
