In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Step 1: Load the dataset
df = pd.read_csv("preprocessing_efficiency_dataset.csv")
print("Initial Dataset:\n", df.head())

# Step 2: Drop ID column (not useful for training)
df.drop("ID", axis=1, inplace=True)

# Step 3: Define target variable
# Let's predict Processing_Time_ms based on text features
X = df.drop("Processing_Time_ms", axis=1)
y = df["Processing_Time_ms"]

# Step 4: Identify column types
numerical_cols = ["Contains_Numbers", "Contains_SpecialChars", "Cleaned_Text_Length"]
categorical_cols = ["Raw_Text"]

# Step 5: Define imputers
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

# Step 6: Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', num_imputer),
            ('scaler', StandardScaler())
        ]), numerical_cols),

        ('cat', Pipeline([
            ('imputer', cat_imputer),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ]
)

# Step 7: Handle missing target values
data = pd.concat([X, y], axis=1)
data.dropna(subset=["Processing_Time_ms"], inplace=True)

# Step 8: Update X and y after dropping
X = data.drop("Processing_Time_ms", axis=1)
y = data["Processing_Time_ms"]

# Step 9: Feature Engineering – Text Length (extra feature)
X["Raw_Text_Length"] = X["Raw_Text"].str.len()

# Step 10: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 11: Fit-transform data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Final shapes
print("X_train shape:", X_train_processed.shape)
print("X_test shape:", X_test_processed.shape)
print("y_train shape:", y_train.shape)


Initial Dataset:
    ID                  Raw_Text  Contains_Numbers  Contains_SpecialChars  \
0   1     efficiency testing                    0                      0   
1   2      123Numbers and words                 0                      1   
2   3  Data   Science  is great                 0                      1   
3   4  Data   Science  is great                 0                      0   
4   5     efficiency testing                    0                      0   

   Processing_Time_ms  Cleaned_Text_Length  
0                 112                   11  
1                 189                   13  
2                 102                   28  
3                  24                    5  
4                 116                   48  
X_train shape: (80, 13)
X_test shape: (20, 13)
y_train shape: (80,)
