In [11]:
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

import config_local.local_config as local_config

In [12]:
def is_continuous(series, threshold=0.05):
    """Heuristically decide whether a numeric column is continuous.

    A column counts as continuous when it is numeric and its share of
    distinct values, ``series.nunique() / len(series)``, exceeds ``threshold``.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.
    threshold : float, default 0.05
        Ratio of distinct values to rows that must be strictly exceeded.

    Returns
    -------
    bool
        ``False`` for non-numeric, boolean or empty series; otherwise whether
        the distinct-value ratio is strictly greater than ``threshold``.
    """
    dtype = series.dtype
    # Accept every numeric dtype (int32, float32, nullable Int64, ...) rather
    # than only int64/float64. Booleans are excluded explicitly because
    # is_numeric_dtype() reports them as numeric.
    if pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
        return False  # not numeric

    n_rows = len(series)
    if n_rows == 0:
        # No rows: the ratio is undefined, so do not treat the column as continuous.
        return False

    # nunique() ignores NaN by default, while len() counts every row.
    return bool(series.nunique() / n_rows > threshold)

In [13]:
# Load the PROCESS4 train and test CSVs from the paths given in local_config.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (local_config.TRAIN_PROCESS4_CSV, local_config.TEST_PROCESS4_CSV)
)

In [14]:

# Separate the features from the `logSP` target; work on copies so the
# loaded `train` / `test` frames stay untouched.
X_train = train.drop(columns=["logSP"]).copy()
y = train["logSP"]
X_test  = test.copy()

# Not used in the visible cells; kept in case a later cell reads it.
X_rows = len(X_train)

# Classify numeric feature columns from the training data only, so the test
# set has no influence on which columns get scaled.
train_numeric = X_train.select_dtypes(include='number')
train_numeric_cont = [col for col in train_numeric.columns if is_continuous(train_numeric[col])]
# Remaining numeric columns (those not judged continuous); kept for later use.
train_numeric_desc = [col for col in train_numeric.columns if col not in train_numeric_cont]

scaler = StandardScaler()

# Fit one scaler on all continuous columns at once. StandardScaler standardises
# each feature independently, and fitting once leaves `scaler` holding the
# parameters of every scaled column instead of only the last one.
# The guard avoids fitting on an empty column selection.
if train_numeric_cont:
    X_train[train_numeric_cont] = scaler.fit_transform(X_train[train_numeric_cont])
    # Test data is transformed with the statistics learned from train.
    X_test[train_numeric_cont] = scaler.transform(X_test[train_numeric_cont])



In [15]:
# Re-attach the target column to the scaled features. Dropping any existing
# copy of the target first keeps this cell idempotent: re-running it does not
# append a duplicate column.
X_train = pd.concat([X_train.drop(columns=[y.name], errors="ignore"), y], axis=1)

In [16]:
# Only the directory is taken from the configured PROCESS5 paths; the file
# names are fixed here.
# NOTE(review): confirm whether the configured paths should be used directly.
train_out = Path(local_config.TRAIN_PROCESS5_CSV).resolve().parent / "train_process5.csv"
test_out = Path(local_config.TEST_PROCESS5_CSV).resolve().parent / "test_process5.csv"

# Persist the scaled frames without the pandas index column.
X_train.to_csv(train_out, index=False)
X_test.to_csv(test_out, index=False)

In [17]:
# Column-name sets of the loaded frames, used for the schema comparison below.
train_cols, test_cols = set(train.columns), set(test.columns)

In [18]:
# Columns present in both train and test.
common = train_cols & test_cols
print("Common columns:", common)

Common columns: {'HalfBath', 'GarageYrBlt', 'LandContour', 'Foundation', 'Neighborhood', 'HeatingQC', 'Exterior1st', 'GarageType', 'GrLivArea', 'Exterior2nd', 'OverallQual', 'OpenPorchSF', 'ExterCond', 'Utilities', 'TotRmsAbvGrd', 'Fireplaces', 'ScreenPorch', 'LotFrontage', 'Condition2', 'LotArea', 'YearRemodAdd', 'YrSold', 'BsmtExposure', 'SaleType', 'BsmtHalfBath', 'PoolArea', 'BldgType', 'Functional', 'PavedDrive', 'MiscFeature', 'GarageArea', 'MoSold', 'GarageFinish', 'BsmtQual', 'GarageCond', 'KitchenQual', 'HouseStyle', 'BsmtUnfSF', 'CentralAir', 'Street', 'Electrical', 'Heating', '3SsnPorch', 'BsmtFinType1', 'BsmtFinSF1', 'RoofStyle', 'MasVnrArea', 'BsmtFinSF2', 'KitchenAbvGr', '1stFlrSF', 'GarageCars', 'LotShape', 'FireplaceQu', 'MSSubClass', 'EnclosedPorch', '2ndFlrSF', 'PoolQC', 'TotalBsmtSF', 'RoofMatl', 'Alley', 'MSZoning', 'Fence', 'BsmtCond', 'BedroomAbvGr', 'BsmtFinType2', 'YearBuilt', 'LowQualFinSF', 'GarageQual', 'FullBath', 'Condition1', 'MasVnrType', 'ExterQual', 'Wo

In [19]:
# Columns that exist in train but not in test.
only_train = train_cols.difference(test_cols)
print("Columns only in train:", only_train)

Columns only in train: {'logSP'}


In [20]:
# Columns that exist in test but not in train.
only_test = test_cols.difference(train_cols)
print("Columns only in test:", only_test)

Columns only in test: set()
