<a href="https://colab.research.google.com/github/Chetanop2/Celebel_int/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

data = {
    'LotArea': [1000, 1500, 2000, np.nan, 2500, 3000, 1200, 1800, 2200, 2800],
    'GrLivArea': [1200, 1500, 1800, 1300, np.nan, 2000, 1400, 1600, 1900, 2100],
    'GarageCars': [2, 1, 2, 0, 3, 2, np.nan, 1, 2, 3],
    'TotalBsmtSF': [800, 1000, 1200, 900, 1100, np.nan, 950, 1050, 1250, 1300],
    'BedroomAbvGr': [3, 2, 3, 2, 4, 3, 2, 3, 4, 3],
    'FullBath': [2, 1, 2, 1, 2, 2, 1, 2, 2, 2],
    'YearBuilt': [2000, 1980, 2010, 1995, 2005, 1975, 2008, 1990, 2015, 1988],
    'Neighborhood': ['NAmes', 'CollgCr', 'OldTown', 'NAmes', 'Somerst', 'OldTown', 'NAmes', 'CollgCr', 'Somerst', 'OldTown'],
    'HouseStyle': ['1Story', '2Story', '1Story', 'SFoyer', '2Story', '1Story', '2Story', '1Story', 'SFoyer', '2Story'],
    'PoolQC': [np.nan, np.nan, 'Ex', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], # Example of a feature with many NaNs
    'SalePrice': [150000, 180000, 220000, 160000, 250000, 200000, 170000, 195000, 230000, 210000]
}
df = pd.DataFrame(data)

print("Original DataFrame head:")
print(df.head())
print("\nOriginal DataFrame info:")
df.info()
print("\nOriginal DataFrame missing values:")
print(df.isnull().sum())

X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

print(f"\nNumerical Features: {numerical_features}")
print(f"Categorical Features: {categorical_features}")

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough' # Keep any other columns not specified (if any)
)

# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

# Get feature names after one-hot encoding for categorical features
# This helps in recreating a DataFrame with meaningful column names
ohe_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)
all_feature_names = numerical_features + list(ohe_feature_names)

# Convert processed data back to DataFrame for easier inspection and further engineering
X_processed_df = pd.DataFrame(X_processed, columns=all_feature_names)

print("\nProcessed DataFrame head (after initial preprocessing):")
print(X_processed_df.head())
print("\nProcessed DataFrame info (after initial preprocessing):")
X_processed_df.info()

X_fe = X.copy()

X_fe['TotalRooms'] = X_fe['BedroomAbvGr'].fillna(0) + X_fe['FullBath'].fillna(0)

# Price per square foot (if we consider it as a feature for prediction, otherwise it's target related)
# If GrLivArea can be 0, handle division by zero
X_fe['AreaPerRoom'] = X_fe['GrLivArea'] / (X_fe['BedroomAbvGr'] + X_fe['FullBath']).replace(0, np.nan)
X_fe['AreaPerRoom'] = X_fe['AreaPerRoom'].fillna(0) # Fill NaNs created by division by zero

# Age of the house
current_year = 2025 # Use a realistic current year
X_fe['HouseAge'] = current_year - X_fe['YearBuilt']
X_fe['HouseAge'].fillna(X_fe['HouseAge'].median(), inplace=True) # Impute if YearBuilt had NaNs

X_fe['GarageToLivingRatio'] = (X_fe['GarageCars'] * 200) / X_fe['GrLivArea'].replace(0, np.nan) # Assuming 200 sqft per car
X_fe['GarageToLivingRatio'] = X_fe['GarageToLivingRatio'].fillna(0) # Fill NaNs

# Re-identify numerical and categorical features after feature engineering
numerical_features_fe = X_fe.select_dtypes(include=np.number).columns.tolist()
categorical_features_fe = X_fe.select_dtypes(include='object').columns.tolist()

preprocessor_fe = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features_fe), # Re-use numerical pipeline
        ('cat', categorical_transformer, categorical_features_fe) # Re-use categorical pipeline
    ],
    remainder='passthrough'
)

# Apply preprocessing to the feature-engineered data
X_final_processed = preprocessor_fe.fit_transform(X_fe)

# Get feature names for the final processed DataFrame
ohe_feature_names_final = preprocessor_fe.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features_fe)
all_feature_names_final = numerical_features_fe + list(ohe_feature_names_final)

X_final_processed_df = pd.DataFrame(X_final_processed, columns=all_feature_names_final)

print("\nDataFrame head after Feature Engineering and final Preprocessing:")
print(X_final_processed_df.head())
print("\nDataFrame info after Feature Engineering and final Preprocessing:")
X_final_processed_df.info()

# --- 5. Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X_final_processed_df, y, test_size=0.2, random_state=42)

print(f"\nShape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

# Now X_train, X_test, y_train, y_test are ready for model training.

Original DataFrame head:
   LotArea  GrLivArea  GarageCars  TotalBsmtSF  BedroomAbvGr  FullBath  \
0   1000.0     1200.0         2.0        800.0             3         2   
1   1500.0     1500.0         1.0       1000.0             2         1   
2   2000.0     1800.0         2.0       1200.0             3         2   
3      NaN     1300.0         0.0        900.0             2         1   
4   2500.0        NaN         3.0       1100.0             4         2   

   YearBuilt Neighborhood HouseStyle PoolQC  SalePrice  
0       2000        NAmes     1Story    NaN     150000  
1       1980      CollgCr     2Story    NaN     180000  
2       2010      OldTown     1Story     Ex     220000  
3       1995        NAmes     SFoyer    NaN     160000  
4       2005      Somerst     2Story    NaN     250000  

Original DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------      

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_fe['HouseAge'].fillna(X_fe['HouseAge'].median(), inplace=True) # Impute if YearBuilt had NaNs
