In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor # Neural Network
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [None]:
df = pd.read_csv('/content/gurgaon_properties_feature_engg_2.csv')

In [None]:
df.columns

Index(['property_type', 'sector', 'price', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'study room', 'servant room',
       'store room', 'pooja room', 'others', 'furnishing_type',
       'luxury_category', 'floor_category'],
      dtype='object')

 ### I am undoing label encoder, because, I want to create pipeline which take categorical value from use and all transformations applied then we will get the predicted value
 * Columns which have been removed while feature selection, i will also remove here
 * I used this data because , i want most of the columns where label encoder were not used  

In [None]:
df.drop(columns= ['study room', 'pooja room', 'others' ], inplace=True)

In [None]:
df.shape

(3554, 13)

In [None]:
df['furnishing_type'].value_counts()

Unnamed: 0_level_0,count
furnishing_type,Unnamed: 1_level_1
1,2374
2,995
0,185


### reverting this label encoded column back
- 0 -> unfurnished
- 1 -> semifurnished
- 2 -> furnished

In [None]:
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [None]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3,2,2,New Property,850.0,0,0,semifurnished,Low,Low Floor
1,flat,sector 89,0.95,2,2,2,New Property,1226.0,1,0,semifurnished,Low,Mid Floor
2,flat,sohna road,0.32,2,2,1,New Property,1000.0,0,0,semifurnished,Low,High Floor
3,flat,sector 92,1.6,3,4,3+,Relatively New,1615.0,1,0,furnished,High,Mid Floor
4,flat,sector 102,0.48,2,2,1,Relatively New,582.0,0,1,semifurnished,High,Mid Floor


In [None]:
df['furnishing_type'].value_counts()

Unnamed: 0_level_0,count
furnishing_type,Unnamed: 1_level_1
semifurnished,2374
furnished,995
unfurnished,185


In [None]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3,2,2,New Property,850.0,0,0,semifurnished,Low,Low Floor
1,flat,sector 89,0.95,2,2,2,New Property,1226.0,1,0,semifurnished,Low,Mid Floor
2,flat,sohna road,0.32,2,2,1,New Property,1000.0,0,0,semifurnished,Low,High Floor
3,flat,sector 92,1.6,3,4,3+,Relatively New,1615.0,1,0,furnished,High,Mid Floor
4,flat,sector 102,0.48,2,2,1,Relatively New,582.0,0,1,semifurnished,High,Mid Floor


## Now, we will move forward to model selection

In [None]:
X = df.drop(columns=['price'])
y = df['price']

In [None]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### **Transformations for All Algorithms?**  
The preprocessing steps you’ve defined (`StandardScaler` for numerical and `OrdinalEncoder` for categorical) are **not universally optimal** for all algorithms. Here’s why:

#### **1. Numerical Features (`StandardScaler`)**  
- **Good for:**  
  - Linear models (Linear Regression, SVM, Logistic Regression) → Need scaled features.  
  - Neural Networks → Sensitive to input scale.  
  - Distance-based algorithms (KNN, K-Means) → Rely on Euclidean distance.  
- **Bad for:**  
  - **Tree-based models (Decision Trees, Random Forest, XGBoost)** → Scale-invariant; scaling is unnecessary.  

#### **2. Categorical Features (`OrdinalEncoder`)**  
- **Good for:**  
  - Tree-based models (they can handle ordinal relationships).  
  - If categories have a natural order (e.g., "Low", "Medium", "High").  
- **Bad for:**  
  - **Linear models/SVM** → Misinterpret ordinal numbers as weights (e.g., "High=2" ≠ 2×"Low=1").  
  - **Algorithms assuming numerical meaning** (e.g., Linear Regression).  

---

### **Better Alternatives for Categorical Encoding**  
#### **A. One-Hot Encoding (OHE)**  
- **What:** Creates binary columns (0/1) for each category.  
  - Example: "Color" → `Color_Red`, `Color_Blue`, `Color_Green`.  
- **Best for:**  
  - Linear models, SVM, Neural Networks (no ordinal assumption).  
  - Small cardinality features (<10 categories).  
- **Limitations:**  
  - **High-dimensional** if many categories (curse of dimensionality).  

#### **B. Label Encoding**  
- **What:** Assigns arbitrary numbers (e.g., "Cat"=0, "Dog"=1).  
- **Best for:**  
  - Tree-based models (if no ordinal relationship exists).  
- **Risks:**  
  - **Linear models may misinterpret** (e.g., "Dog" > "Cat" numerically).  

#### **C. Target Encoding (Mean Encoding)**  
- **What:** Replaces categories with the mean of the target variable.  
  - Example: "City" → Average house price per city.  
- **Best for:**  
  - High-cardinality features (e.g., ZIP codes).  
  - Tree-based models and gradient boosting.  
- **Risks:**  
  - Overfitting (use regularization or cross-validation).  

---

### **Recommended Preprocessing by Algorithm**  
| Algorithm          | Numerical Features | Categorical Features          |  
|--------------------|--------------------|-------------------------------|  
| **Linear Regression** | StandardScaler      | One-Hot Encoding              |  
| **SVM**             | StandardScaler      | One-Hot Encoding              |  
| **Decision Trees**  | No scaling needed   | Ordinal/Label Encoding        |  
| **Random Forest**   | No scaling needed   | Ordinal/Label/Target Encoding |  
| **Naive Bayes**     | StandardScaler      | One-Hot Encoding (for Count-Based NB) |  
| **KNN**             | StandardScaler      | One-Hot Encoding              |  

---

### **Key Takeaways**  
1. **StandardScaler is critical for linear models but useless for trees.**  
2. **OrdinalEncoder is risky for linear models** → Prefer **One-Hot Encoding**.  
3. **For tree-based models:**  
   - Numerical: No scaling needed.  
   - Categorical: Ordinal/Label/Target Encoding works.  
4. **For high-cardinality categories**, use **Target Encoding** (with caution).  


### Ordinal Encoding

In [None]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [None]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room' ]),
        ('cat', OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ), ['sector'])
    ]
)

In [None]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [None]:
scores.mean(),scores.std()

(np.float64(0.6954394713938314), np.float64(0.040822273659683994))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
y_pred = np.expm1(y_pred)

In [None]:
mean_absolute_error(np.expm1(y_test),y_pred)  # it means error of 1.03 Cr

1.0330186113612987

In [None]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_output

[['linear_reg', np.float64(0.6954394713938314), 1.0330186113612987],
 ['svr', np.float64(0.7475768929999049), 0.8701950364950094],
 ['ridge', np.float64(0.6954418309288396), 1.032851893160082],
 ['LASSO', np.float64(0.053528318552219366), 1.5324844878586519],
 ['decision tree', np.float64(0.7603885042861159), 0.6347001284509171],
 ['random forest', np.float64(0.8664056015269779), 0.5388470280418204],
 ['extra trees', np.float64(0.8469303750104084), 0.5648194604944496],
 ['gradient boosting', np.float64(0.851851640558691), 0.6192065675798667],
 ['adaboost', np.float64(0.7232213872991343), 0.8709648473867174],
 ['mlp', np.float64(0.7940318815942973), 0.7500639609480948],
 ['xgboost', np.float64(0.8705314289700791), 0.5547232382886018]]

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.866406,0.538847
10,xgboost,0.870531,0.554723
6,extra trees,0.84693,0.564819
7,gradient boosting,0.851852,0.619207
4,decision tree,0.760389,0.6347
9,mlp,0.794032,0.750064
1,svr,0.747577,0.870195
8,adaboost,0.723221,0.870965
2,ridge,0.695442,1.032852
0,linear_reg,0.695439,1.033019


### OneHotEncoding

Here we have applied one hot encoding on 3 columns
- sector
- agePossession
- furnishing_type

These 3 columns have no orders, So we did that in 3rd step

In [None]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession',
                    'furnishing_type', 'luxury_category', 'floor_category']


onehot_cols = ['sector','agePossession','furnishing_type']  # Nominal columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('ordinal', OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ), columns_to_encode),
        ('onehot', OneHotEncoder(
            drop='first',
            handle_unknown='ignore'
        ), onehot_cols)
    ],
    remainder='passthrough'
)

In [None]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [None]:
scores.mean(),scores.std()

(np.float64(0.8547976891492045), np.float64(0.015619576844302695))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
y_pred = pipeline.predict(X_test)



In [None]:
y_pred = np.expm1(y_pred)

In [None]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6502847506443611

In [None]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [None]:
model_output

[['linear_reg', np.float64(0.8547976891492045), 0.6502847506443611],
 ['svr', np.float64(0.762300914624124), 0.8362038364588023],
 ['ridge', np.float64(0.8549605180458653), 0.653598213607013],
 ['LASSO', np.float64(0.05352831855221939), 1.5324844878586519],
 ['decision tree', np.float64(0.8076132330168297), 0.6662862325714992],
 ['random forest', np.float64(0.8906993569770236), 0.49661685463477173],
 ['extra trees', np.float64(0.8933719839996626), 0.4698728548952527],
 ['gradient boosting', np.float64(0.87509525344305), 0.5627395641897293],
 ['adaboost', np.float64(0.7569605991423073), 0.8463014810680926],
 ['mlp', np.float64(0.8748431646463845), 0.5536608562239778],
 ['xgboost', np.float64(0.8955370337704884), 0.47707205072317777]]

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.893372,0.469873
10,xgboost,0.895537,0.477072
5,random forest,0.890699,0.496617
9,mlp,0.874843,0.553661
7,gradient boosting,0.875095,0.56274
0,linear_reg,0.854798,0.650285
2,ridge,0.854961,0.653598
4,decision tree,0.807613,0.666286
1,svr,0.762301,0.836204
8,adaboost,0.756961,0.846301


*  Here again tree based models are performing better. But there is a significant improvement in Linear models
* MAE have reduced here significantly

## OneHotEncoding With PCA
 We have to use PCA for dimension reduction, if the columns having high cardinalities(sector) are present. After one hot encoding, the number of columns will be increased and to reduce the dimensionality, we have to use PCA (here we are keeping features which explain 95% variance)

In [None]:
# Creating a column transformer for preprocessing
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession',
                    'furnishing_type', 'luxury_category', 'floor_category']


onehot_cols = ['sector','agePossession']  # Nominal columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('ordinal', OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ), columns_to_encode),
        ('onehot', OneHotEncoder(
            drop='first',
            handle_unknown='ignore'
        ), onehot_cols)
    ],
    remainder='passthrough'
)

This pipeline converts sparse matrix data to dense format before applying PCA with a full singular value decomposition (SVD) solver. Let me break down each step:

### 1. **Preprocessor (`ColumnTransformer`)**
```python
('preprocessor', preprocessor)
```
- Applies your defined preprocessing steps
- Likely outputs a sparse matrix (from `OneHotEncoder`)
- Contains:
  - Standard scaling for numerical features
  - Ordinal encoding for ordered categoricals
  - One-hot encoding for nominal categoricals

### 2. **Sparse-to-Dense Conversion**
```python
('to_dense', FunctionTransformer(lambda x: x.toarray()))
```
- `FunctionTransformer` applies the given function to the data
- `.toarray()` converts sparse matrices to dense NumPy arrays
- **Why needed?** PCA's `'full'` solver doesn't work with sparse matrices

### 3. **PCA Dimensionality Reduction**
```python
('pca', PCA(n_components=0.95, svd_solver='full'))
```
- Keeps enough components to retain 95% variance
- Uses LAPACK's full SVD solver (most accurate but memory-intensive)
- Now works because data is dense

### 4. **Linear Regression**
```python
('regressor', LinearRegression())
```
- Standard linear model
- Works with PCA-reduced features

### Key Considerations:

**Memory Warning**:
- Converting sparse → dense may use much more memory
- If your one-hot encoded data has many categories, the dense matrix could be huge

**Better Alternatives**:
1. For sparse data, use `TruncatedSVD` instead of PCA:
   ```python
   ('svd', TruncatedSVD(n_components=100))
   ```

2. Or keep sparse format with ARPACK:
   ```python
   ('pca', PCA(n_components=0.95, svd_solver='arpack'))
   ```

3. Limit one-hot encoding cardinality first:
   ```python
   OneHotEncoder(max_categories=50)
   ```

**When This Solution Works Best**:
- Small-to-medium datasets
- When you need PCA's exact variance calculations
- When downstream steps require dense inputs

Would you like me to suggest a version optimized for your specific data size?

FunctionTransformer

In [None]:
# Creating a pipeline
from sklearn.preprocessing import FunctionTransformer
pipeline = Pipeline([
    ('preprocessor', preprocessor),
     ('to_dense', FunctionTransformer(lambda x: x.toarray())),
    ('pca', PCA(n_components=0.85, svd_solver='full')),
    ('regressor', LinearRegression())
])

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [None]:
scores.mean()

np.float64(0.05587597524494605)

In [None]:
scores.std()

np.float64(0.019221412382936627)

### using TruncatedSVD

In [None]:
from sklearn.decomposition import TruncatedSVD

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svd', TruncatedSVD(n_components=100)),  # Designed for sparse data
    ('regressor', LinearRegression())
])

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [None]:
scores.mean()

np.float64(0.841567019333508)

In [None]:
scores.std()

np.float64(0.019172452096757546)

In [None]:
def scorer(model_name, model):

    output = []

    output.append(model_name)

    pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('svd', TruncatedSVD(n_components=100)),  # Designed for sparse data
    ('regressor', model)
    ])

    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

    output.append(scores.mean())

    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

    pipeline.fit(X_train,y_train)

    y_pred = pipeline.predict(X_test)

    y_pred = np.expm1(y_pred)

    output.append(mean_absolute_error(np.expm1(y_test),y_pred))

    return output


In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.866917,0.530499
9,mlp,0.859176,0.557147
10,xgboost,0.854959,0.581291
5,random forest,0.845518,0.592555
7,gradient boosting,0.844458,0.648455
0,linear_reg,0.841139,0.670262
2,ridge,0.841662,0.67607
4,decision tree,0.655408,0.806201
1,svr,0.762084,0.836957
8,adaboost,0.708375,0.850442


## Target Encoder
### This is used for columns having high cardinality like sector column

### **Target Encoding (Mean Encoding) - Definition**
**Target Encoding** replaces categorical values with the **mean (or other statistic) of the target variable** for each category.  
- It is a type of **supervised encoding** (uses target variable information).  
- Helps in capturing **relationships between categorical features and the target**.

---

### **Where to Use Target Encoding?**
1. **High-Cardinality Categorical Features** (too many unique categories, e.g., `sector1`, `sector2`, ... `sector100`).  
   - One-hot encoding would create too many columns → **Target encoding compresses into a single numeric column**.  
2. **When you want to preserve feature importance** while reducing dimensionality.  
3. **Tree-based models (Random Forest, XGBoost, CatBoost)** often work well with target-encoded features.  

⚠️ **Caution:**  
- Can cause **overfitting** if not regularized (e.g., using smoothing or cross-validation).  
- Not ideal for **linear models** (can lead to data leakage).  

---

### **Example (Manual Calculation)**
Given data:  

| **Sector**  | **Price** |
|-------------|----------|
| sector1     | 2        |
| sector49    | 3        |
| sector1     | 4        |
| sector102   | 1        |
| sector49    | 6        |

**Step 1:** Compute **mean price per sector**  
- `sector1` → `(2 + 4)/2 = 3`  
- `sector49` → `(3 + 6)/2 = 4.5`  
- `sector102` → `1` (only one sample)  

**Step 2:** Replace categories with their mean price:  

| **Sector (Original)** | **Target-Encoded Sector** |
|----------------------|--------------------------|
| sector1              | 3.0                      |
| sector49             | 4.5                      |
| sector1              | 3.0                      |
| sector102            | 1.0                      |
| sector49             | 4.5                      |

---

### **Python Implementation**
#### **1. Using `pandas` (Simple Way)**
```python
import pandas as pd

df = pd.DataFrame({
    'Sector': ['sector1', 'sector49', 'sector1', 'sector102', 'sector49'],
    'Price': [2, 3, 4, 1, 6]
})

# Target Encoding: Mean of Price per Sector
target_encoding = df.groupby('Sector')['Price'].mean().to_dict()
df['Sector_Encoded'] = df['Sector'].map(target_encoding)

print(df)
```
**Output:**
```
     Sector  Price  Sector_Encoded
0   sector1      2             3.0
1  sector49      3             4.5
2   sector1      4             3.0
3  sector102     1             1.0
4  sector49      6             4.5
```

#### **2. Using `category_encoders` (Better for ML)**
```python
from category_encoders import TargetEncoder
import pandas as pd

df = pd.DataFrame({
    'Sector': ['sector1', 'sector49', 'sector1', 'sector102', 'sector49'],
    'Price': [2, 3, 4, 1, 6]
})

encoder = TargetEncoder()
df['Sector_Encoded'] = encoder.fit_transform(df['Sector'], df['Price'])

print(df)
```
**Output:** (Same as above, but more robust for ML pipelines.)

---

### **When NOT to Use Target Encoding?**
❌ **Small datasets** (high risk of overfitting).  
❌ **Without cross-validation** (leads to data leakage).  
❌ **For linear regression** (can distort relationships).  

✅ **Best for:**  
✔️ **Tree-based models** (XGBoost, LightGBM, CatBoost).  
✔️ **High-cardinality categorical features** (e.g., ZIP codes, product IDs).  

## **NOTE**: There is high chance of data leakage, so first do train test split then apply target encoding (in our case we are using cross validation, so intenally managed)

In [None]:
!pip install category_encoders



In [None]:
import category_encoders as ce
# Creating a column transformer for preprocessing
num_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession',
                    'furnishing_type', 'luxury_category', 'floor_category']


onehot_cols = ['agePossession']  # Nominal columns
target_cols = ['sector']

preprocessor  = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), columns_to_encode),
    ('onehot', OneHotEncoder(drop='first',handle_unknown='ignore'), onehot_cols),
    ('target', ce.TargetEncoder(), target_cols)
], remainder = 'passthrough')

In [None]:
# creating pipelines
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# K-fold cross validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [None]:
scores.mean(), scores.std()

(np.float64(0.82686288588226), np.float64(0.017412161594216293))

In [None]:
def scorer(model_name, model):
  output = []
  output.append(model_name)

  pipeline = Pipeline([
      ('preprocessor', preprocessor),
      ('regressor', model)
  ])
  #K-fold cross-validation
  k_fold = KFold(n_splits=10, shuffle= True, random_state=42)
  scores =cross_val_score(pipeline, X, y_transformed, cv=k_fold, scoring='r2')

  output.append(scores.mean())
  X_train, X_test, y_train, y_test  = train_test_split(X, y_transformed, test_size= 0.2, random_state=42)

  pipeline.fit(X_train, y_train)
  y_pred = pipeline.predict(X_test)
  y_pred = np.expm1(y_pred)

  output.append(mean_absolute_error(np.expm1(y_test), y_pred))

  return output

In [None]:
model_dict= {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree' : DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees' : ExtraTreesRegressor(),
    'gradient boosting' : GradientBoostingRegressor(),
    'adaboost' : AdaBoostRegressor(),
    'mlp' : MLPRegressor(),
    'xgboost' : XGBRegressor()
}

In [None]:
model_output = []
for model_name, model in model_dict.items():
  model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns = ['name', 'r2', 'mae'])

In [None]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.901384,0.451315
6,extra trees,0.901973,0.455168
10,xgboost,0.90346,0.471034
7,gradient boosting,0.888282,0.520395
4,decision tree,0.831606,0.594299
9,mlp,0.849495,0.596098
8,adaboost,0.815773,0.698124
0,linear_reg,0.826863,0.717066
2,ridge,0.826878,0.717655
1,svr,0.775097,0.821906


### **Calculating R² and MAE Without Train-Test Split**  

Yes, you can compute **R² and MAE without using `train_test_split`** by using **K-Fold Cross-Validation (CV)**.  

#### **How Does K-Fold CV Work?**  
- The dataset is split into **K folds** (e.g., 10 folds).  
- For each fold:  
  - **One fold** is used as the **test set**.  
  - The remaining **K-1 folds** are used as the **training set**.  
  - The model is trained on the training set and predicts on the test set.  
  - **R² and MAE** are computed for that fold's predictions.  
- Finally, the **average R² and MAE** across all folds are reported.  

#### **Do Both Approaches Use the Same Data?**  
- **Yes**, but differently:  
  - **`train_test_split`** → Uses **one fixed split** (e.g., 80% train, 20% test).  
  - **K-Fold CV** → Uses **every data point in test exactly once** across K different splits.  
- **Key Difference:**  
  - K-Fold CV gives a **more reliable estimate** because it averages performance across multiple test sets.  
  - `train_test_split` depends on a **single random split**, which may not be representative.  

#### **Which is Better?**  
- **K-Fold CV is preferred** because:  
  - It reduces **randomness** (no dependence on a single split).  
  - It uses **all data** for both training and testing (no wasted samples).  
- **`train_test_split` is useful** when:  
  - You need a **fixed holdout set** (e.g., for final model evaluation).  
  - You want **faster computation** (K-Fold is slower).  

### **Final Answer**  
- **Yes**, you can compute **R² and MAE without `train_test_split`** using **K-Fold CV**.  
- **Both methods use the same data**, but K-Fold CV evaluates performance more robustly by averaging across multiple splits.  
- **K-Fold CV is generally more trustworthy** for model evaluation.  


In [None]:
# MAE and R2 score Without using train test split

def scorer(model_name, model):
    output = []
    output.append(model_name)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])

    # K-fold cross-validation for R2
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    r2_scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    output.append(r2_scores.mean())

    # K-fold cross-validation for MAE
    mae_scores = cross_val_score(pipeline, X, y_transformed, cv=kfold,
                                scoring='neg_mean_absolute_error')
    # Convert back to positive MAE and take mean
    mae_mean = -mae_scores.mean()
    # Transform the MAE back to original scale
    output.append(np.expm1(mae_mean))

    return output

model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost': XGBRegressor()
}

model_output = []
for model_name, model in model_dict.items():
    model_output.append(scorer(model_name, model))

model_df = pd.DataFrame(model_output, columns=['name', 'r2', 'mae'])


In [None]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.901992,0.115788
5,random forest,0.90127,0.116602
10,xgboost,0.90346,0.119256
7,gradient boosting,0.888615,0.137476
4,decision tree,0.828804,0.149813
9,mlp,0.848301,0.175643
2,ridge,0.826878,0.183859
0,linear_reg,0.826863,0.183865
8,adaboost,0.818724,0.200006
1,svr,0.775097,0.213877


# **Model Performance Comparison Across Encoding Techniques**

| **Encoding Type**               | **Model**       | **R² Score** | **MAE**  |
|---------------------------------|----------------|-------------|----------|
| Ordinal Encoding                | Random Forest  | 0.866796    | 0.540972 |
| One-Hot Encoding                | Extra Trees    | 0.885351    | 0.481116 |
| One-Hot Encoding + PCA          | MLP            | 0.866482    | 0.529623 |
| **Target Encoding**             | **Random Forest** | **0.900831** | **0.116549** |

### **Conclusion: Why Target Encoding?**
- **Highest R² Score (0.9008)** → Best predictive performance.  
- **Lowest MAE (0.1165)** → Smallest prediction errors.  
- **Best for high-cardinality categorical data** (efficiently captures relationships without exploding dimensions).  

**Final Choice:** ✅ **Target Encoding** with **Random Forest** is selected for deployment due to superior accuracy and minimal error.

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'regressor__n_estimators' : [50, 100, 200, 300],
    'regressor__max_depth' : [None, 10, 20, 30],
    'regressor__min_samples_split' : [0.1, 0.25, 0.5, 1.0],
    'regressor__min_samples_leaf' : [0.1, 0.25, 0.5, 1.0],
    'regressor__max_features' : ['auto', 'sqrt', 'log2']
}

In [None]:
# Creating a column transformer for preprocessing
num_cols = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession',
                    'furnishing_type', 'luxury_category', 'floor_category']


onehot_cols = ['agePossession']  # Nominal columns
target_cols = ['sector']

preprocessor  = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1), columns_to_encode),
    ('onehot', OneHotEncoder(drop='first',handle_unknown='ignore'), onehot_cols),
    ('target', ce.TargetEncoder(), target_cols)
], remainder = 'passthrough')

### Random Forest

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [None]:
kfold = KFold(n_splits = 10, shuffle= True, random_state=42)

In [None]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 768 candidates, totalling 7680 fits


3840 fits failed out of a total of 7680.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2560 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 138

In [None]:
final_pipe = search.best_estimator_

In [None]:
search.best_params_

{'regressor__max_depth': 30,
 'regressor__max_features': 'sqrt',
 'regressor__min_samples_leaf': 0.1,
 'regressor__min_samples_split': 0.1,
 'regressor__n_estimators': 100}

In [None]:
search.best_score_

np.float64(0.6600321973386164)

In [None]:
final_pipe.fit(X,y_transformed)

### XGBoost

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor())
])

In [None]:
kfold = KFold(n_splits = 10, shuffle= True, random_state=42)

In [None]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 768 candidates, totalling 7680 fits


Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.



In [None]:
final_pipe = search.best_estimator_

In [None]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'auto',
 'regressor__min_samples_leaf': 0.1,
 'regressor__min_samples_split': 0.1,
 'regressor__n_estimators': 100}

In [None]:
search.best_score_

np.float64(0.9034603668949108)

In [None]:
final_pipe.fit(X,y_transformed)

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.



In [None]:
import pickle

with open('pipeline_xgb.pkl', 'wb') as file:
    pickle.dump(final_pipe, file)

In [None]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

### trying XGBOOST with advance hyperparameter tuning

The key differences between your original **GridSearchCV** approach and the new **Hyperopt Bayesian optimization** implementation are:

---

### **a. Search Strategy**
| **Aspect**               | **GridSearchCV**                          | **Hyperopt (Bayesian Optimization)**       |
|--------------------------|------------------------------------------|------------------------------------------|
| **Method**               | Exhaustive grid search                   | Sequential model-based optimization (TPE) |
| **Efficiency**           | Tests all combinations (slow)            | Focuses on promising params (faster)     |
| **Parameter Handling**   | Discrete values only                     | Supports continuous ranges (e.g., `0.01-0.3` for learning rate) |
| **Parallelization**      | Native (`n_jobs=-1`)                     | Requires `SparkTrials` for parallel runs |

---

### **b. Parameter Space**
#### **GridSearchCV (Original)**
```python
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    # Only discrete values allowed
}
```

#### **Hyperopt (New)**
```python
space = {
    'regressor__n_estimators': hp.choice('n_estimators', [50, 100, 200, 300]),
    'regressor__max_depth': hp.choice('max_depth', [None, 10, 20, 30]),
    'regressor__learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),  # Continuous range
    # Additional XGBoost-specific params
}
```
- **Added critical XGBoost parameters** like `learning_rate`, `subsample`, and regularization terms (`reg_alpha`, `reg_lambda`).
- **Continuous ranges** for finer tuning (e.g., `learning_rate: 0.01-0.3`).

---

### **c. Optimization Process**
#### **GridSearchCV**
- Evaluates **every combination** in `param_grid`.
- No memory of past evaluations.

#### **Hyperopt**
- Uses **Tree-structured Parzen Estimator (TPE)** to model promising regions.
- Iteratively focuses on areas with better performance.
- **Key steps:**
  ```python
  best = fmin(
      fn=objective,  # Minimizes -R²
      space=space,
      algo=tpe.suggest,
      max_evals=50,  # Stops after 50 iterations
      trials=trials,  # Tracks progress
      rstate=np.random.default_rng(42)  # Fixes randomness
  )
  ```

---

### **d. Output Differences**
| **Metric**       | **GridSearchCV**              | **Hyperopt**                     |
|------------------|-------------------------------|----------------------------------|
| **Best Params**  | From fixed grid               | Sampled from continuous ranges   |
| **Speed**        | Slower (tests all combos)     | Faster (50-100 evals typically)  |
| **Flexibility**  | Limited to predefined values  | Can explore unbounded ranges     |

---

### **Which One to Choose?**
- **Use GridSearchCV if:**
  - You have a small parameter space.
  - You need reproducibility with exact discrete values.
  
- **Use Hyperopt if:**
  - You want **faster convergence** to good parameters.
  - You need to tune **continuous hyperparameters** (e.g., `learning_rate`).
  - You’re using XGBoost/LightGBM (Bayesian optimization works exceptionally well for tree-based models).

---

- **Note:** Hyperopt may find non-intuitive combinations that perform better than grid search!



In [None]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval
from sklearn.model_selection import cross_val_score
import numpy as np

# Define the search space for XGBoost
space = {
    'regressor__n_estimators': hp.choice('n_estimators', [50, 100, 200, 300]),
    'regressor__max_depth': hp.choice('max_depth', [None, 10, 20, 30]),
    'regressor__learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    'regressor__subsample': hp.uniform('subsample', 0.6, 1.0),
    'regressor__colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'regressor__min_child_weight': hp.choice('min_child_weight', [1, 3, 5]),
    'regressor__gamma': hp.uniform('gamma', 0, 0.5),
    'regressor__reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'regressor__reg_lambda': hp.uniform('reg_lambda', 0, 1)
}

# Objective function to minimize (negative R²)
def objective(params):
    pipeline.set_params(**params)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2', n_jobs=-1)
    return {'loss': -np.mean(scores), 'status': STATUS_OK}

# Initialize pipeline (same preprocessing as before)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42))
])

# Run optimization
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,  # Number of iterations
    trials=trials,
    # Use randint instead of integers
    rstate=np.random.default_rng(42)
)

# Get best parameters
best_params = space_eval(space, best)
print("Best parameters:", best_params)

# Train final model with best params
final_pipe = pipeline.set_params(**best_params)
final_pipe.fit(X, y_transformed)

100%|██████████| 50/50 [01:20<00:00,  1.60s/trial, best loss: -0.9071473655516137]
Best parameters: {'regressor__colsample_bytree': 0.6048160948946112, 'regressor__gamma': 0.0037461518971623545, 'regressor__learning_rate': 0.03777669747358033, 'regressor__max_depth': 10, 'regressor__min_child_weight': 3, 'regressor__n_estimators': 200, 'regressor__reg_alpha': 0.7453369866793452, 'regressor__reg_lambda': 0.3529414595332569, 'regressor__subsample': 0.9997677088550361}


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

# Make predictions (remember to transform y_test back if needed)
y_pred = final_pipe.predict(X_test)

# Calculate metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")


R² Score: 0.9711
MAE: 0.0699


### Exporting the model

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ],
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [None]:
pipeline.fit(X,y_transformed)

In [None]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [None]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [None]:
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3,2,2,New Property,850.0,0,0,semifurnished,Low,Low Floor
1,flat,sector 89,2,2,2,New Property,1226.0,1,0,semifurnished,Low,Mid Floor
2,flat,sohna road,2,2,1,New Property,1000.0,0,0,semifurnished,Low,High Floor
3,flat,sector 92,3,4,3+,Relatively New,1615.0,1,0,furnished,High,Mid Floor
4,flat,sector 102,2,2,1,Relatively New,582.0,0,1,semifurnished,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 84,2,2,1,Relatively New,532.0,0,0,semifurnished,Medium,Mid Floor
3550,house,sector 109,5,5,3+,Relatively New,6228.0,1,1,semifurnished,High,Low Floor
3551,flat,sector 2,1,1,1,Moderately Old,665.0,0,0,furnished,Medium,Mid Floor
3552,house,sector 43,5,6,3,Moderately Old,5490.0,1,1,semifurnished,Medium,Mid Floor


### Trying out the predictions

In [None]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [None]:
X.iloc[0].values

array(['flat', 'sector 36', np.int64(3), np.int64(2), '2', 'New Property',
       np.float64(850.0), np.int64(0), np.int64(0), 'semifurnished',
       'Low', 'Low Floor'], dtype=object)

In [None]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [None]:
np.expm1(pipeline.predict(one_df))

array([2.69685284])

In [None]:
X.dtypes

Unnamed: 0,0
property_type,object
sector,object
bedRoom,int64
bathroom,int64
balcony,object
agePossession,object
built_up_area,float64
servant room,int64
store room,int64
furnishing_type,object


In [None]:
sorted(X['sector'].unique().tolist())

['dwarka expressway',
 'gwal pahari',
 'manesar',
 'sector 1',
 'sector 102',
 'sector 103',
 'sector 104',
 'sector 105',
 'sector 106',
 'sector 107',
 'sector 108',
 'sector 109',
 'sector 10a',
 'sector 11',
 'sector 110',
 'sector 111',
 'sector 112',
 'sector 113',
 'sector 12',
 'sector 13',
 'sector 14',
 'sector 15',
 'sector 17',
 'sector 17a',
 'sector 17b',
 'sector 2',
 'sector 21',
 'sector 22',
 'sector 23',
 'sector 24',
 'sector 25',
 'sector 26',
 'sector 27',
 'sector 28',
 'sector 3',
 'sector 3 phase 2',
 'sector 3 phase 3 extension',
 'sector 30',
 'sector 31',
 'sector 33',
 'sector 36',
 'sector 36a',
 'sector 37',
 'sector 37c',
 'sector 37d',
 'sector 38',
 'sector 39',
 'sector 4',
 'sector 40',
 'sector 41',
 'sector 43',
 'sector 45',
 'sector 46',
 'sector 47',
 'sector 48',
 'sector 49',
 'sector 5',
 'sector 50',
 'sector 51',
 'sector 52',
 'sector 53',
 'sector 54',
 'sector 55',
 'sector 56',
 'sector 57',
 'sector 58',
 'sector 59',
 'sector 6',
 'se

In [None]:
df

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3,2,2,New Property,850.0,0,0,semifurnished,Low,Low Floor
1,flat,sector 89,0.95,2,2,2,New Property,1226.0,1,0,semifurnished,Low,Mid Floor
2,flat,sohna road,0.32,2,2,1,New Property,1000.0,0,0,semifurnished,Low,High Floor
3,flat,sector 92,1.60,3,4,3+,Relatively New,1615.0,1,0,furnished,High,Mid Floor
4,flat,sector 102,0.48,2,2,1,Relatively New,582.0,0,1,semifurnished,High,Mid Floor
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,flat,sector 84,0.37,2,2,1,Relatively New,532.0,0,0,semifurnished,Medium,Mid Floor
3550,house,sector 109,6.00,5,5,3+,Relatively New,6228.0,1,1,semifurnished,High,Low Floor
3551,flat,sector 2,0.60,1,1,1,Moderately Old,665.0,0,0,furnished,Medium,Mid Floor
3552,house,sector 43,15.50,5,6,3,Moderately Old,5490.0,1,1,semifurnished,Medium,Mid Floor
