In [6]:
import numpy as np
import pandas as pd
import ast 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer,StandardScaler
from sklearn.metrics import r2_score

In [7]:
# Read the raw listings and append log1p(price) as a new 'log_price' column.
raw_csv_path = 'house.csv'
df = pd.read_csv(raw_csv_path)
df = df.assign(log_price=np.log1p(df['price']))

In [8]:
# ======Handling the additionalRoom column=======
# Split the comma-separated room string into a list of tokens. A missing value
# stays NaN after .str.split, and MultiLabelBinarizer iterates over every row,
# so NaN is mapped to an empty list (row gets all-zero indicators) instead of
# raising "TypeError: 'float' object is not iterable".
# NOTE(review): tokens are not whitespace-stripped, so "a, b" would yield a
# separate " b" column — confirm against the raw data before changing this,
# since later cells refer to the generated column names (e.g. 'not available').
df['additionalRoom'] = df['additionalRoom'].str.split(',').apply(
    lambda rooms: rooms if isinstance(rooms, list) else []
)
mlb = MultiLabelBinarizer()
# Reuse df's index so the column-wise concat pairs every indicator row with its
# source row even if df does not carry a default RangeIndex.
one_hot = pd.DataFrame(
    mlb.fit_transform(df['additionalRoom']),
    columns=mlb.classes_,
    index=df.index,
)
df = pd.concat([df.drop(columns='additionalRoom'), one_hot], axis=1)

# ======Handling the Rating column=======
def extract_ratings(rating_str):
    """Convert one raw ``rating`` cell into per-category integer scores.

    The cell is evaluated with ``ast.literal_eval`` and iterated; every item
    that starts with one of the four known category names contributes the
    integer that directly follows the name (e.g. ``'Safety4 out of 5'`` -> 4).
    Items for other categories are ignored.

    Parameters
    ----------
    rating_str : object
        Raw cell value, normally the string form of a list of strings.

    Returns
    -------
    pandas.Series
        Always indexed by Environment, Lifestyle, Connectivity, Safety.
        A category that is absent from the cell is reported as 0, and a cell
        that cannot be parsed at all yields 0 for every category, so "no
        rating" has a single representation and every row has the same shape.
    """
    categories = ('Environment', 'Lifestyle', 'Connectivity', 'Safety')
    ratings = dict.fromkeys(categories, 0)
    try:
        items = ast.literal_eval(rating_str)
        parsed = {}
        for item in items:
            for category in categories:
                if item.startswith(category):
                    # Text after the category name begins with the score.
                    parsed[category] = int(item.replace(category, '').split()[0])
    except (ValueError, SyntaxError, TypeError, IndexError, AttributeError):
        # Malformed cell (NaN, broken literal, non-string item, non-integer
        # score): keep the best-effort fallback of all zeros, but no longer
        # swallow unrelated exceptions such as KeyboardInterrupt.
        return pd.Series(ratings)
    ratings.update(parsed)
    return pd.Series(ratings)
    
# Expand each 'rating' cell into per-category columns, then swap the raw
# column out for those columns.
df_ratings = df['rating'].apply(extract_ratings)
df = pd.concat([df.drop(columns='rating'), df_ratings], axis=1)

# ======Handling the Balcony column=======
# The open-ended '3+' bucket is encoded as 4 so the whole column can be cast to int.
balcony_values = df['balcony'].replace({'3+': 4})
df['balcony'] = balcony_values.astype(int)

# ======Handling the furnishDetails column=======
def safe_parse_to_list(x):
    """Return ``x`` as a list, or an empty list when it cannot be one.

    Parameters
    ----------
    x : object
        A list (returned unchanged), a string holding a Python list literal
        (evaluated with ``ast.literal_eval``), or anything else.

    Returns
    -------
    list
        The list itself, the parsed list, or ``[]`` for non-string input,
        for a string that is not a valid literal, and for a string whose
        literal is not a list (e.g. ``"5"`` or ``"'abc'"``).
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the errors literal_eval is documented to raise on
        # malformed input; anything else should surface.
        return []
    # A valid literal is not necessarily a list; honour the function's contract.
    return parsed if isinstance(parsed, list) else []

def parse_items(row):
    """Turn a list of furnishing strings into an ``{item name: count}`` dict.

    Two item shapes are recognised:

    * ``'No <name>'``      -> ``name`` mapped to 0
    * ``'<count> <name>'`` -> ``name`` mapped to ``int(count)``

    Parameters
    ----------
    row : object
        Expected to be a list of strings; any other type yields ``{}``.

    Returns
    -------
    dict
        Item name -> integer count. Non-string entries, single-word entries
        and entries whose first token is not an integer are skipped. When a
        name occurs more than once, the last occurrence wins.
    """
    result = {}
    if not isinstance(row, list):
        return result
    absent_prefix = 'No '
    for item in row:
        if not isinstance(item, str):
            continue
        # Surrounding whitespace would otherwise break the count parse or
        # create near-duplicate keys such as 'Fan ' next to 'Fan'.
        item = item.strip()
        if item.startswith(absent_prefix):
            # Remove only the leading marker; str.replace would also delete
            # any later 'No ' inside the item name.
            result[item[len(absent_prefix):].strip()] = 0
            continue
        # split(None, 1) collapses runs of whitespace between count and name.
        parts = item.split(None, 1)
        if len(parts) != 2:
            continue
        count, key = parts
        try:
            result[key] = int(count)
        except ValueError:
            # First token is not a count (e.g. 'Fully Furnished'): skip the
            # item rather than abort the whole column.
            continue
    return result

# Parse every 'furnishDetails' cell into a list, convert each list into an
# {item: count} dict, and spread the dicts into one integer column per item
# (items absent from a row count as 0). The raw column is then replaced by
# those count columns.
furnish_lists = df['furnishDetails'].apply(safe_parse_to_list)
parsed_data = furnish_lists.apply(parse_items)
df_counts = pd.DataFrame(parsed_data.tolist()).fillna(0).astype(int)
df = pd.concat([df.drop(columns='furnishDetails'), df_counts], axis=1)

#====== Drop Irrelevant Features ======
# Columns that are not carried into the encoded dataset.
irrelevant_columns = [
    'property_name',
    'price_per_sqft',
    'property_type',
    'areaWithType',
    'address',
    'nearbyLocations',
    'description',
    'features',
    'society',
]
df = df.drop(columns=irrelevant_columns)

#====== Encoding ======
# One-hot encode the two categorical columns, then remove the indicator columns
# that stand for "no information": the 'undefined' possession age and the
# 'not available' column.
categorical_columns = ['facing', 'agePossession']
df = pd.get_dummies(df, columns=categorical_columns)
df = df.drop(columns=['agePossession_undefined', 'not available'])

#====== Fill NaN values with the column mean ======
# Passing a Series of means to DataFrame.fillna fills each listed column with
# its own mean and leaves every other column untouched.
# NOTE(review): 'price' and 'log_price' are imputed independently, so an imputed
# row need not satisfy log_price == log1p(price) — confirm this is intended.
mean_filled_columns = ['area', 'floorNum', 'log_price', 'price']
df = df.fillna(df[mean_filled_columns].mean())


In [9]:
# Write the encoded frame to disk; the row index is not written.
encoded_csv_path = 'Encoded_House.csv'
df.to_csv(encoded_csv_path, index=False)