# Imports

In [226]:
import pandas as pd
import re
import joblib

In [2]:
data = pd.read_csv('../../Data/magicbricks_lr.csv')

# Preprocessing

In [12]:
# Columns to drop

def get_column_status():
    """Make a df of all columns and their num nan values, dtype, num unique values"""
    cols = data.columns.tolist()
    col_status = pd.DataFrame(columns=['column', 'num_nan', 'dtype', 'num_unique'])
    for col in cols:
        col_status.loc[len(col_status)] = [col, data[col].isna().sum(), data[col].dtype, data[col].nunique()]
    return col_status

In [18]:
status = get_column_status()
status.sort_values(by='num_nan', ascending=False)

Unnamed: 0,column,num_nan,dtype,num_unique
13,Unnamed: 13,319,float64,0
35,Car Parking,318,object,1
34,Possession by,317,object,2
26,Floors allowed for construction,305,float64,5
33,RERA ID,305,object,12
42,Bathroom,304,float64,1
29,Loan Offered by,299,object,11
44,Lifts,292,object,4
43,Study Room,280,float64,1
40,Servant Room,266,float64,1


In [70]:
# For rows where Car parking has missing values check if Car Parking has a value and if so, replace the missing value with the value
missing_indices = data[data['Car parking'].isna()].index
for index in missing_indices:
    if not pd.isna(data.loc[index, 'Car Parking']):
        data.loc[index, 'Car parking'] = data.loc[index, 'Car Parking']

In [176]:
cols_to_keep = ['Bedrooms', 'Car parking', 'Furnishing', 'Bathrooms', 'Location_Score', 'Floor', 'Status', 'Price Breakup']
filtered_data = data[cols_to_keep]

In [213]:
def feature_engineering(data: pd.DataFrame) -> pd.DataFrame:

    def get_price(price: str):
        """
        '₹ 1.72 Cr ₹ 12,03,923 Stamp Duty, Registration Charges ₹ 2,800 Monthly\nSee Other Charges' -> ₹ 1.72 Cr
        '₹ 1.58 Cr ₹ 11,05,986 Stamp Duty, Registration Charges ₹ 2,500 Monthly\nSee Other Charges' -> ₹ 1.58 Cr
        '₹ 1.27 Cr ₹ 8,88,930 Stamp Duty, Registration Charges ₹ 2,850 Monthly\nSee Other Charges' -> ₹ 1.27 Cr
        """
        num = re.search(r'[.\d]+', price).group()
        suffix = re.search(r'Lac|Cr', price).group()
        if suffix == 'Lac':
            num = float(num)
        elif suffix == 'Cr':
            num = float(num) * 100
        return int(num)


    data = data.copy()
    # Bedrooms
    digits_regex = re.compile(r'[\-\d]+')
    data.loc[:, 'Bedrooms'] = data['Bedrooms'].apply(lambda x: int(digits_regex.search(x).group()) if type(x) == str else x)
    data.loc[:, 'Bedrooms'] = data['Bedrooms'].fillna(data['Bedrooms'].mode()[0])
    data.loc[:, 'Bedrooms'] = data['Bedrooms'].astype(int)

    # Car parking
    # Find all numbers and add
    data.loc[:, 'Car parking'] = data['Car parking'].apply(lambda x: int(sum(map(int, digits_regex.findall(x)))) if type(x) == str and digits_regex.search(x) else x)
    data.loc[:, 'Car parking'] = data['Car parking'].apply(lambda x: x.replace('None', '0') if 'None' in str(x) else x)
    data.loc[:, 'Car parking'] = data['Car parking'].fillna(data['Car parking'].mode()[0])
    data.loc[:, 'Car parking'] = data['Car parking'].astype(int)

    # Furnishing
    data.loc[:, 'Furnishing'] = data['Furnishing'].fillna(data['Furnishing'].mode()[0])
    data.loc[:, 'Furnishing'] = data['Furnishing'].astype('category')

    # Bathrooms
    data.loc[:, 'Bathrooms'] = data['Bathrooms'].fillna(data['Bathrooms'].mode()[0])
    data.loc[:, 'Bathrooms'] = data['Bathrooms'].astype(int)
    
    # Floor
    data.loc[:, 'Floor'] = data['Floor'].apply(lambda x: x.replace('Ground', '0') if type(x) == str else x)
    data.loc[:, 'Floor'] = data['Floor'].apply(lambda x: x.replace('Upper Basement', '-1') if type(x) == str else x)
    data.loc[:, 'Total Floors'] = data['Floor'].apply(lambda x: int(digits_regex.findall(x)[1]) if type(x) == str else x)
    data.loc[:, 'Floor'] = data['Floor'].apply(lambda x: int(digits_regex.findall(x)[0]) if type(x) == str else x)
    data.loc[: , 'Total Floors'] = data['Floor'].fillna(data['Total Floors'].mode()[0])
    data.loc[:, 'Floor'] = data['Floor'].fillna(data['Floor'].mode()[0])
    data.loc[:, 'Total Floors'] = data['Total Floors'].astype(int)
    data.loc[:, 'Floor'] = data['Floor'].astype(int)

    # Location_Score
    data.dropna(subset=['Location_Score'], inplace=True)

    # Status
    data.loc[:, 'Status'] = data['Status'].fillna(data['Status'].mode()[0])
    data.loc[:, 'Status'] = data['Status'].astype('category')

    # Price Breakup
    data.loc[:, 'Price Breakup'] = data['Price Breakup'].apply(get_price)


    return data

In [214]:
modified_data = feature_engineering(filtered_data)

In [216]:
def encode(data: pd.DataFrame) -> pd.DataFrame:
    data = data.copy()
    cat_cols = data.select_dtypes(include=['category']).columns.tolist()
    for col in cat_cols:
        data.loc[:, col] = data[col].cat.codes

    return data

In [217]:
encoded_data = encode(modified_data)

# Training

In [219]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [220]:
X, y = encoded_data.drop(['Price Breakup'], axis=1), encoded_data['Price Breakup']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [221]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True)

In [222]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [223]:
# Score
print(f'MSE: {mean_squared_error(y_valid, lr.predict(X_valid))}')
print(f'Score: {lr.score(X_valid, y_valid)}')

MSE: 10580.328629161027
Score: 0.27932065181693977


In [224]:
for y_true, y_predict in zip(y_valid, lr.predict(X_valid)):
    print(f'{y_true} -> {y_predict}')

158 -> 253.36612202099434
610 -> 338.92573531105006
23 -> 57.87307621525014
146 -> 308.3506275819898
58 -> 101.48784815428576
60 -> 275.5179758032997
26 -> 57.87307621525014
47 -> 71.36899066044612
159 -> 128.05346787188984
150 -> 74.61366549119066
195 -> 245.01890978010954
121 -> 104.04955902739601
179 -> 288.32511877365414
110 -> 52.97777958664
27 -> 106.27590881404751
41 -> 105.89570028469689
375 -> 308.50271099373003
150 -> 108.94485565600615
350 -> 312.9417573871194
265 -> 308.19854417024965
100 -> 208.44153044134282
95 -> 308.3506275819898
125 -> 60.05457855900977
29 -> 119.46765643576305
82 -> 90.47760287632985
107 -> 130.48837579135056
46 -> 90.17343605284941
78 -> 77.45865885170122
90 -> 93.11535545531024
225 -> 306.7760474899943
70 -> 126.0240489398704
36 -> 61.62915865100524
125 -> 101.7920149777662
167 -> 144.9461405595706
390 -> 308.2745858761198
40 -> 108.64068883252565
75 -> 275.4419340974297
210 -> 545.5603896211437
80 -> 57.56890939176964
350 -> 301.9478361724997
180 -

In [228]:
# Save Model
joblib.dump(lr, '../../Code/Modelling/Models/lr.model')

['../../Code/Modelling/Models/lr.model']