In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV

df = pd.read_csv('/content/land_value_data.csv')

df.drop(columns=['DOC_NUM', 'Village', 'Unnamed: 8', 'old sub_division_number',
                 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14'], inplace=True)

df.drop(columns=['document_amount', 'extent'], inplace=True)

df.info()

df.loc[df['fair_value'] == 'Null', 'fair_value'] = np.nan

df.loc[df['land_type'] == 'Null', 'land_type'] = np.nan

df.isnull().sum()

df.shape

df['land_type'].value_counts()

df.loc[df['land_type'].isin(['\tResidential Plot with Corp./Mun./Panch. road access',
                             'Residential Plot with Corp./Mun./Panch. road access']), 'land_type'] = 'Residential Plot with Corp./Mun./Panch. road access'

df.loc[df['land_type'].isin(['Residential Plot without vehicular access\t',
                             '\tResidential Plot without vehicular access']), 'land_type'] = 'Residential Plot without vehicular access'

df.loc[df['land_type'].isin(['\tWet land', 'Wet land\t', 'Wet Land']), 'land_type'] = 'Wet land'

df.loc[df['land_type'].isin(['Garden Land without road access\t']), 'land_type'] = 'Garden Land without road access'

df.loc[df['land_type'] == '\tResidential Plot with NH/PWD road access', 'land_type'] = 'Residential Plot with NH/PWD road access'

df.loc[df['land_type'] == '\tCommercially important Plot', 'land_type'] = 'Commercially important Plot'

df.loc[df['land_type'] == '\tWater logged land', 'land_type'] = 'Water logged land'

df['land_type'].value_counts()

# Data type conversion

df['fair_value'] = df['fair_value'].astype('float64')

# Missing value imputation

df['fair_value'].fillna(df['fair_value'].mean(), inplace=True)

df['land_type'].fillna('Residential Plot without vehicular access', inplace=True)

df.describe()

# Encoding categorical variable

df = pd.get_dummies(df, drop_first=True)

x = df.drop(columns=['market_value'])
y = df['market_value']

# Outlier detection and removal

q1 = np.percentile(df['fair_value'], 25)
q3 = np.percentile(df['fair_value'], 75)
iqr = q3 - q1

df['fair_value'] = np.where(df['fair_value'] > q3 + 1.5 * iqr,
                            q3 + 1.5 * iqr,
                            np.where(df['fair_value'] < q1 - 1.5 * iqr, q1 - 1.5 * iqr, df['fair_value']))


# Train test split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model2 = RandomForestRegressor()

# Decision Tree fit

model2.fit(x_train, y_train)

pred_test = model2.predict(x_test)
pred_train = model2.predict(x_train)

print(f'{r2_score(y_test, pred_test)} , {r2_score(y_train, pred_train)}')

params = {
    'n_estimators': [20, 30, 40, 50, 60, 100, 120, 200, 500],
    'criterion': ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    'max_depth': [val for val in range(0, 10)],
    'max_samples': [0.25, 0.5, 0.75],
    'max_features': ["sqrt", "log2", None]
}

grid = RandomizedSearchCV(model2, params, n_jobs=-1)

grid.fit(x_train, y_train)

grid.best_score_

grid.best_params_


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   survey_number  471 non-null    int64  
 1   fair_value     471 non-null    object 
 2   land_type      471 non-null    object 
 3   market_value   471 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 14.8+ KB
-0.03587228095340711 , 0.7393872445388711


5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

{'n_estimators': 200,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': 1,
 'criterion': 'friedman_mse'}

In [2]:
import pickle

# Save the model
with open('random_forest_model.pkl', 'wb') as file:
    pickle.dump(model2, file)

In [3]:
# Load the model
with open('random_forest_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [None]:
survey_number = int(input("Enter survey number: "))
land_type = input("Enter land type: ")
fair_value = float(input("Enter fair value: "))

# Define the decoding method
def decode_land_type(land_type):
    if land_type in ['\tResidential Plot with Corp./Mun./Panch. road access',
                     'Residential Plot with Corp./Mun./Panch. road access']:
        return 'Residential Plot with Corp./Mun./Panch. road access'
    elif land_type in ['Residential Plot without vehicular access\t',
                       '\tResidential Plot without vehicular access']:
        return 'Residential Plot without vehicular access'
    elif land_type in ['\tWet land', 'Wet land\t', 'Wet Land']:
        return 'Wet land'
    elif land_type == 'Garden Land without road access\t':
        return 'Garden Land without road access'
    elif land_type == '\tResidential Plot with NH/PWD road access':
        return 'Residential Plot with NH/PWD road access'
    elif land_type == '\tCommercially important Plot':
        return 'Commercially important Plot'
    elif land_type == '\tWater logged land':
        return 'Water logged land'
    else:
        return land_type

# Decode land type
land_type = decode_land_type(land_type)

# Create DataFrame for user input
user_input = pd.DataFrame({
    'survey_number': [survey_number],
    'land_type': [land_type],
    'fair_value': [fair_value],
})

# Perform the conversion to encode categorical data
user_input = pd.get_dummies(user_input, columns=["land_type"])

# Ensure that user input DataFrame has the same columns as the training data
missing_cols = set(x.columns) - set(user_input.columns)
for col in missing_cols:
    user_input[col] = 0

user_input = user_input[x.columns]

# Make prediction using the model
predicted_market_value = loaded_model.predict(user_input)
print("Predicted Market Value:", predicted_market_value[0])



Enter survey number: 206
Enter land type: wet land
Enter fair value: 902942
Predicted Market Value: 596307.626167081
