In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA

In [98]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [99]:
# Load dataset
df = pd.read_csv('C:/Users/SONY/Machine_Learning_Project/Processed_Data/gurgaon_properties_post_feature_selection_v2.csv').drop(columns=['store room','floor_category','balcony'])

In [100]:
df.head()

Unnamed: 0,property_type,price,bedRoom,bathroom,agePossession,sector,built_up_area,servant room,furnishing_type,luxury_category
0,flat,1.75,3.0,3,Relatively New,sector 83,1600.0,1,2,High
1,flat,1.4,3.0,3,Relatively New,sector 50,1239.0,0,2,Medium
2,flat,2.13,4.0,4,Relatively New,sector 85,2600.0,0,0,Low
3,house,6.25,5.0,7,Relatively New,sector 109,6228.0,1,0,Medium
4,flat,1.1,3.0,3,Relatively New,sector 84,1575.0,0,0,Medium


In [101]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished

In [102]:
# Numerical = bedRoom, bathroom, built_up_area, servant room
# Ordinal = property_type, furnishing_type, luxury_category 
# OHE = sector, agePossession

In [103]:
df['agePossession'] = df['agePossession'].replace(
    {
        'Relatively New':'new',
        'Moderately Old':'old',
        'New Property' : 'new',
        'Old Property' : 'old',
        'Under Construction' : 'under construction'
    }
)

In [104]:
df.head()

Unnamed: 0,property_type,price,bedRoom,bathroom,agePossession,sector,built_up_area,servant room,furnishing_type,luxury_category
0,flat,1.75,3.0,3,new,sector 83,1600.0,1,2,High
1,flat,1.4,3.0,3,new,sector 50,1239.0,0,2,Medium
2,flat,2.13,4.0,4,new,sector 85,2600.0,0,0,Low
3,house,6.25,5.0,7,new,sector 109,6228.0,1,0,Medium
4,flat,1.1,3.0,3,new,sector 84,1575.0,0,0,Medium


In [105]:
# Preprocessing
numerical_features = ['bedRoom', 'bathroom', 'built_up_area', 'servant room']
ordinal_features = ['property_type', 'furnishing_type', 'luxury_category']
ohe_features = ['sector', 'agePossession']

In [106]:
# Define transformers
numerical_transformer = StandardScaler()
ordinal_transformer = OrdinalEncoder()
ohe_transformer = OneHotEncoder(handle_unknown='ignore')

In [107]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('ohe', ohe_transformer, ohe_features)
    ])

In [108]:
# Prepare your data
X = df.drop(columns=['price'])

In [109]:
y = df['price']

In [110]:
y_log = np.log1p(y)  # Apply log transformation to the target variable

In [111]:
y_log

0       1.011601
1       0.875469
2       1.141033
3       1.981001
4       0.741937
5       1.098612
6       1.095273
7       0.405465
8       0.636577
9       0.625938
10      0.788457
11      1.178655
12      0.854415
13      1.504077
14      0.683097
15      1.686399
16      0.741937
17      1.589235
18      0.641854
19      1.658228
20      0.598837
21      0.875469
22      0.693147
23      0.300105
24      0.667829
25      0.770108
26      0.920283
27      0.810930
28      0.615186
29      1.098612
30      0.883768
31      2.397895
32      1.064711
33      0.974560
34      0.841567
35      0.657520
36      0.392042
37      0.741937
38      0.239017
39      0.067659
40      0.936093
41      1.040277
42      2.944439
43      0.936093
44      2.393339
45      0.732368
46      0.300105
47      0.765468
48      0.482426
49      0.832909
50      1.791759
51      1.178655
52      0.615186
53      0.500775
54      0.810930
55      0.615186
56      0.760806
57      0.693147
58      0.7419

In [112]:
# Create the model pipeline with LinearRegression
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [113]:
# Create a new pipeline with Ridge regression
ridge_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge(alpha=100))
])

In [114]:
# Train-test split
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [115]:
# Fit and evaluate the model pipeline using cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [116]:
scores = cross_val_score(model_pipeline, X, y_log, cv=kfold, scoring='r2')

In [117]:
print(f'Linear Regression - Mean R2 Score: {scores.mean():.3f}, Std Dev: {scores.std():.3f}')

Linear Regression - Mean R2 Score: 0.837, Std Dev: 0.027


In [118]:
# Fit and evaluate the Ridge regression pipeline using cross-validation
ridge_scores = cross_val_score(ridge_pipeline, X, y_log, cv=kfold, scoring='r2')
print(f'Ridge Regression - Mean R2 Score: {ridge_scores.mean():.3f}, Std Dev: {ridge_scores.std():.3f}')

Ridge Regression - Mean R2 Score: 0.764, Std Dev: 0.032


In [119]:
# Fit the final model
model_pipeline.fit(X_train, y_train_log)

In [120]:
import pickle
# Save the model pipeline to a file
with open('C:/Users/SONY/Machine_Learning_Project/Processed_Data/model_pipeline.pkl', 'wb') as file:
    pickle.dump(model_pipeline, file)

In [121]:
# Make predictions
y_pred_log = model_pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)  # Convert back from log scale

In [122]:
# Calculate Mean Absolute Error
mae = mean_absolute_error(np.expm1(y_test_log), y_pred)

In [123]:
print(f'Mean Absolute Error: {mae:.2f}')

Mean Absolute Error: 0.66


In [124]:
# Extract coefficients
model = model_pipeline.named_steps['regressor']
preprocessor = model_pipeline.named_steps['preprocessor']

In [125]:
numerical_feature_names = preprocessor.transformers_[0][1].get_feature_names_out()
categorical_feature_names = preprocessor.transformers_[1][1].categories_[0].tolist()
ohe_feature_names = preprocessor.transformers_[2][1].get_feature_names_out()

print("Numerical features:", numerical_feature_names)
print("Categorical features:", categorical_feature_names)
print("OHE features:", ohe_feature_names)

Numerical features: ['bedRoom' 'bathroom' 'built_up_area' 'servant room']
Categorical features: ['flat', 'house']
OHE features: ['sector_a block sushant lok phase 1' 'sector_b block sushant lok phase 1'
 'sector_bhondsi' 'sector_c block sushant lok phase 1'
 'sector_garhi harsaru' 'sector_laxmi garden' 'sector_sector 1'
 'sector_sector 10' 'sector_sector 102' 'sector_sector 103'
 'sector_sector 104' 'sector_sector 105' 'sector_sector 106'
 'sector_sector 107' 'sector_sector 108' 'sector_sector 109'
 'sector_sector 11' 'sector_sector 110' 'sector_sector 111'
 'sector_sector 112' 'sector_sector 113' 'sector_sector 12'
 'sector_sector 12a' 'sector_sector 13' 'sector_sector 14'
 'sector_sector 15' 'sector_sector 17' 'sector_sector 1a'
 'sector_sector 2' 'sector_sector 21' 'sector_sector 22'
 'sector_sector 23' 'sector_sector 24' 'sector_sector 25'
 'sector_sector 26' 'sector_sector 27' 'sector_sector 28'
 'sector_sector 3' 'sector_sector 30' 'sector_sector 31'
 'sector_sector 33' 'sector_s

In [126]:
# Extract feature names after transformation
numerical_feature_names = preprocessor.transformers_[0][1].get_feature_names_out(numerical_features).tolist()
ordinal_feature_names = preprocessor.transformers_[1][1].categories_[0].tolist()
ohe_feature_names = preprocessor.transformers_[2][1].get_feature_names_out(ohe_features)

In [127]:
print(numerical_feature_names)
print(ordinal_feature_names)
print(ohe_feature_names)

['bedRoom', 'bathroom', 'built_up_area', 'servant room']
['flat', 'house']
['sector_a block sushant lok phase 1' 'sector_b block sushant lok phase 1'
 'sector_bhondsi' 'sector_c block sushant lok phase 1'
 'sector_garhi harsaru' 'sector_laxmi garden' 'sector_sector 1'
 'sector_sector 10' 'sector_sector 102' 'sector_sector 103'
 'sector_sector 104' 'sector_sector 105' 'sector_sector 106'
 'sector_sector 107' 'sector_sector 108' 'sector_sector 109'
 'sector_sector 11' 'sector_sector 110' 'sector_sector 111'
 'sector_sector 112' 'sector_sector 113' 'sector_sector 12'
 'sector_sector 12a' 'sector_sector 13' 'sector_sector 14'
 'sector_sector 15' 'sector_sector 17' 'sector_sector 1a'
 'sector_sector 2' 'sector_sector 21' 'sector_sector 22'
 'sector_sector 23' 'sector_sector 24' 'sector_sector 25'
 'sector_sector 26' 'sector_sector 27' 'sector_sector 28'
 'sector_sector 3' 'sector_sector 30' 'sector_sector 31'
 'sector_sector 33' 'sector_sector 36' 'sector_sector 36a'
 'sector_sector 37' 'se

In [128]:
# Check the lengths of the feature name lists
print("Numerical feature names length:", len(numerical_feature_names))
print("Ordinal feature names length:", len(ordinal_feature_names))
print("One-Hot Encoded feature names length:", len(ohe_feature_names))

Numerical feature names length: 4
Ordinal feature names length: 2
One-Hot Encoded feature names length: 111


In [129]:
# Extract feature names from the ColumnTransformer
feature_names = preprocessor.get_feature_names_out()

# Extract coefficients
coefficients = model.coef_

# Print lengths and first few feature names
print("Total feature names length:", len(feature_names))
print("Number of coefficients:", len(coefficients))

# Verify if lengths match
if len(feature_names) != len(coefficients):
    print("Mismatch between number of features and number of coefficients")
else:
    print("Feature names and coefficients are aligned.")

# Create a DataFrame for coefficients
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coef_df

Total feature names length: 118
Number of coefficients: 118
Feature names and coefficients are aligned.


Unnamed: 0,Feature,Coefficient
0,num__bedRoom,0.069543
1,num__bathroom,0.05699
2,num__built_up_area,0.209018
3,num__servant room,0.054329
4,ord__property_type,0.257308
5,ord__furnishing_type,0.015792
6,ord__luxury_category,-0.00501
7,ohe__sector_a block sushant lok phase 1,0.303233
8,ohe__sector_b block sushant lok phase 1,0.678238
9,ohe__sector_bhondsi,-0.73121


In [130]:
# Save the DataFrame as a pickle file
coef_df.to_pickle('coef_df1.pkl')

In [95]:
coef_df['Feature'].values

array(['num__bedRoom', 'num__bathroom', 'num__built_up_area',
       'num__servant room', 'ord__property_type', 'ord__furnishing_type',
       'ord__luxury_category', 'ohe__sector_a block sushant lok phase 1',
       'ohe__sector_b block sushant lok phase 1', 'ohe__sector_bhondsi',
       'ohe__sector_c block sushant lok phase 1',
       'ohe__sector_garhi harsaru', 'ohe__sector_laxmi garden',
       'ohe__sector_sector 1', 'ohe__sector_sector 10',
       'ohe__sector_sector 102', 'ohe__sector_sector 103',
       'ohe__sector_sector 104', 'ohe__sector_sector 105',
       'ohe__sector_sector 106', 'ohe__sector_sector 107',
       'ohe__sector_sector 108', 'ohe__sector_sector 109',
       'ohe__sector_sector 11', 'ohe__sector_sector 110',
       'ohe__sector_sector 111', 'ohe__sector_sector 112',
       'ohe__sector_sector 113', 'ohe__sector_sector 12',
       'ohe__sector_sector 12a', 'ohe__sector_sector 13',
       'ohe__sector_sector 14', 'ohe__sector_sector 15',
       'ohe__sector_s

In [93]:
import pickle
# Save the model pipeline to a file
with open('C:/Users/SONY/Machine_Learning_Project/Processed_Data/coef_df.pkl', 'wb') as file:
    pickle.dump(model_pipeline, file)

In [None]:
df.head()

In [None]:
# Verify lengths
print(f"Number of features: {len(feature_names)}")
print(f"Number of coefficients: {len(coefficients)}")

In [None]:
# Function to calculate mean price for selected feature values
def calculate_mean_price(df, feature, values):
    mean_prices = {}
    for value in values:
        mean_prices[value] = df[df[feature] == value]['price'].mean()
    return mean_prices

In [None]:
# Function to calculate price difference and percentage change
def calculate_price_difference_and_change(mean_prices):
    values = list(mean_prices.keys())
    price_diff = mean_prices[values[1]] - mean_prices[values[0]]
    percentage_change = (price_diff / mean_prices[values[0]]) * 100
    return price_diff, percentage_change

In [293]:
# Function to calculate expected price change based on coefficients
def calculate_price_change(coefficients, feature_name_from, feature_name_to, feature_names):
    feature_index_from = feature_names.index(feature_name_from)
    feature_index_to = feature_names.index(feature_name_to)
    change_in_price = coefficients[feature_index_to] - coefficients[feature_index_from]
    return change_in_price

In [294]:
# Example usage for sector change
mean_prices_sectors = calculate_mean_price(df, 'sector', ['sector 107', 'sector 109'])
price_diff, percentage_change = calculate_price_difference_and_change(mean_prices_sectors)

In [297]:
# Convert feature_names to a list
feature_names_list = feature_names.tolist()

# Example usage for sector change
sector_feature_name_from = 'ohe__sector_sector 107'  # Adjust as per the actual feature name in coefficients
sector_feature_name_to = 'ohe__sector_sector 109'   # Adjust as per the actual feature name in coefficients

# Calculate expected price change due to sector change
change_in_price = calculate_price_change(coefficients, sector_feature_name_from, sector_feature_name_to, feature_names_list)

print(f"Expected Price Change due to Sector Change: {change_in_price:.2f}")
print(f"Mean Price Difference: {price_diff:.2f}")
print(f"Percentage Change: {percentage_change:.2f}%")

Expected Price Change due to Sector Change: 0.31
Mean Price Difference: 3.21
Percentage Change: 413.70%


In [298]:
# Example usage for bedroom change
mean_prices_bedrooms = calculate_mean_price(df, 'bedRoom', [2, 4])
price_diff_bedroom, percentage_change_bedroom = calculate_price_difference_and_change(mean_prices_bedrooms)

bedroom_feature_name_from = 'num_bedRoom'  # Adjust as per the actual feature name in coefficients
bedroom_feature_name_to = 'num_bedRoom'    # Same feature, but different values

print(f"Expected Price Change due to Bedroom Change: {change_in_price:.2f}")
print(f"Mean Price Difference for Bedrooms: {price_diff_bedroom:.2f}")
print(f"Percentage Change in Bedrooms: {percentage_change_bedroom:.2f}%")

Expected Price Change due to Bedroom Change: 0.31
Mean Price Difference for Bedrooms: 2.69
Percentage Change in Bedrooms: 266.38%


In [29]:
df.head()

Unnamed: 0,property_type,price,bedRoom,bathroom,agePossession,sector,built_up_area,servant room,furnishing_type,luxury_category
0,flat,1.75,3.0,3,new,sector 83,1600.0,1,2,High
1,flat,1.4,3.0,3,new,sector 50,1239.0,0,2,Medium
2,flat,2.13,4.0,4,new,sector 85,2600.0,0,0,Low
3,house,6.25,5.0,7,new,sector 109,6228.0,1,0,Medium
4,flat,1.1,3.0,3,new,sector 84,1575.0,0,0,Medium


In [300]:
bedroom_coefficient=0.069543

In [303]:
# Calculate expected price change when increasing bedroom from 3 to 4
price_change_bedroom = bedroom_coefficient * (4 - 3)

mean_prices_bedrooms = calculate_mean_price(df, 'bedRoom', [2, 4])
price_diff_bedroom, percentage_change_bedroom = calculate_price_difference_and_change(mean_prices_bedrooms)

formatted_price_diff_bedroom = price_diff_bedroom
formatted_percentage_change_bedroom = percentage_change_bedroom

print(f"Mean Price Difference for Bedrooms: {formatted_price_diff_bedroom:.2f}")
print(f"Percentage Change in Bedrooms: {formatted_percentage_change_bedroom:.2f}%")

Mean Price Difference for Bedrooms: 2.69
Percentage Change in Bedrooms: 266.38%


In [None]:
num__bedRoom)


In [305]:
def get_coefficient(feature_name, coefficient_dict):
    return coefficient_dict.get(feature_name, None)

# Example usage
bedroom_coefficient_name = 'num__bedRoom'
bedroom_coefficient = get_coefficient(bedroom_coefficient_name, coef_df)

In [315]:
import pandas as pd
def get_coefficient(feature_name, coef_df):
    """Retrieve the coefficient for a given feature from the DataFrame."""
    row = coef_df[coef_df['Feature'] == feature_name]
    if not row.empty:
        return row['Coefficient'].values[0]
    else:
        print(f"Coefficient for '{feature_name}' not found.")
        return None

def calculate_price_change(coefficient, from_value, to_value):
    if coefficient is None:
        raise ValueError("Coefficient is None. Ensure the feature name is correct.")
    change_in_feature = to_value - from_value
    return coefficient * change_in_feature
    
# Example usage for bedroom change
bedroom_feature_name = 'num__bedRoom'  # Ensure this matches the feature in coef_df
bedroom_coefficient = get_coefficient(bedroom_feature_name, coef_df)

from_bedrooms = 2
to_bedrooms = 4

price_change_bedroom = calculate_price_change(bedroom_coefficient, from_bedrooms, to_bedrooms)
formatted_price_change_bedroom = price_change_bedroom
print(f"Expected Price Change due to Bedroom Change from {from_bedrooms} to {to_bedrooms}: {formatted_price_change_bedroom:.2f}")

Expected Price Change due to Bedroom Change from 2 to 4: 0.14
