In [3]:
import pandas as pd

# Load dataset
file_path = "india_housing_prices.csv"
df = pd.read_csv(file_path)

# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

# Show first 5 rows (last expression in the cell, so it is rendered as a table)
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1499 entries, 0 to 1498
Data columns (total 23 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ID                              1499 non-null   int64  
 1   State                           1499 non-null   object 
 2   City                            1499 non-null   object 
 3   Locality                        1499 non-null   object 
 4   Property_Type                   1499 non-null   object 
 5   BHK                             1499 non-null   int64  
 6   Size_in_SqFt                    1499 non-null   int64  
 7   Price_in_Lakhs                  1499 non-null   float64
 8   Price_per_SqFt                  1499 non-null   float64
 9   Year_Built                      1499 non-null   int64  
 10  Furnished_Status                1499 non-null   object 
 11  Floor_No                        1499 non-null   int64  
 12  Total_Floors                    14

Unnamed: 0,ID,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,...,Age_of_Property,Nearby_Schools,Nearby_Hospitals,Public_Transport_Accessibility,Parking_Space,Security,Amenities,Facing,Owner_Type,Availability_Status
0,1,Tamil Nadu,Chennai,Locality_84,Apartment,1,4740,489.76,0.1,1990,...,35,10,3,High,No,No,"Playground, Gym, Garden, Pool, Clubhouse",West,Owner,Ready_to_Move
1,2,Maharashtra,Pune,Locality_490,Independent House,3,2364,195.52,0.08,2008,...,17,8,1,Low,No,Yes,"Playground, Clubhouse, Pool, Gym, Garden",North,Builder,Under_Construction
2,3,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,0.05,1997,...,28,9,8,Low,Yes,No,"Clubhouse, Pool, Playground, Gym",South,Broker,Ready_to_Move
3,4,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,0.11,1991,...,34,5,7,High,Yes,Yes,"Playground, Clubhouse, Gym, Pool, Garden",North,Builder,Ready_to_Move
4,5,Rajasthan,Jaipur,Locality_466,Villa,4,4823,182.9,0.04,2002,...,23,4,9,Low,No,Yes,"Playground, Garden, Gym, Pool, Clubhouse",East,Builder,Ready_to_Move


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Handling missing values: drop every row that contains at least one NaN.
df.dropna(inplace=True)

# Encoding categorical variables: every object-dtype column is replaced, in
# `df` itself, by integer codes. The fitted encoder of each column is kept in
# `label_encoders` (column name -> LabelEncoder).
# NOTE: because `df` is overwritten here, re-running this cell without first
# reloading the data finds no object columns and leaves `label_encoders` empty.
categorical_columns = df.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Splitting dataset into features (X) and target variable (y)
# NOTE(review): the `ID` column stays in X as a feature; it looks like a plain
# row identifier -- confirm whether the model should really see it.
X = df.drop(columns=['Price_in_Lakhs'])  # "Price_in_Lakhs" is the target variable
y = df['Price_in_Lakhs']

# Splitting dataset into training and test sets BEFORE scaling, so that the
# scaler's mean/variance are estimated from training rows only. Fitting the
# scaler on the full matrix would leak test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalizing features: fit on the training split, then apply the same
# transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the training-fitted scaling, kept so that any code
# still referring to `X_scaled` keeps working.
X_scaled = scaler.transform(X)

print("Data Preprocessing Completed!")


Data Preprocessing Completed!


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fit a linear regression on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test split
y_pred = model.predict(X_test)

# Mean absolute error and mean squared error of those predictions
mae, mse = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error)
)

print(f"Model Performance:\nMAE: {mae}\nMSE: {mse}")


Model Performance:
MAE: 86.83774154652068
MSE: 11990.617670639025


In [8]:
# Save the model for Flask
import joblib

# Write the trained model, the fitted scaler and the dict of per-column label
# encoders to disk, one joblib file each, in this order.
for artifact, filename in (
    (model, "house_price_model.pkl"),
    (scaler, "scaler.pkl"),
    (label_encoders, "label_encoders.pkl"),
):
    joblib.dump(artifact, filename)

print("Model saved successfully!")


Model saved successfully!
