In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import joblib

# Read the raw housing CSV and immediately discard any row that is missing a
# value in one of the columns used further down.
data_path = '/content/Bengaluru_House_Data.csv'
df = pd.read_csv(data_path).dropna(
    subset=['location', 'size', 'total_sqft', 'bath', 'balcony', 'price']
)

# The text before the first space in 'size' is parsed as an integer and stored
# in a new 'bedrooms' column.
df['bedrooms'] = df['size'].apply(
    lambda size_text: int(size_text.partition(' ')[0])
)

# 'size' has been replaced by 'bedrooms'; the other three columns are dropped
# because they are not used as model features.
df = df.drop(['area_type', 'availability', 'society', 'size'], axis=1)

# Convert a 'total_sqft' entry to a number. Entries are either a plain number
# ("1200") or a range written as "low - high" ("2100 - 2850").
def convert_sqft_to_num(x):
    """Parse one 'total_sqft' value into a float.

    Args:
        x: The raw cell value. Usually a string holding either a single
            number or a range of the form "low - high"; plain numeric
            values are accepted as well.

    Returns:
        The numeric value as a float, the midpoint of the range when a
        two-part range is given, or None when the value cannot be parsed
        (for example when it carries a unit suffix).
    """
    # Try the plain-number case first so that values which contain a '-' but
    # are still valid numbers (e.g. "-5" or "1e-3") are not mistaken for ranges.
    try:
        return float(x)
    except (TypeError, ValueError):
        pass

    if not isinstance(x, str):
        return None

    parts = x.split('-')
    if len(parts) != 2:
        return None

    # Both ends of the range must be numeric; otherwise report the value as
    # unparseable instead of letting the exception abort the whole column.
    try:
        low, high = float(parts[0]), float(parts[1])
    except ValueError:
        return None
    return (low + high) / 2

# Run every 'total_sqft' entry through the converter defined above.
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

# Rows whose 'total_sqft' ended up missing after conversion are discarded.
df = df[df['total_sqft'].notna()]

# 'price' is the regression target; every other remaining column is a feature.
y = df['price']
X = df.drop('price', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column groups: the numeric columns are standardised and the categorical
# column is one-hot encoded.
categorical_features = ['location']
numeric_features = ['total_sqft', 'bath', 'balcony', 'bedrooms']

numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# handle_unknown='ignore' lets the encoder accept categories at transform time
# that were not present when it was fitted.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a support vector regressor. The step
# names are what the 'regressor__*' hyperparameter keys refer to.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR()),
])

# Tune the hyperparameters of the SVR step with an exhaustive grid search.
# Keys use the '<step name>__<parameter>' form so they reach the 'regressor'
# step of the pipeline.
# NOTE: per the scikit-learn docs, 'gamma' is the kernel coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels only, so the two gamma values produce
# equivalent candidates when the kernel is 'linear'.
param_grid = {
    'regressor__C': [0.1, 1, 10, 100],
    'regressor__epsilon': [0.1, 0.2, 0.5, 1],
    'regressor__kernel': ['linear', 'poly', 'rbf'],
    'regressor__gamma': ['scale', 'auto']
}

# 96 candidates x 5 folds = 480 independent fits; n_jobs=-1 spreads them over
# all available CPU cores without changing the outcome of the search.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# The score is the negated mean squared error (higher, i.e. closer to zero, is better).
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# Take the tuned regressor from the search. GridSearchCV refits the best
# pipeline on the full training split by default (refit=True), so
# best_estimator_ is already trained and no further fit call is needed.
best_model = grid_search.best_estimator_

# Predict prices for the held-out test set
y_pred = best_model.predict(X_test)

# Evaluate the regressor on the held-out test set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")

# Save the fitted pipeline (preprocessing + SVR) to a file for future use
joblib_file = "svr_model.pkl"
joblib.dump(best_model, joblib_file)
