In [5]:
#Preprocessed data in Assignment 4
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

file_path = '/content/drive/My Drive/CSE422 LAB/Assignment 4 (ML)/Dataset/Housing Price.xlsx'
df = pd.read_excel(file_path)

# Task 1: Remove null values
df_cleaned = df.dropna()

# Task 2: Remove duplicate rows
df_cleaned = df_cleaned.drop_duplicates()

# Task 3: Handle categorical variables (Binary encoding and One-Hot Encoding)
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
df_cleaned[binary_columns] = df_cleaned[binary_columns].replace({'yes': 1, 'no': 0})

# One-Hot Encoding for 'furnishingstatus'
df_cleaned = pd.get_dummies(df_cleaned, columns=['furnishingstatus'], drop_first=True)

# Task 4: Feature scaling for continuous variables
scaler = StandardScaler()
df_cleaned[['price', 'area', 'parking']] = scaler.fit_transform(df_cleaned[['price', 'area', 'parking']])

# Task 5: Remove variables with high correlation (threshold > 0.8)
corr_matrix = df_cleaned.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
df_cleaned.drop(columns=to_drop, inplace=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score

# Step 3: Separate target variable (price) and features for linear regression
X = df_cleaned.drop(columns=['price'])
y = df_cleaned['price']

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Apply Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Step 6: Evaluate Linear Regression (MSE, RMSE)
mse = mean_squared_error(y_test, y_pred_linear)
rmse = np.sqrt(mse)
print(f"Linear Regression MSE: {mse}")
print(f"Linear Regression RMSE: {rmse}")

# Step 7: Convert 'price' into binary categories for Logistic Regression
threshold = y.median()
y_binary = (y > threshold).astype(int)

# Step 8: Split the data into training and testing sets for logistic regression
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y_binary, test_size=0.3, random_state=42)

# Step 9: Apply Logistic Regression
logistic_model = LogisticRegression()
logistic_model.fit(X_train_log, y_train_log)
y_pred_log = logistic_model.predict(X_test_log)

# Step 10: Evaluate Logistic Regression (Accuracy)
accuracy = accuracy_score(y_test_log, y_pred_log)
print(f"Logistic Regression Accuracy: {accuracy}")

Linear Regression MSE: 0.5558727137233054
Linear Regression RMSE: 0.7455687183105963
Logistic Regression Accuracy: 0.8509316770186336


In [None]:
# Step 1: Load and preprocess data as done in Assignment 4
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score

drive.mount('/content/drive')

# Load dataset from Google Drive
file_path = '/content/drive/My Drive/CSE422 LAB/Assignment 4 (ML)/Dataset/Housing Price.xlsx'
df = pd.read_excel(file_path)

# Step 2: Preprocess the data (same as in Assignment 4)
df_cleaned = df.dropna()
df_cleaned = df_cleaned.drop_duplicates()

binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
df_cleaned[binary_columns] = df_cleaned[binary_columns].replace({'yes': 1, 'no': 0})

df_cleaned = pd.get_dummies(df_cleaned, columns=['furnishingstatus'], drop_first=True)

scaler = StandardScaler()
df_cleaned[['price', 'area', 'parking']] = scaler.fit_transform(df_cleaned[['price', 'area', 'parking']])

# Remove highly correlated features
corr_matrix = df_cleaned.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
df_cleaned.drop(columns=to_drop, inplace=True)

# Step 3: Separate target variable (price) and features for linear regression
X = df_cleaned.drop(columns=['price'])
y = df_cleaned['price']

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Apply Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Step 6: Evaluate Linear Regression (MSE, RMSE)
mse = mean_squared_error(y_test, y_pred_linear)
rmse = np.sqrt(mse)
print(f"Linear Regression MSE: {mse}")
print(f"Linear Regression RMSE: {rmse}")

# Step 7: Convert 'price' into binary categories for Logistic Regression
threshold = y.median()  # You can choose any threshold
y_binary = (y > threshold).astype(int)

# Step 8: Split the data into training and testing sets for logistic regression
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y_binary, test_size=0.3, random_state=42)

# Step 9: Apply Logistic Regression
logistic_model = LogisticRegression()
logistic_model.fit(X_train_log, y_train_log)
y_pred_log = logistic_model.predict(X_test_log)

# Step 10: Evaluate Logistic Regression (Accuracy)
accuracy = accuracy_score(y_test_log, y_pred_log)
print(f"Logistic Regression Accuracy: {accuracy}")


Mounted at /content/drive
Linear Regression MSE: 0.5558727137233054
Linear Regression RMSE: 0.7455687183105963
Logistic Regression Accuracy: 0.8509316770186336
