<a href="https://colab.research.google.com/github/23epstein/AI-Linear-Regression-Test/blob/main/ai_regression_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Load the housing dataset; change the filename below if your copy is named differently
df = pd.read_csv(filepath_or_buffer='house_price_data.csv')
df.head(n=5)

In [None]:
# Print the column summary, then show the number of missing values per column
df.info()
df.isna().sum()

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Values equal to a fence are kept. Rows where ``column`` is missing (NaN) are
    dropped, because NaN never satisfies the range check.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column : str
        Name of the numeric column to screen for outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences. Larger values keep
        more rows; ``0`` keeps only the values between Q1 and Q3.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the retained rows with their original index labels.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``df``.
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = factor * (q3 - q1)

    # between() is inclusive on both ends by default, matching >= / <=
    in_range = values.between(q1 - spread, q3 + spread)

    # Explicit copy so later edits to the result never touch (or warn about) df
    return df[in_range].copy()

# Filter on area first, then on price; the price fences are computed
# from the rows that survived the area filter
df_clean = (
    df
    .pipe(remove_outliers_iqr, 'SqFt Area')
    .pipe(remove_outliers_iqr, 'Price (INR in Lakhs)')
)

# Report how many rows the filtering removed, then preview the result
print("Original dataset size:", len(df))
print("Cleaned dataset size:", len(df_clean))
df_clean.head()


In [None]:
from sklearn.model_selection import train_test_split

# Predictor stays two-dimensional (one-column DataFrame); target is a Series
X = df_clean.loc[:, ['SqFt Area']]
y = df_clean.loc[:, 'Price (INR in Lakhs)']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the shape of each partition
for _label, _part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(_label, _part.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Create the regressor and fit it on the training split in one step
# (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out test rows
y_pred = model.predict(X_test)


In [None]:
# Parameters of the fitted line: price = slope * area + intercept
slope, intercept = model.coef_[0], model.intercept_
print("Slope (Coefficient):", slope)
print("Intercept:", intercept)

In [None]:
# Predict prices for the test rows
y_pred = model.predict(X_test)

# Put each test row's area, true price and predicted price side by side
predicted_vs_actual = X_test[['SqFt Area']].assign(**{
    'Actual Price': y_test,
    'Predicted Price': y_pred,
})

# First 10 rows after sorting by area (ascending)
predicted_vs_actual.sort_values('SqFt Area').iloc[:10]

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the true and predicted test prices
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error (MSE):", round(mse, ndigits=2))


In [None]:
import numpy as np

# RMSE is the square root of the MSE, so it is in the same units as the target
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", round(rmse, ndigits=2))


In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination on the test split
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R² Score:", round(r2, ndigits=3))


In [None]:
import matplotlib.pyplot as plt

# Actual vs. predicted prices; a point on the dashed diagonal is a perfect prediction
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', edgecolor='k', alpha=0.7)

# Diagonal reference line spanning the range of the actual prices
price_span = [y_test.min(), y_test.max()]
ax.plot(price_span, price_span, 'r--', lw=2)

ax.set_xlabel('Actual Price (INR in Lakhs)')
ax.set_ylabel('Predicted Price (INR in Lakhs)')
ax.set_title('Actual vs. Predicted House Prices')
ax.grid(True)
plt.show()