# In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
# Read the three CSV inputs: the submission template, the test table and the
# training table.
sample_submission = pd.read_csv('House price Prediction/Participants_Data_HPP/sample_submission.csv')
test_data = pd.read_csv('House price Prediction/Participants_Data_HPP/Test.csv')
train_data = pd.read_csv('House price Prediction/Participants_Data_HPP/Train.csv')

# Show the column index of each table (training first, then test).
for heading, frame in (
    ("Columns in the training dataset:", train_data),
    ("Columns in the test dataset:", test_data),
):
    print(heading)
    print(frame.columns)
    
# Preview the leading rows of each table (training first, then test).
for heading, frame in (
    ("First few rows of the train dataset:", train_data),
    ("\nFirst few rows of the test dataset:", test_data),
):
    print(heading)
    print(frame.head())
# Split the training dataset into features (X_train) and target variable (y_train)
X_train = train_data.drop('TARGET(PRICE_IN_LACS)', axis=1)  # Features
y_train = train_data['TARGET(PRICE_IN_LACS)']  # Target variable

# The test dataset carries no target column, so it is used as-is for features
X_test = test_data

# Columns that are one-hot encoded before fitting.
CATEGORICAL_COLUMNS = ['POSTED_BY', 'BHK_OR_RK']

# Learn the category levels from the training data only, then apply the SAME
# levels to the test data. Encoding each frame independently (as get_dummies
# does on plain object columns) lets the two frames end up with different
# dummy columns -- and a different dropped baseline under drop_first=True --
# whenever a level is absent from, or extra in, the test set.
train_categoricals = {
    column: pd.Categorical(X_train[column]) for column in CATEGORICAL_COLUMNS
}
test_categoricals = {
    # Test values not seen in training become NaN and therefore encode as
    # all-zero dummies (i.e. they fall back to the baseline level).
    column: pd.Categorical(
        X_test[column], categories=train_categoricals[column].categories
    )
    for column in CATEGORICAL_COLUMNS
}

# Convert categorical variables into dummy/indicator variables (one-hot encoding).
# `assign` returns new frames, so train_data / test_data are left untouched.
X_train_encoded = pd.get_dummies(
    X_train.assign(**train_categoricals),
    columns=CATEGORICAL_COLUMNS,
    drop_first=True,
)
X_test_encoded = pd.get_dummies(
    X_test.assign(**test_categoricals),
    columns=CATEGORICAL_COLUMNS,
    drop_first=True,
)

# Drop the free-text 'ADDRESS' column from both the training and test datasets
X_train_encoded = X_train_encoded.drop('ADDRESS', axis=1)
X_test_encoded = X_test_encoded.drop('ADDRESS', axis=1)

# Give the test matrix exactly the training columns, in the training order.
# A feature missing from the test data raises KeyError here instead of
# surfacing later as a feature mismatch inside model.predict.
X_test_encoded = X_test_encoded[X_train_encoded.columns]

# Build and fit the linear regression in one step (fit returns the estimator).
model = LinearRegression().fit(X_train_encoded, y_train)

# Score both feature matrices: the test set for the submission and the
# training set for the in-sample comparison plotted later.
y_pred = model.predict(X_test_encoded)
y_train_pred = model.predict(X_train_encoded)

# Wrap each prediction array in a single-column DataFrame with a descriptive
# column name.
test_predictions_df = pd.Series(y_pred, name='Predicted Price for Test Set').to_frame()
train_predictions_df = pd.Series(
    y_train_pred, name='Predicted Price for Training Set'
).to_frame()

# Report the test-set predictions first, then the training-set predictions.
print("Predicted prices for the test set:")
print(test_predictions_df)
print("\nPredicted prices for the training set:")
print(train_predictions_df)

# Store the test predictions in the submission frame and write it to disk
# without the index column.
sample_submission['TARGET(PRICE_IN_LACS)'] = y_pred
sample_submission.to_csv('predicted_prices.csv', index=False)

# Raw prediction array, for quick inspection.
print("\nPredicted prices:", y_pred)

# Actual vs. predicted prices on the training set, drawn through the
# object-oriented Axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_train, y_train_pred, color='blue', alpha=0.5)
ax.set_title('Actual vs Predicted Prices in Training Set')
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.grid(True)
plt.show()

# Define the number of bins for the histogram
num_bins = 20

# Bin the test-set predictions. `hist` holds the per-bin counts and `bins`
# the num_bins + 1 bin edges.
hist, bins = np.histogram(test_predictions_df['Predicted Price for Test Set'], bins=num_bins)

# Plot the bar graph.
plt.figure(figsize=(10, 6))
# Anchor every bar at its bin's LEFT edge and let it span the bin's width.
# With the default align='center' the bars would be centred on the left
# edges, drawing the whole histogram half a bin too far to the left.
plt.bar(bins[:-1], hist, width=np.diff(bins), align='edge', color='green', alpha=0.7)
plt.title('Distribution of Predicted Prices in Test Set')
plt.xlabel('Predicted Prices')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Pie chart: share of test-set predictions falling into each price range.
predicted_prices = test_predictions_df['Predicted Price for Test Set']

# Define the bins for the pie chart (price ranges)
bins = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

# Count the predictions inside each range. np.histogram ignores values
# outside [bins[0], bins[-1]] (the last bin includes its right edge).
in_range_counts, _ = np.histogram(predicted_prices, bins=bins)

# A linear model can predict below the first edge (even negative prices) or
# above the last one. Count those explicitly so the percentages describe the
# whole test set instead of silently dropping rows.
below_count = int((predicted_prices < bins[0]).sum())
above_count = int((predicted_prices > bins[-1]).sum())

all_counts = [below_count, *in_range_counts.tolist(), above_count]
all_labels = [
    f'<{bins[0]}',
    *(f'{low}-{high}' for low, high in zip(bins[:-1], bins[1:])),
    f'>{bins[-1]}',
]

# Omit empty ranges: zero-width wedges only add overlapping "0.0%" labels.
non_empty = [(label, count) for label, count in zip(all_labels, all_counts) if count > 0]
labels = [label for label, _ in non_empty]
freq = np.array([count for _, count in non_empty], dtype=int)

# Plot the pie chart with adjusted figure size and font size
plt.figure(figsize=(10, 10))
plt.title('Distribution of Predicted Prices in Test Set', fontsize=16)
plt.pie(freq, labels=labels, autopct='%1.1f%%', startangle=140, textprops={'fontsize': 12})
plt.axis('equal')  # keep the pie circular
plt.show()