In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Locations of the competition files. Bare names resolve against the kernel's
# current working directory, so adjust these if the CSVs live elsewhere.
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

try:
    train_df = pd.read_csv(TRAIN_CSV)
    test_df = pd.read_csv(TEST_CSV)
except FileNotFoundError as exc:
    # Re-raise the same exception type with an actionable message instead of
    # the bare errno text, keeping the original error chained for debugging.
    raise FileNotFoundError(
        f"Could not find {exc.filename!r}. Place {TRAIN_CSV!r} and {TEST_CSV!r} "
        "in the notebook's working directory or update TRAIN_CSV / TEST_CSV."
    ) from exc

# Preview the first rows of the training data
train_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
train_df.info()

In [None]:
# Print a summary of the test frame (columns, non-null counts, dtypes).
test_df.info()

In [None]:
# Report (rows, columns) for the training frame first, then for the test frame.
for frame in (train_df, test_df):
    print(frame.shape)

In [None]:
# List the columns holding at least one missing value: training frame first,
# a blank separator, then the test frame.
train_has_gap = train_df.isna().any()
print(train_df.columns[train_has_gap])
print("\n")
test_has_gap = test_df.isna().any()
print(test_df.columns[test_has_gap])

## Data Preprocessing

In [None]:
# Fill missing values and encode categorical data.
# Numeric (including boolean) columns are imputed with their median; every
# other column is treated as categorical and imputed with its most frequent
# value. Branching on "is numeric" rather than "dtype == 'object'" keeps
# string/category dtypes out of the median branch, where they would raise.
for column in train_df.columns:
    if pd.api.types.is_numeric_dtype(train_df[column]):
        train_df[column] = train_df[column].fillna(train_df[column].median())
    else:
        most_frequent = train_df[column].mode()
        # mode() is empty when the column has no observed values at all;
        # leave such a column untouched instead of failing on element 0.
        if not most_frequent.empty:
            train_df[column] = train_df[column].fillna(most_frequent.iloc[0])

# One-hot encode the categorical columns; numeric columns pass through unchanged.
train_df = pd.get_dummies(train_df)

## Visualize Correlations using Heatmap


In [None]:
# Pairwise correlations between all columns of the encoded training frame
corr_matrix = train_df.corr()

# Draw the matrix as a diverging heatmap centred on zero; the large canvas
# keeps the many feature labels legible
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Heatmap of Feature Correlation')
plt.show()

# Keep every column whose absolute correlation with the target exceeds the
# cutoff, leaving the target itself out of the feature list
threshold = 0.05  # Adjust the threshold as needed
target_strength = corr_matrix["SalePrice"].abs()
selected_features = [
    name
    for name in target_strength.index[target_strength > threshold]
    if name != 'SalePrice'
]

## Splitting data into training and validation sets

In [None]:
# Feature matrix: the correlation-selected columns of the encoded training frame
X = train_df.loc[:, selected_features]
# Target: natural log of the sale price
y = np.log(train_df['SalePrice'])
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

## Model training with linear regression

In [None]:
# Fit ordinary least squares on the training split; fit() returns the estimator
# itself, so construction and fitting can be chained.
linear_model = LinearRegression().fit(X_train, y_train)
# Show the fitted estimator as the cell output
linear_model

## Feature Importance from Linear Regression


In [None]:
# One coefficient per model feature, indexed by feature name
feature_importance = pd.DataFrame(linear_model.coef_, X.columns, columns=['Coefficient'])

# Rank by absolute coefficient, largest first, and keep the ten largest.
# (Sorting ascending before head(10) would select the ten *smallest* instead.)
# NOTE(review): the features are not scaled before fitting, so coefficient
# magnitude also reflects each feature's units — treat this ranking as a
# rough guide only.
top_features = feature_importance.abs().sort_values(by='Coefficient', ascending=False).head(10)

# barh draws the first row at the bottom, so plot in ascending order to put
# the largest coefficient at the top of the chart
top_features.sort_values(by='Coefficient').plot(kind='barh')
plt.title('Top 10 Important Features for House Price Prediction')
plt.show()

In [None]:
# Predictions for validation set using Linear Regression.
# The model is fit on log(SalePrice), so these predictions are on the log scale.
y_pred_linear = linear_model.predict(X_val)

## Model Evaluation for Linear Regression


In [None]:
# Predictions for validation set using Linear Regression (log-price scale)
y_pred_linear = linear_model.predict(X_val)

# Score the validation predictions. The target is log(SalePrice), so the error
# metrics below are expressed in log units.
val_mse = mean_squared_error(y_val, y_pred_linear)
print(f"Validation MSE (log scale):  {val_mse:.4f}")
print(f"Validation RMSE (log scale): {np.sqrt(val_mse):.4f}")
print(f"Validation MAE (log scale):  {mean_absolute_error(y_val, y_pred_linear):.4f}")
print(f"Validation R^2:              {r2_score(y_val, y_pred_linear):.4f}")

# One-hot encode the test data so it carries the same dummy columns the model
# was trained on. Reindexing the raw frame instead would set every dummy
# feature to 0 for every test row.
# NOTE(review): missing categorical values in the test set are left unfilled
# here (all dummies of that group become 0), whereas training filled them with
# the mode — confirm this is acceptable.
test_encoded = pd.get_dummies(test_df)

# Align to the training feature set: drop extra columns, and add any dummy
# column that never occurs in the test data as all zeros
needed_columns = X.columns
test_df_with_all_columns = test_encoded.reindex(columns=needed_columns, fill_value=0)

# Impute remaining numeric gaps with the training medians, mirroring the
# median imputation applied to the training data
columns_with_gaps = test_df_with_all_columns.columns[test_df_with_all_columns.isna().any()]
if len(columns_with_gaps) > 0:
    test_df_with_all_columns = test_df_with_all_columns.fillna(X[columns_with_gaps].median())

# Last-resort fallback so the model never receives NaN
test_df_with_all_columns = test_df_with_all_columns.fillna(0)

# Use the corrected DataFrame for predictions
final_test = test_df_with_all_columns

test_predictions = linear_model.predict(final_test)

# Convert predictions back from log scale, since the target was log-transformed for training
final_predictions = np.exp(test_predictions)

# Build the submission table: one predicted price per test Id
predicted_result = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': final_predictions
})
predicted_result.head()

In [None]:
# Write the predictions to disk without the pandas index column, then report completion.
submission_path = 'submission.csv'
predicted_result.to_csv(submission_path, index=False)
print("Task 1 completed")