In [1]:
# Importing required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer



In [2]:
# Read the train and test data
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [3]:
# Display the first few rows of the train data
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Get information about the train data, including data types and missing values
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Separate features (X) and target (y) columns
X = train_data.drop('SalePrice', axis=1)
y = train_data['SalePrice']

In [6]:
# Select numeric and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [7]:
# Impute missing values for numeric features with the mean
numeric_imputer = SimpleImputer(strategy='mean')
X[numeric_features] = numeric_imputer.fit_transform(X[numeric_features])
test_data[numeric_features] = numeric_imputer.transform(test_data[numeric_features])

In [8]:
# Impute missing values for categorical features with the most frequent value
categorical_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = categorical_imputer.fit_transform(X[categorical_features])
test_data[categorical_features] = categorical_imputer.transform(test_data[categorical_features])

In [9]:
# Encode categorical features using OneHotEncoder
encoder = OneHotEncoder(drop='first')
X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_features]).toarray())
test_data_encoded = pd.DataFrame(encoder.transform(test_data[categorical_features]).toarray())


In [10]:
# Concatenate encoded categorical features with numeric features
X = pd.concat([X.drop(categorical_features, axis=1), X_encoded], axis=1)
test_data = pd.concat([test_data.drop(categorical_features, axis=1), test_data_encoded], axis=1)

In [11]:
# Standardize numeric features
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])
test_data[numeric_features] = scaler.transform(test_data[numeric_features])

In [12]:
# Convert column names to strings
X.columns = X.columns.astype(str)
test_data.columns = test_data.columns.astype(str)

In [13]:
# Reduce dimensionality using PCA
pca = PCA(n_components=0.95)  
X_pca = pd.DataFrame(pca.fit_transform(X))
test_data_pca = pd.DataFrame(pca.transform(test_data))

In [14]:
# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [15]:
# Import TensorFlow and define the neural network model
import tensorflow as tf
from tensorflow.keras import layers, models




In [16]:
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.2),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])




In [17]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')




In [18]:
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x1735e321730>

In [19]:
# Make predictions on the validation set
val_predictions = model.predict(X_val)



In [20]:
# Convert predictions back to original scale
val_predictions = np.expm1(np.clip(val_predictions, -10, 10))

In [21]:
# Calculate root mean squared error on the validation set
rmse = np.sqrt(mean_squared_error(train_data['SalePrice'].loc[y_val.index], val_predictions))
print(f'Root Mean Squared Error on Validation Set: {rmse}')

Root Mean Squared Error on Validation Set: 179613.6564115453


In [22]:
# Make predictions on the test set
test_predictions = model.predict(test_data_pca)



In [23]:
# Convert predictions back to original scale
test_predictions = np.expm1(np.clip(test_predictions, -10,10))

In [24]:
# Create submission file
sample_submission_df = pd.read_csv('./sample_submission.csv')
sample_submission_df['SalePrice'] = test_predictions
sample_submission_df.to_csv('./submission.csv', index=False)
sample_submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,22025.464844
1,1462,22025.464844
2,1463,22025.464844
3,1464,22025.464844
4,1465,22025.464844
