### Importing libraries ###

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score


*Getting Data*

In [None]:
# Load the training data; 'train.csv' is a relative path, so it is resolved
# against the notebook's current working directory.
df = pd.read_csv('train.csv')

In [None]:
# Preview the first rows (head() defaults to 5) to sanity-check the load.
df.head()

*Stats of data*

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Per-column dtype and non-null count; columns whose non-null count is below
# the row count contain missing values.
df.info() 

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Full list of column names, for reference when selecting features later on.
df.columns

*Counting the occurrences of each category in every categorical column*

In [None]:
# Print the frequency table of every object-dtype (categorical) column.
object_columns = [name for name in df.columns if df[name].dtype == "object"]
for name in object_columns:
    print(df[name].value_counts())

*Removing columns where more than 60% of the values are missing*

In [None]:
# Flag the columns whose share of missing values exceeds the threshold.
# isnull().mean() divides by the frame's actual row count, so the percentage
# stays correct for any dataset size (the previous version hard-coded 1460 rows).
MISSING_THRESHOLD_PCT = 60
missing_pct = df.isnull().mean() * 100
columns_with_missing_values = df.columns[missing_pct > MISSING_THRESHOLD_PCT].tolist()
print(columns_with_missing_values)

In [None]:
# Work on a reduced copy without the sparse columns; `columns=` already
# selects the column axis, so no separate `axis` argument is needed.
new_df = df.drop(columns=columns_with_missing_values)

In [None]:
# Missing values still present in each remaining column.
new_df.isna().sum()

*Numerical columns*

In [None]:
# Names of all numeric-dtype columns in the reduced frame.
numeric_frame = new_df.select_dtypes(include="number")
numerical_columns = numeric_frame.columns.tolist()
numerical_columns

*Categorical columns*

In [None]:
# Names of all non-numeric columns in the reduced frame.
non_numeric_frame = new_df.select_dtypes(exclude="number")
categorical_columns = non_numeric_frame.columns.tolist()
categorical_columns

*Filling missing values in the numerical columns with the column mean*

In [None]:
# Mean imputation for the numeric columns: each NaN is replaced by the mean
# of its own column (fillna aligns the Series of means on column name).
# NOTE(review): the means are computed on the full dataset, before the
# train/test split further down — confirm this leakage is acceptable.
numeric_part = new_df[numerical_columns]
mean_values = numeric_part.mean()
new_df[numerical_columns] = numeric_part.fillna(mean_values)

In [None]:
# Pairwise correlations (DataFrame.corr default method) between the candidate
# features and the target, SalePrice.
heatmap_columns = [
    'LotArea', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath', 'BedroomAbvGr', 'SalePrice',
]
correlations = new_df[heatmap_columns].corr()

# Render the correlation matrix as an un-annotated heatmap.
palette = sns.cubehelix_palette(as_cmap=True)
sns.heatmap(correlations, annot=False, cmap=palette)
plt.title('Correlation Heatmap')
plt.show()


*Splitting values into feature and target variables*

In [None]:
# Feature matrix (column order is preserved in the array) and target vector,
# both extracted as plain NumPy arrays.
feature_columns = [
    'TotalBsmtSF', 'FullBath', 'HalfBath', 'BsmtHalfBath',
    'BsmtFullBath', 'LotArea', 'BedroomAbvGr',
]
X = new_df[feature_columns].values
y = new_df['SalePrice'].values

*Splitting into train and test*

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

### Model building ###

In [None]:
# Linear regression estimator constructed with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit on the training split only; the test split is kept aside for evaluation.
model.fit(X_train,y_train)

In [None]:
# Predicted target values for the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Root-mean-squared error of the test predictions, in the same units as the target.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Coefficient of determination (R²) on the test split.
r2 = r2_score(y_test, y_pred)

# Only a cell's last expression is displayed, so show both metrics together.
rmse, r2

In [None]:
# R² is a goodness-of-fit measure for regression, not an accuracy: it can be
# negative and is not a percentage of correct predictions, so report it as R².
print(f"The model R² score on the test set is {round(r2, 4)}")