In [None]:
!pip install ydata-profiling

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from ydata_profiling import ProfileReport

In [None]:
# Read the training and test CSV files from the working directory
# (df1 = train split, df2 = test split).
df1, df2 = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Build a ydata-profiling report for the raw training frame and write it to
# 'report.html' in the working directory.
rep = ProfileReport(df1)
rep.to_file(output_file='report.html')



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]



Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# **Feature Engineering**

In [None]:
# Percentage of missing values in every column of the training frame.
df1.isna().mean().mul(100)

In [None]:
# Split the training frame by dtype. Explicit copies make both frames independent
# of df1, so the column assignments performed in later cells can never act on a
# view of df1 (and never trigger SettingWithCopyWarning on older pandas).
numerical = df1.select_dtypes(include=np.number).copy()
categorical = df1.select_dtypes(include=object).copy()

## **Categorical Features**

In [None]:
# Categorical columns excluded from modelling.
# NOTE(review): presumably chosen because they are mostly missing - confirm
# against the missing-value percentages of df1 computed earlier.
columns_to_drop = ['MiscFeature', 'Fence', 'PoolQC', 'FireplaceQu', 'MasVnrType', 'Alley']

# Reassign instead of mutating in place so the cell is idempotent and safe to
# re-run; errors='ignore' keeps the cell working if a column is already gone.
categorical = categorical.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Percentage of missing values per remaining categorical column.
categorical.isna().mean().mul(100)

MSZoning         0.0
Street           0.0
LotShape         0.0
LandContour      0.0
Utilities        0.0
LotConfig        0.0
LandSlope        0.0
Neighborhood     0.0
Condition1       0.0
Condition2       0.0
BldgType         0.0
HouseStyle       0.0
RoofStyle        0.0
RoofMatl         0.0
Exterior1st      0.0
Exterior2nd      0.0
ExterQual        0.0
ExterCond        0.0
Foundation       0.0
BsmtQual         0.0
BsmtCond         0.0
BsmtExposure     0.0
BsmtFinType1     0.0
BsmtFinType2     0.0
Heating          0.0
HeatingQC        0.0
CentralAir       0.0
Electrical       0.0
KitchenQual      0.0
Functional       0.0
GarageType       0.0
GarageFinish     0.0
GarageQual       0.0
GarageCond       0.0
PavedDrive       0.0
SaleType         0.0
SaleCondition    0.0
dtype: float64

In [None]:
# Show a random sample of rows; the fixed seed makes the displayed sample
# identical on every Restart & Run All.
categorical.sample(17, random_state=42)

In [None]:
# List of categorical columns with missing values
categorical_cols_with_missing = ['GarageQual', 'GarageCond', 'GarageType', 'GarageFinish',
                                'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                'BsmtFinType2', 'Electrical']

# Impute missing values with the most frequent value of each column.
# The result is assigned back to the column: `categorical[col].fillna(..., inplace=True)`
# is a chained assignment that only modifies a temporary Series under pandas
# Copy-on-Write (and emits a FutureWarning on pandas 2.x), leaving the frame unchanged.
for col in categorical_cols_with_missing:
    most_frequent = categorical[col].mode()[0]
    categorical[col] = categorical[col].fillna(most_frequent)

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Handle ordinal features
# NOTE(review): OrdinalEncoder() with its default categories assigns codes in
# sorted label order (e.g. 'Ex' < 'Fa' < 'Gd' < 'TA'), not in quality order.
# Pass an explicit `categories=` list per column if the codes must be monotonic
# in quality.
ordinal_cols = ['LotShape', 'LandSlope', 'ExterQual', 'ExterCond',
                'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional',
                'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive']
ordinal_encoder = OrdinalEncoder()
categorical[ordinal_cols] = ordinal_encoder.fit_transform(categorical[ordinal_cols])

# Handle nominal features
nominal_cols = ['MSZoning', 'Street', 'LandContour', 'Utilities', 'LotConfig',
               'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
               'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
               'Exterior2nd', 'Heating', 'CentralAir', 'Electrical',
               'GarageType', 'SaleType', 'SaleCondition']
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back for old versions.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe_array = ohe.fit_transform(categorical[nominal_cols])

# Create a DataFrame from the one-hot encoded array. Reusing categorical's index
# keeps every dummy row attached to the house it was computed from.
ohe_df = pd.DataFrame(
    ohe_array,
    columns=ohe.get_feature_names_out(nominal_cols),
    index=categorical.index,
)

# Concatenate ordinal and nominal encoded features
encoded_categorical = pd.concat([categorical[ordinal_cols], ohe_df], axis=1)



In [None]:
# Rebuild the encoded frame on a clean positional index.
# The ordinal block is reset on a copy rather than written back into
# `categorical`: assigning a reset-index frame to existing columns aligns on
# labels and would inject NaN wherever the two indexes differ.
ordinal_encoded = categorical[ordinal_cols].reset_index(drop=True)
ohe_df = ohe_df.reset_index(drop=True)

# Concatenate ordinal and nominal encoded features
encoded_categorical = pd.concat([ordinal_encoded, ohe_df], axis=1)

In [None]:
# Preview the first rows of the encoded categorical features.
encoded_categorical.head()

Unnamed: 0,LotShape,LandSlope,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,3.0,0.0,2.0,4.0,2.0,3.0,3.0,2.0,5.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3.0,0.0,3.0,4.0,2.0,3.0,1.0,0.0,5.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,2.0,4.0,2.0,3.0,2.0,2.0,5.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,3.0,4.0,3.0,1.0,3.0,0.0,5.0,2.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,2.0,4.0,2.0,3.0,0.0,2.0,5.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Summarise row count, column count and dtypes of the encoded frame.
encoded_categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 168 entries, LotShape to SaleCondition_Partial
dtypes: float64(152), int64(16)
memory usage: 1.9 MB


In [None]:
# Cast every ordinal-encoded column to int64 in a single astype call.
encoded_categorical = encoded_categorical.astype(dict.fromkeys(ordinal_cols, 'int64'))

In [None]:
# Percentage of missing values per encoded column.
encoded_categorical.isna().mean().mul(100)

LotShape                 0.0
LandSlope                0.0
ExterQual                0.0
ExterCond                0.0
BsmtQual                 0.0
                        ... 
SaleCondition_AdjLand    0.0
SaleCondition_Alloca     0.0
SaleCondition_Family     0.0
SaleCondition_Normal     0.0
SaleCondition_Partial    0.0
Length: 168, dtype: float64

In [None]:
# Build a ydata-profiling report for the categorical frame and write it to
# 'categorical.html'. This rebinds `rep`, replacing the earlier report object.
rep = ProfileReport(categorical)
rep.to_file(output_file='categorical.html')



Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## **Numerical Features**

In [None]:
# Percentage of missing values per numeric column.
numerical.isna().mean().mul(100)

In [None]:
# Summarise row count, non-null counts and dtypes of the numeric frame.
numerical.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1121 entries, 0 to 1459
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   OverallQual   1121 non-null   int64  
 1   YearBuilt     1121 non-null   int64  
 2   YearRemodAdd  1121 non-null   int64  
 3   TotalBsmtSF   1121 non-null   int64  
 4   1stFlrSF      1121 non-null   int64  
 5   GrLivArea     1121 non-null   int64  
 6   FullBath      1121 non-null   int64  
 7   TotRmsAbvGrd  1121 non-null   int64  
 8   GarageYrBlt   1121 non-null   float64
 9   GarageCars    1121 non-null   int64  
 10  GarageArea    1121 non-null   int64  
 11  SalePrice     1121 non-null   int64  
dtypes: float64(1), int64(11)
memory usage: 113.9 KB


In [None]:
# Keep only complete rows. The recorded `numerical.info()` output (1121 non-null
# rows on a non-contiguous index) indicates this step was executed interactively
# but never saved in a cell; it is needed because LinearRegression does not
# accept NaN inputs. It is a no-op when the frame is already complete.
numerical = numerical.dropna()

# Minimum absolute Pearson correlation with the target for a feature to be kept.
corr_threshold = 0.5

# Calculate the correlation with 'SalePrice'
correlations = numerical.corrwith(numerical['SalePrice'])

# Select weakly correlated columns. The absolute value keeps strong negative
# predictors, and negating `>=` (rather than testing `<`) also selects columns
# whose correlation is NaN, because every comparison with NaN is False.
is_weak = ~(correlations.abs() >= corr_threshold)
columns_to_drop = correlations.index[is_weak]

# Drop the columns from 'numerical'
numerical = numerical.drop(columns=columns_to_drop)

# Display the remaining columns
print(numerical.columns)

In [None]:
# Correlation of every remaining numeric column with the target, strongest first.
(
    numerical
    .corr()
    .loc[:, 'SalePrice']
    .sort_values(ascending=False)
)

In [None]:
# Re-check the missing-value percentage after the correlation filter.
numerical.isna().mean().mul(100)

## **Combined Features (`combined_df1`)**

In [None]:
# Join numeric and encoded categorical features on their shared row labels.
# `numerical` may contain fewer rows than `encoded_categorical` (rows with
# missing values removed), so its index must NOT be reset before the join:
# resetting pairs each numeric row with the categorical features of a different
# house and pads the shorter frame with all-NaN rows. An inner join keeps exactly
# the rows present in both frames.
# Assumes both frames still carry df1's original row labels.
assert numerical.index.isin(encoded_categorical.index).all(), \
    "numerical has row labels that are missing from encoded_categorical"

combined_df1 = pd.concat([numerical, encoded_categorical], axis=1, join='inner')
assert len(combined_df1) == len(numerical)

# Only now switch to a clean positional index.
combined_df1 = combined_df1.reset_index(drop=True)

# Display the first few rows of the combined DataFrame
combined_df1.head()

In [None]:
# Minimum absolute Pearson correlation with the target for a feature to be kept.
corr_threshold = 0.5

# Calculate the correlation with 'SalePrice'
correlations = combined_df1.corrwith(combined_df1['SalePrice'])

# Select weakly correlated columns. Zero-variance dummy columns have an undefined
# (NaN) correlation; `NaN < 0.5` is False, so a plain `<` test silently keeps
# them. Negating `>=` drops them, and the absolute value keeps features with a
# strong negative relationship to the target.
is_weak = ~(correlations.abs() >= corr_threshold)
columns_to_drop = correlations.index[is_weak]

# Drop the columns from 'combined_df1'
combined_df1 = combined_df1.drop(columns=columns_to_drop)

# Display the remaining columns
print(combined_df1.columns)

In [None]:
# Correlation of every remaining combined feature with the target, strongest first.
(
    combined_df1
    .corr()
    .loc[:, 'SalePrice']
    .sort_values(ascending=False)
)

In [None]:
# Percentage of missing values per column of the combined frame.
combined_df1.isna().mean().mul(100)

OverallQual            23.219178
YearBuilt              23.219178
YearRemodAdd           23.219178
TotalBsmtSF            23.219178
1stFlrSF               23.219178
GrLivArea              23.219178
FullBath               23.219178
TotRmsAbvGrd           23.219178
GarageYrBlt            23.219178
GarageCars             23.219178
GarageArea             23.219178
SalePrice              23.219178
Condition2_RRAe         0.000000
RoofStyle_Shed          0.000000
RoofMatl_ClyTile        0.000000
RoofMatl_Roll           0.000000
Exterior1st_CBlock      0.000000
Exterior1st_ImStucc     0.000000
Exterior2nd_CBlock      0.000000
Heating_Floor           0.000000
Heating_OthW            0.000000
dtype: float64

In [None]:
# Build a ydata-profiling report for the combined feature frame and write it to
# 'combined_df1.html'. This rebinds `rep`, replacing the earlier report object.
rep = ProfileReport(combined_df1)
rep.to_file(output_file='combined_df1.html')

# **Model Training**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Separate the target from the predictors. Only the numeric frame is used here;
# combined_df1 is not part of the model input.
y = numerical['SalePrice']
X = numerical.drop(columns=['SalePrice'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Ordinary least-squares linear regression with default settings.
lr = LinearRegression()

In [None]:
# Fit on the training split. scikit-learn's fit() returns the estimator itself,
# so `model` and `lr` refer to the same fitted object.
model = lr.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out rows.
pred = model.predict(X_test)

In [None]:
# For a regressor, `score` returns the coefficient of determination (R^2), not a
# classification accuracy, so report it under its real name and add RMSE as an
# error measure in the target's own units.
r2 = model.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))

# Legacy name kept so that any later cell reading `accuracy_score` still works.
accuracy_score = r2

print(f"R^2: {r2:.2f}")
print(f"RMSE: {rmse:,.0f}")

Accuracy: 0.75
