# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
pd.set_option('display.max_rows', None)
print("Missing values in each column:\n", train.isnull().sum())

Missing values in each column:
 Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType        872
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinSF1          0
BsmtFinType2       38
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0


In [6]:
train.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'MasVnrType', 'FireplaceQu'], axis=1, inplace=True)

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [8]:
name_of_coll = train.drop(columns="Id").select_dtypes(include=['number']).columns
for col in name_of_coll:
    nan_indices = train[col].isnull()  # Find the indices of NaN
    random_samples = train[col].dropna().sample(n=nan_indices.sum(), replace=True)  # Sample of the column without NaN
    train.loc[nan_indices, col] = random_samples.values

In [9]:
col_has_numbers = train.drop(columns="Id").select_dtypes(include=['number'])
col_has_numbers.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
MSSubClass,0
HalfBath,0
KitchenAbvGr,0
TotRmsAbvGrd,0
Fireplaces,0
GarageYrBlt,0
GarageCars,0
GarageArea,0
WoodDeckSF,0
OpenPorchSF,0


In [10]:
name_of_coll = train.drop(columns="Id").select_dtypes(include=['object']).columns

for col in name_of_coll:
    # Using mode()
    mode_for_coll = train[col].mode()[0]
    train[col].fillna(mode_for_coll, inplace=True)

In [11]:
col_has_objects = train.drop(columns="Id").select_dtypes(include=['object'])
col_has_objects.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
MSZoning,0
BsmtQual,0
BsmtExposure,0
BsmtFinType1,0
BsmtFinType2,0
Heating,0
HeatingQC,0
CentralAir,0
Electrical,0
KitchenQual,0


In [12]:
# Assuming 'col_has_objects' from previous cells holds the object columns
obj_col = col_has_objects.columns

encoder = LabelEncoder()
for col in obj_col: # Modified: Removed .values.flatten() - Not necessary when iterating a pandas index
    train[col] = encoder.fit_transform(train[col])

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   int64  
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   int64  
 6   LotShape       1460 non-null   int64  
 7   LandContour    1460 non-null   int64  
 8   Utilities      1460 non-null   int64  
 9   LotConfig      1460 non-null   int64  
 10  LandSlope      1460 non-null   int64  
 11  Neighborhood   1460 non-null   int64  
 12  Condition1     1460 non-null   int64  
 13  Condition2     1460 non-null   int64  
 14  BldgType       1460 non-null   int64  
 15  HouseStyle     1460 non-null   int64  
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

###Outliers

In [14]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns

import plotly.graph_objects as go
from plotly.subplots import make_subplots

numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
num_cols = len(numeric_cols)

fig = make_subplots(rows=(num_cols + 4) // 5, cols=min(num_cols, 5), subplot_titles=numeric_cols)

for i, col in enumerate(numeric_cols, 1):
    fig.add_trace(go.Box(y=train[col], name=col), row=(i-1)//5 + 1, col=(i-1) % 5 + 1)

fig.update_layout(height=((num_cols + 4) // 5) * 300, width=min(num_cols, 5) * 300, title="Boxplots of Numeric Features")

fig.show()


In [15]:
# Function to detect and handle outliers using IQR
def handle_outliers_iqr(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = np.clip(df[column], lower_bound, upper_bound)
  return df

# Iterate through numeric columns and handle outliers
for col in numeric_cols:
  train = handle_outliers_iqr(train, col)

# You can visualize the boxplots again to see the effect of outlier handling
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
num_cols = len(numeric_cols)

fig = make_subplots(rows=(num_cols + 4) // 5, cols=min(num_cols, 5), subplot_titles=numeric_cols)

for i, col in enumerate(numeric_cols, 1):
    fig.add_trace(go.Box(y=train[col], name=col), row=(i-1)//5 + 1, col=(i-1) % 5 + 1)

fig.update_layout(height=((num_cols + 4) // 5) * 300, width=min(num_cols, 5) * 300, title="Boxplots of Numeric Features (After Outlier Handling)")

fig.show()


# Split & Select Features

In [16]:
X = train.iloc[:, :-1].values
y = train.iloc[:, -1:].values

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [18]:
# StandardScaler
scaler = StandardScaler(copy=True, with_std=True)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
from sklearn.model_selection import KFold, cross_val_score
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(LinearRegression(), X_train, y_train, cv=cv)
print("Cross-validated scores:", scores)
print(f"Mean Cross-validated-accuracy: {scores.mean()}")

Cross-validated scores: [0.83121101 0.87687749 0.89276136 0.90445313 0.88483868]
Mean Cross-validated-accuracy: 0.87802833381508


In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Import accuracy_score for classification tasks
from sklearn.metrics import accuracy_score

# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 414887678.2517273
R-squared: 0.894774310569424
