# PART A – Regression modelling for business decision making

In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

In [None]:
# Loading the dataset. The dataset is uploaded in the GoogleCollab.
df = pd.read_csv('House_price.csv')

# Task-1 Data Preprocessing

a) Properly clean the dataset, handle any missing values, and remove outliers

In [None]:
# Display the first few rows of the dataset
df.head()

Unnamed: 0,Avg. Area Income,House Age,Number of Rooms,Number of Bedrooms,Area Population,Price,Address
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0,USS Barnett\nFPO AP 44820
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5,USNS Raymond\nFPO AE 09386


In [None]:
# Structure of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4548 entries, 0 to 4547
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Avg. Area Income    4548 non-null   float64
 1   House Age           4548 non-null   float64
 2   Number of Rooms     4548 non-null   float64
 3   Number of Bedrooms  4548 non-null   float64
 4   Area Population     4548 non-null   float64
 5   Price               4548 non-null   float64
 6   Address             4548 non-null   object 
dtypes: float64(6), object(1)
memory usage: 248.8+ KB


In [None]:
# Descriptive statistics for the dataset
df.describe()

Unnamed: 0,Avg. Area Income,House Age,Number of Rooms,Number of Bedrooms,Area Population,Price
count,4548.0,4548.0,4548.0,4548.0,4548.0,4548.0
mean,68611.700818,5.978918,6.987646,3.981693,36187.469334,1233916.0
std,10686.487761,0.99085,1.006587,1.230939,9910.189915,354567.6
min,17796.63119,2.644304,3.236194,2.0,172.610686,15938.66
25%,61485.150192,5.332187,6.299692,3.14,29423.16351,997775.1
50%,68817.036575,5.960872,7.002245,4.05,36215.560985,1234571.0
75%,75820.741747,6.658368,7.665871,4.49,42880.554642,1470616.0
max,107701.7484,9.519088,10.759588,6.5,69592.04024,2469066.0


In [None]:
# Display missing values in the dataset
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Avg. Area Income      0
House Age             0
Number of Rooms       0
Number of Bedrooms    0
Area Population       0
Price                 0
Address               0
dtype: int64


In [None]:
import matplotlib.pyplot as plt

In [None]:
# Removing the outliers from the dataset
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [None]:
# Applying the function to relevant numerical columns
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    df = remove_outliers(df, column)

In [None]:
# Verify the changes
df.describe()

Unnamed: 0,Avg. Area Income,House Age,Number of Rooms,Number of Bedrooms,Area Population,Price
count,4415.0,4415.0,4415.0,4415.0,4415.0,4415.0
mean,68625.56201,5.977806,6.987266,3.984598,36159.676459,1233401.0
std,10306.583987,0.964864,0.983385,1.231457,9612.957623,337739.3
min,40141.56648,3.342599,4.290699,2.0,9487.921585,302307.4
25%,61601.80936,5.338686,6.308265,3.15,29498.46774,1003740.0
50%,68854.09085,5.959945,7.002164,4.05,36183.2878,1234037.0
75%,75732.554225,6.647182,7.661763,4.49,42771.784635,1464820.0
max,97112.36125,8.641821,9.710217,6.5,62963.75539,2152959.0


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Initialize the scaler
scaler = StandardScaler()

In [None]:
# Apply scaling to numerical columns
df[df.select_dtypes(include=['float64', 'int64']).columns] = scaler.fit_transform(df.select_dtypes(include=['float64', 'int64']).values)

In [None]:
# Verify the scaling
df.describe()

Unnamed: 0,Avg. Area Income,House Age,Number of Rooms,Number of Bedrooms,Area Population,Price
count,4415.0,4415.0,4415.0,4415.0,4415.0,4415.0
mean,-7.725040000000001e-17,1.72204e-16,-6.920348000000001e-17,7.50375e-17,-5.825968e-16,4.8281500000000006e-17
std,1.000113,1.000113,1.000113,1.000113,1.000113,1.000113
min,-2.763983,-2.731478,-2.742437,-1.611767,-2.774877,-2.757153
25%,-0.6815593,-0.662469,-0.6905519,-0.6778087,-0.6930191,-0.6800709
50%,0.0221756,-0.01851424,0.0151511,0.05311548,0.002456478,0.001884563
75%,0.6896366,0.6938296,0.6859703,0.4104562,0.6879108,0.6852773
max,2.764255,2.761338,2.76927,2.042854,2.788644,2.722996


In [None]:
# Perform one-hot encoding on categorical columns
df = pd.get_dummies(df, drop_first=True)

In [None]:
# Verify the encoding
df.head()

Unnamed: 0,Avg. Area Income,House Age,Number of Rooms,Number of Bedrooms,Area Population,Price,"Address_000 Todd Pines\nAshleyberg, KY 90207-1179","Address_001 Steve Plaza\nJessicastad, UT 25190","Address_0010 Gregory Loaf\nSouth Ericfort, VA 34651-0718","Address_00149 Raymond Knolls\nNew Jason, UT 75026",...,Address_Unit 9446 Box 0958\nDPO AE 97025,Address_Unit 9463 Box 0963\nDPO AE 49984-2796,Address_Unit 9494 Box 2307\nDPO AE 58622,Address_Unit 9664 Box 1605\nDPO AA 30902,Address_Unit 9732 Box 1846\nDPO AE 69898-3304,Address_Unit 9774 Box 4511\nDPO AE 44963,Address_Unit 9778 Box 2114\nDPO AP 59374,Address_Unit 9785 Box 0790\nDPO AP 60371-0797,Address_Unit 9831 Box 7128\nDPO AA 54705,Address_Unit 9871 Box 9037\nDPO AP 37275-9289
0,1.059627,-0.30572,0.022295,0.085601,-1.360076,-0.516336,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1.030825,0.02601,-0.260808,-0.726537,0.417546,0.806897,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,-0.712101,-0.116005,1.55141,0.930225,0.075166,-0.516471,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,-0.512383,1.25465,-1.424362,-0.588474,-0.192411,0.080592,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,-0.838721,-0.971492,0.866617,0.1993,-1.020152,-1.783996,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target variable is 'Price'
X = df.drop('Price', axis=1)
y = df['Price']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3532, 4419), (883, 4419), (3532,), (883,))

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

In [None]:
# Initialize the model
ridge = Ridge()


In [None]:
# Define the hyperparameters grid
param_grid = {
    'alpha': [0.1, 1.0, 10.0, 100.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr']
}


In [None]:
# Initialize GridSearchCV
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_absolute_error')

In [None]:
# Fit the grid search
grid_search.fit(X_train, y_train)

In [None]:
# Print the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

NameError: name 'grid_search' is not defined

In [None]:
import pandas as pd

In [None]:
# Loading the dataset
df1 = pd.read_csv('income.csv')

In [None]:
# Display the first few rows and summary statistics
print(df1.head())
print(df1.info())
print(df1.describe(include='all'))

   age       JobType         EdType        maritalstatus       occupation  \
0   45       Private        HS-grad             Divorced     Adm-clerical   
1   24   Federal-gov        HS-grad        Never-married     Armed-Forces   
2   44       Private   Some-college   Married-civ-spouse   Prof-specialty   
3   27       Private            9th        Never-married     Craft-repair   
4   20       Private   Some-college        Never-married            Sales   

      relationship    race   gender  capitalgain  capitalloss  hoursperweek  \
0    Not-in-family   White   Female            0            0            28   
1        Own-child   White     Male            0            0            40   
2          Husband   White     Male            0            0            40   
3   Other-relative   White     Male            0            0            40   
4    Not-in-family   White     Male            0            0            35   

    nativecountry                        SalStat  
0   United-

In [None]:
# Checking for missing values
missing_values = df1.isnull().sum()
print("Missing values in each column:\n", missing_values)


Missing values in each column:
 age              0
JobType          0
EdType           0
maritalstatus    0
occupation       0
relationship     0
race             0
gender           0
capitalgain      0
capitalloss      0
hoursperweek     0
nativecountry    0
SalStat          0
dtype: int64


In [None]:
def remove_outliers_using_iqr(df1, column):
    Q1 = df1[column].quantile(0.25)
    Q3 = df1[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Use df1[column] for the boolean indexing to ensure alignment
    return df1[(df1[column] >= lower_bound) & (df1[column] <= upper_bound)]

# Apply the function to relevant numerical columns
numerical_columns = df1.select_dtypes(include=['float64', 'int64']).columns
for column in numerical_columns:
    df1 = remove_outliers_using_iqr(df1, column)

# Verify the changes
print(df1.describe())

                age  capitalgain  capitalloss  hoursperweek
count  13149.000000      13149.0      13149.0       13149.0
mean      37.923112          0.0          0.0          40.0
std       12.258415          0.0          0.0           0.0
min       17.000000          0.0          0.0          40.0
25%       28.000000          0.0          0.0          40.0
50%       36.000000          0.0          0.0          40.0
75%       47.000000          0.0          0.0          40.0
max       75.000000          0.0          0.0          40.0


In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Apply scaling to numerical columns
df1[numerical_columns] = scaler.fit_transform(df1[numerical_columns])

# Verify the scaling
print(df1.describe)

<bound method NDFrame.describe of             age       JobType         EdType        maritalstatus  \
1     -1.135844   Federal-gov        HS-grad        Never-married   
2      0.495751       Private   Some-college   Married-civ-spouse   
3     -0.891104       Private            9th        Never-married   
5      0.495751       Private        HS-grad              Widowed   
6      1.066809       Private        HS-grad   Married-civ-spouse   
...         ...           ...            ...                  ...   
31968 -1.706902             ?           11th        Never-married   
31974 -0.320046     Local-gov   Some-college        Never-married   
31975 -1.217423       Private   Some-college   Married-civ-spouse   
31976  0.332591     Local-gov   Some-college   Married-civ-spouse   
31977 -0.727945       Private      Bachelors        Never-married   

             occupation     relationship                 race   gender  \
1          Armed-Forces        Own-child                White  

In [None]:
# Perform one-hot encoding on categorical columns
df1 = pd.get_dummies(df1, drop_first=True)

# Verify the encoding
print(df1.head())

        age  capitalgain  capitalloss  hoursperweek  JobType_ Federal-gov  \
1 -1.135844          0.0          0.0           0.0                  True   
2  0.495751          0.0          0.0           0.0                 False   
3 -0.891104          0.0          0.0           0.0                 False   
5  0.495751          0.0          0.0           0.0                 False   
6  1.066809          0.0          0.0           0.0                 False   

   JobType_ Local-gov  JobType_ Never-worked  JobType_ Private  \
1               False                  False             False   
2               False                  False              True   
3               False                  False              True   
5               False                  False              True   
6               False                  False              True   

   JobType_ Self-emp-inc  JobType_ Self-emp-not-inc  ...  \
1                  False                      False  ...   
2                  F

In [None]:
from sklearn.model_selection import train_test_split

# Target variable is the last column and is categorical
X = df1.drop(df1.columns[-1], axis=1)
y = df1[df1.columns[-1]]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the split
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(10519, 96) (2630, 96) (10519,) (2630,)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV



In [None]:
# Initialize the model
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [None]:
# Initialize GridSearchCV
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy')


In [None]:
# Fit the grid search
grid_search.fit(X_train, y_train)


In [None]:
# Print the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best Score: 0.8608235745060921


In [None]:
# Build the final model with the best parameters
rf_best = RandomForestClassifier(**best_params, random_state=42)
rf_best.fit(X_train, y_train)

# Verify model performance on the training set
print(rf_best.score(X_train, y_train))

0.8842095256203061
