In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv('USA_Housing.csv')

In [3]:
df.head(5)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [4]:
x = df.drop('Price',axis=1).values
y = df['Price'].values.reshape(-1,1)

In [5]:
print(x)


[[7.95454586e+04 5.68286132e+00 7.00918814e+00 4.09000000e+00
  2.30868005e+04]
 [7.92486424e+04 6.00289981e+00 6.73082102e+00 3.09000000e+00
  4.01730722e+04]
 [6.12870672e+04 5.86588984e+00 8.51272743e+00 5.13000000e+00
  3.68821594e+04]
 ...
 [6.33906869e+04 7.25059062e+00 4.80508098e+00 2.13000000e+00
  3.32661455e+04]
 [6.80013312e+04 5.53438842e+00 7.13014386e+00 5.44000000e+00
  4.26256202e+04]
 [6.55105818e+04 5.99230531e+00 6.79233610e+00 4.07000000e+00
  4.65012838e+04]]


In [6]:
print(y)

[[1059033.558]
 [1505890.915]
 [1058987.988]
 ...
 [1030729.583]
 [1198656.872]
 [1298950.48 ]]


In [7]:
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [8]:
print(x_scaled)

[[ 1.02865969 -0.29692705  0.02127433  0.08806222 -1.31759867]
 [ 1.00080775  0.02590164 -0.25550611 -0.72230146  0.40399945]
 [-0.68462915 -0.11230283  1.5162435   0.93084045  0.07240989]
 ...
 [-0.48723454  1.28447022 -2.17026949 -1.50025059 -0.29193658]
 [-0.05459152 -0.44669439  0.14154061  1.18205319  0.65111608]
 [-0.28831272  0.01521477 -0.19434166  0.07185495  1.04162464]]


In [9]:
def least_square_fit(x_train, y_train):
    ones = np.ones((x_train.shape[0],1))
    x_train_bias = np.hstack((ones,x_train))

    beta = np.linalg.inv(x_train_bias.T @ x_train_bias) @ (x_train_bias.T @ y_train)
    return beta

In [10]:
def predict(x, beta):
    ones = np.ones((x.shape[0],1))
    x_bias = np.hstack((ones,x))
    
    return x_bias @ beta

In [12]:
kF = KFold(n_splits=5, shuffle = True, random_state = 42)

betas = []
r2_scores = []

for train_index, test_index in kF.split(x_scaled):
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index] , y[test_index]

    beta = least_square_fit(x_train, y_train)

    y_pred = predict(x_test, beta)

    score = r2_score(y_test, y_pred)

    betas.append(beta)
    r2_scores.append(score)

for i, score in enumerate(r2_scores):
    print(f"Fold {i+1} : R2 = {score:.4f}")

best_fold = np.argmax(r2_scores)
best_beta = betas[best_fold]

print("best fold:",best_fold+1)
print("best r2 score",r2_scores[best_fold])


    






Fold 1 : R2 = 0.9180
Fold 2 : R2 = 0.9146
Fold 3 : R2 = 0.9116
Fold 4 : R2 = 0.9193
Fold 5 : R2 = 0.9244
best fold: 5
best r2 score 0.9243869413350317


In [13]:
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.3, random_state=42)

In [14]:
y_pred_final = predict(x_test,best_beta)

In [15]:
y_pred_final

array([[1309472.34209342],
       [1238698.51255206],
       [1247440.13320666],
       ...,
       [1458529.22587541],
       [1482180.096334  ],
       [1050564.30627078]], shape=(1500, 1))

In [16]:
final_r2 = r2_score(y_test, y_pred_final)
print("Final r2 score: ", final_r2)

Final r2 score:  0.9147458156636434


In [17]:
#Q2

x_scaled

array([[ 1.02865969, -0.29692705,  0.02127433,  0.08806222, -1.31759867],
       [ 1.00080775,  0.02590164, -0.25550611, -0.72230146,  0.40399945],
       [-0.68462915, -0.11230283,  1.5162435 ,  0.93084045,  0.07240989],
       ...,
       [-0.48723454,  1.28447022, -2.17026949, -1.50025059, -0.29193658],
       [-0.05459152, -0.44669439,  0.14154061,  1.18205319,  0.65111608],
       [-0.28831272,  0.01521477, -0.19434166,  0.07185495,  1.04162464]],
      shape=(5000, 5))

In [20]:
x_scaled1 = np.hstack((np.ones((x_scaled.shape[0],1)),x_scaled))

In [21]:
x_train1, x_temp, y_train1, y_temp = train_test_split(x_scaled1, y, test_size = 0.44, random_state=42)
x_val, x_test1, y_val, y_test1 = train_test_split(x_temp, y_temp, test_size = 30/44, random_state=42)


In [22]:
print("train shape:", x_train1.shape)
print("validation shape:", x_val.shape)
print("testing shape:", x_test1.shape)

train shape: (2800, 6)
validation shape: (700, 6)
testing shape: (1500, 6)


In [23]:
def gradient_descent(x,y,lr,iterations):
    m,n = x.shape
    beta = np.zeros((n,1))
    for i in range(iterations):
        y_pred1 = x @ beta
        error = y_pred1 - y
        gradient = (1/m) * (x.T @ error)
        beta -= lr * gradient
    return beta


In [None]:
learning_rates = [0.001,0.01,0.1,1]
results={}

for lr in learning_rates:
    beta = gradient_descent(x_train1, y_train1, lr, 1000)

    y_val_pred = x_val @ beta
    y_test_pred = x_test1 @ beta

    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)

    results[lr] = {
        "beta" : beta,
        "R2_val" : r2_val,
        "R2_test" : r2_test
    }

    best_lr = max(results, key = lambda k: results[k]["R2_val"])
    best_beta = results[best_lr]["beta"]

In [25]:
print("\nBest Learning Rate:", best_lr)
print("Best Beta coefficients:\n", best_beta)
print("Validation R2:", results[best_lr]["R2_val"])
print("Test R2:", results[best_lr]["R2_test"])


Best Learning Rate: 0.1
Best Beta coefficients:
 [[1232180.27200919]
 [ 230645.88389435]
 [ 165328.94019375]
 [ 120045.00851908]
 [   2945.02108903]
 [ 151375.22971285]]
Validation R2: 0.9199649194854793
Test R2: -0.9507611866574124


In [None]:
#Q3

In [6]:
column_names = ["symboling", "normalized_losses",
"make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels",
"engine_location", "wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
"compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

In [25]:
autos = pd.read_csv('autos.csv', header=None, names=column_names)
autos.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [26]:
autos = autos.replace('?',np.nan)

In [27]:
autos = autos.dropna(subset=['price'])

In [28]:
autos['num_cylinders'].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [29]:
autos['num_doors'].unique()

array(['two', 'four', nan], dtype=object)

In [30]:
num_word_map = {
'one': 1,
'two': 2,
'three': 3,
'four': 4,
'five': 5,
'six': 6,
'seven': 7,
'eight': 8,
'nine': 9,
'ten': 10,
'eleven': 11,
'twelve': 12
}

In [31]:
for col in ["num_doors","num_cylinders"]:
    autos[col] = autos[col].map(lambda x : num_word_map.get(str(x).lower(), np.nan))

In [32]:
autos["num_cylinders"]

0      4
1      4
2      6
3      4
4      5
      ..
200    4
201    4
202    6
203    6
204    4
Name: num_cylinders, Length: 201, dtype: int64

In [33]:
numeric_like_cols = [
"symboling", "normalized_losses", "wheel_base", "length", "width", "height",
"curb_weight", "engine_size", "bore", "stroke", "compression_ratio",
"horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price",
"num_doors", "num_cylinders"
]

In [34]:
for col in numeric_like_cols:
    autos[col] = pd.to_numeric(autos[col], errors = 'coerce')

In [35]:
autos = pd.get_dummies(autos, columns = ["body_style","drive_wheels"], drop_first=True)

In [38]:
autos["make"].unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury',
       'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche', 'renault',
       'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'], dtype=object)

In [39]:
from sklearn.preprocessing import LabelEncoder

In [40]:
label_cols = ["make","aspiration","engine_location","fuel_type"]
for col in label_cols:
    le = LabelEncoder()
    autos[col] = le.fit_transform(autos[col].astype(str))

In [41]:
autos["make"]

0       0
1       0
2       0
3       1
4       1
       ..
200    21
201    21
202    21
203    21
204    21
Name: make, Length: 201, dtype: int64

In [42]:
autos["fuel_system"] = autos["fuel_system"].astype(str).str.contains("pfi", case=False, na=False).astype(int)

In [43]:
autos["engine_type"] = autos["engine_type"].astype(str).str.contains("ohc", case=False, na=False).astype(int)

In [44]:
for col in autos.columns:
    if autos[col].dtype.kind in "biufc":
        med = autos[col].median()
        autos[col] = autos[col].fillna(med)
    else:
        mode_val = autos[col].mode(dropna=True)
        mode_val = mode_val.iloc[0] if not mode_val.empty else np.nan
        autos[col] = autos[col].fillna(mode_val)

In [45]:
X = autos.drop(columns=["price"])
y = autos["price"].astype(float)

In [46]:
X_train, X_test, y_train2, y_test2 = train_test_split(X, y, test_size=0.30, random_state = 42)

In [47]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [48]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [49]:
lin = LinearRegression()
lin.fit(X_train_scaled, y_train2)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [50]:
y_pred2 = lin.predict(X_test_scaled)


In [52]:
r2_baseline = r2_score(y_test2, y_pred2)
mse_baseline = mean_squared_error(y_test2, y_pred2)
rmse_baseline = np.sqrt(mse_baseline)

print("r2 score : ", r2_baseline)
print("rmse : ", rmse_baseline)

r2 score :  0.8734104772978124
rmse :  3464.0483212516906


In [53]:
pca = PCA(n_components=0.95, random_state=42)


In [54]:
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)


In [56]:
lin_pca = LinearRegression()
lin_pca.fit(X_train_pca, y_train2)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [58]:
y_pred_pca = lin_pca.predict(X_test_pca)
r2_pca = r2_score(y_test2, y_pred_pca)
mse_pca = mean_squared_error(y_test2, y_pred_pca)
rmse_pca = np.sqrt(mse_pca)


print("# components:", pca.n_components_)
print("R2:", r2_pca)
print("RMSE:", rmse_pca)

# components: 16
R2: 0.8569777462361792
RMSE: 3682.0261539822177
