# Question 1 : K fold Cross validation  

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('USA_Housing.csv')
data.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


In [3]:
#a) dividing the columns
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [4]:
#b) scaling the input columns
scaler = StandardScaler()
X = scaler.fit_transform(X)
X

array([[ 1.02865969, -0.29692705,  0.02127433,  0.08806222, -1.31759867],
       [ 1.00080775,  0.02590164, -0.25550611, -0.72230146,  0.40399945],
       [-0.68462915, -0.11230283,  1.5162435 ,  0.93084045,  0.07240989],
       ...,
       [-0.48723454,  1.28447022, -2.17026949, -1.50025059, -0.29193658],
       [-0.05459152, -0.44669439,  0.14154061,  1.18205319,  0.65111608],
       [-0.28831272,  0.01521477, -0.19434166,  0.07185495,  1.04162464]])

In [5]:
#c) dividing the input and output features into k folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
#d) run five iterations 
beta_list = []
r2_scores = []
for train_index, test_index in kf.split(X):
   
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)
    beta_list.append(model.coef_)

beta_list

[array([230745.94073479, 163243.27314515, 120309.77397759,   3011.45976111,
        151552.63069359]),
 array([229081.97914235, 165882.1605634 , 121536.57475055,   2092.4478622 ,
        150874.99274586]),
 array([230224.50511001, 162766.17455493, 121022.77324577,   1247.16258975,
        150234.77720419]),
 array([229500.10043209, 165212.07110924, 122839.9376815 ,   3063.71699324,
        150917.88484984]),
 array([230225.0513193 , 163956.83884606, 121115.12045628,    783.46716975,
        150662.44678192])]

In [9]:
#e) using the best value of beta matrix to train
best_beta = beta_list[np.argmax(r2_scores)]
print(best_beta)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R2 score for test data using all features:", r2)


model = LinearRegression()
model.fit(X_train.dot(best_beta.reshape(-1, 1)), y_train)
y_pred = model.predict(X_test.dot(best_beta.reshape(-1, 1)))
r2 = r2_score(y_test, y_pred)
print("R2 score for test data using best features:", r2)

[230225.0513193  163956.83884606 121115.12045628    783.46716975
 150662.44678192]
R2 score for test data using all features: 0.9146818498916267
R2 score for test data using best features: 0.9147257541979954


# Question 2 : Validation set for Multiple Linear Regression (Gradient Descent Optimization)

In [10]:
X_train , X_test , y_train , y_test = train_test_split(X,y, test_size = 0.3 , random_state = 42)
X_train , X_val , y_train , y_val = train_test_split(X_train , y_train ,test_size = 0.2 , random_state = 42)


In [11]:
def gradient_descent(X, y, learning_rate, num_iterations):
    m = X.shape[0]
    n = X.shape[1]
    beta = np.zeros(n)
    for i in range(num_iterations):
        y_pred = X.dot(beta)
        error = y_pred - y
        gradient = (1/m) * X.T.dot(error)
        beta = beta - learning_rate * gradient
    return beta

In [12]:
learning_rates = [0.001, 0.01, 0.1, 1]

In [13]:
best_beta = None
best_r2 = -np.inf
for learning_rate in learning_rates:
    beta = gradient_descent(X_train, y_train, learning_rate, num_iterations=1000)

    y_val_pred = X_val.dot(beta)
    r2_val = r2_score(y_val, y_val_pred)

    y_test_pred = X_test.dot(beta)
    r2_test = r2_score(y_test, y_test_pred)

   #Check if this set of coefficients is the best so far
    if r2_val > best_r2:
        best_beta = beta
        best_r2 = r2_val

    print(f"Learning rate: {learning_rate:.3f}\n"
          f"Coefficients: {beta}\n"
          f"R2 score on validation set: {r2_val:.3f}\n"
          f"R2 score on test set: {r2_test:.3f}\n")

Learning rate: 0.001
Coefficients: [144315.71298955 100846.73121722  87914.71502389  39653.61270937
  84711.71017358]
R2 score on validation set: -11.339
R2 score on test set: -12.187

Learning rate: 0.010
Coefficients: [225260.73363469 160480.64158588 141797.51720382  22997.93487203
 135067.67385827]
R2 score on validation set: -11.318
R2 score on test set: -12.011

Learning rate: 0.100
Coefficients: [225276.8914395  160502.91856057 142077.24345278  22712.97277526
 135062.01202153]
R2 score on validation set: -11.318
R2 score on test set: -12.011

Learning rate: 1.000
Coefficients: [225276.8914395  160502.91856057 142077.24345278  22712.97277526
 135062.01202153]
R2 score on validation set: -11.318
R2 score on test set: -12.011



In [14]:
print(f"Best coefficients: {best_beta}\n"
      f"Best R2 score on validation set: {best_r2:.3f}")

Best coefficients: [225260.73363469 160480.64158588 141797.51720382  22997.93487203
 135067.67385827]
Best R2 score on validation set: -11.318


# Question 3 : Pre processing and MLR

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [16]:
data = pd.read_csv('imports-85.csv' , header = None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [17]:
my_cols = ["symboling","normalized_losses","make","fuel_type","aspiration","num_doors","body_style","drive_wheels","engine_location","wheel_base","length","width","height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system","bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"]
data.columns = my_cols
data.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [24]:
#2) Replace all NaN values with central tendency imputation. Drop the rows with NaN
for values in data["price"]:
    data = data.replace('?', np.nan)

data.head()

imputer = SimpleImputer(strategy='most_frequent')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

data.dropna(subset=['price'], inplace=True)

In [25]:
#3) 3. There are 10 columns in the dataset with non-numeric values. Convert these values to
# numeric values using following scheme:

In [26]:
# (i) For “num_doors” and “num_cylinders”: convert words (number names) to figures
# for e.g., two to 2

In [27]:
data['num_doors'] = data['num_doors'].replace({'four':4, 'two':2})
data['num_cylinders'] = data['num_cylinders'].replace({'four':4, 'six':6, 'five':5, 'eight':8, 'two':2, 'three':3, 'twelve':12})

In [28]:
# (ii) For "body_style";, "drive_wheels";: use dummy encoding scheme

In [29]:
data = pd.get_dummies(data, columns=['body_style', 'drive_wheels'])

In [30]:
# (iii) For “make”, “aspiration”, “engine_location”,fuel_type: use label encoding
# scheme

In [31]:
label_encoders = {}
for col in ['make', 'aspiration', 'engine_location', 'fuel_type']:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

In [32]:
# (iv) For fuel_system: replace values containing string pfi to 1 else all values to 0.

In [33]:
data['fuel_system'] = np.where(data['fuel_system'] == 'pfi', 1, 0)

In [34]:
# (v) For engine_type: replace values containing string ohc to 1 else all values to 0.

In [35]:
data['engine_type'] = np.where(data['engine_type'].str.contains('ohc'), 1, 0)

In [36]:
#4. Divide the dataset into input features (all columns except price) and output variable(price). Scale all input features.

In [37]:
X = data.drop(columns=["price"])
y = data["price"]

# Scale all input features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)



In [38]:
# Divide the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train a linear regressor on 70% of data and test its performance on remaining 30% of data
regressor = LinearRegression()
regressor.fit(X_train, y_train)

train_score = regressor.score(X_train, y_train)
test_score = regressor.score(X_test, y_test)

print(f"Train R2 Score: {train_score:.2f}")
print(f"Test R2 Score: {test_score:.2f}")

Train R2 Score: 0.91
Test R2 Score: 0.79


In [48]:
from sklearn.datasets import load_boston
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

pca = PCA(n_components=6)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train a linear regression model on 70% of the reduced data
lr = LinearRegression()
lr.fit(X_train_pca, y_train)

# Evaluate the performance of the model on the test set
y_pred = lr.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error on test set:", mse)


Mean Squared Error on test set: 34.57700369464132
