## Part 1
### 

In [185]:
import pandas as pd 
import numpy as np
import tqdm
import matplotlib.pyplot as plt

In [186]:
df = pd.read_csv('../data/1.csv')

In [238]:
class MyLinearRegression:
    def __init__(self):
        self.w = None
        
    def fit(self, X: np.ndarray, y: np.array, epochs=100, learning_rate=0.01, lamb=0.1, gd = True):
        rows, cols = X.shape
        if not gd:
            self.w = np.linalg.inv(X.T @ X) @ X.T @ y
        else:
            self.w = np.zeros(cols)
            for _ in tqdm.tqdm(range(epochs)):
                predicts = X @ self.w
                errors = (predicts - y)
                reg = lamb * np.sign(self.w)
                reg[0] = 0
                gradient = X.T @ errors / rows + reg
                update_term = learning_rate  *  gradient
                self.w -= update_term
                
            
        
    def predict(self, X: np.ndarray):
        return X @ self.w

    def score(self, X: np.ndarray, y: np.array):
        y_pred = self.predict(X)
        return  1 - ((y - y_pred)**2).sum() / ((y - y.mean())**2).sum()


class PolynomialRegression: 
    def __init__(self, degree=2, epochs = 100):
        self.degree = degree
        self.epochs = epochs
        self.powers = None
        self.lr = MyLinearRegression()
    
    def fit(self, X: np.ndarray, y: np.array, epochs=100, learning_rate=0.01, gd = True):
        X = self.add_bias(X)
        self.powers = self.get_power_combinations(X.shape[1], self.degree)
        X = self.get_pr_features(X, self.powers)
        self.lr.fit(X, y, epochs=epochs, learning_rate=learning_rate, gd=gd)
        
    def score(self, X: np.ndarray, y: np.array):
        X = self.add_bias(X)
        X = self.get_pr_features(X, self.powers)
        return self.lr.score(X, y)
    
    
    def predict(self, X: np.ndarray):
        X = self.add_bias(X)
        X = self.get_pr_features(X, self.powers)
        return X @ self.lr.w
        
    def add_bias(self, X: np.ndarray): 
        return np.column_stack((np.ones(X.shape[0]) , X))
        
    
    def get_pr_features(self, X: np.ndarray, powers: np.ndarray = None):
        if powers is None:
            powers = self.get_power_combinations(X.shape[1], self.degree)
        return np.prod(np.power(X[:, np.newaxis], powers), axis=-1)


    def get_power_combinations(self, n: int, m: int): 
        if n == 1:
            return [[m]]
        if m == 0:
            return [[0 for _ in range(n)]]
        sols = []
        for i in range(m+1):
            for sol in self.get_power_combinations(n-1, m - i):
                sols.append(
                    [i] + sol
                )
        return sols

In [239]:
def train_test_split_manual(df, test_size=0.2, random_state=None, shuffle=True):
    if shuffle:
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    if isinstance(test_size, float):
        test_size = int(len(df) * test_size)

    train_df = df.iloc[:-test_size].reset_index(drop=True)
    test_df = df.iloc[-test_size:].reset_index(drop=True)

    return train_df, test_df


In [240]:
NUMERICAL_FEATURES = [
    'Rooms', 'Distance', 'Bedroom2', 'Postcode', 'Bathroom', 'Car',
    'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount',
]
CATEGORICAL_FEATURES = [
    'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname', 'Date'
]
PREDICT = 'Price'

train, test = train_test_split_manual(df, test_size=.2, random_state=40, shuffle=True)

In [241]:
def normalize_df(matrix: np.ndarray):
    return (matrix - matrix.min(axis=0)) / (matrix.max(axis=0) - matrix.min(axis=0))
def standardize_df(matrix: np.ndarray):
    return (matrix - matrix.mean(axis=0)) / matrix.std(axis=0)

In [242]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor

In [243]:
pr2 = PolynomialFeatures(degree=3)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
gdr = SGDRegressor(
    loss='squared_error', 
    # penalty='l2', 
    # alpha=0.01, 
    # l1_ratio=0.15, 
    # fit_intercept=True, 
    max_iter=100, 
    # tol=0.001, 
    # shuffle=False, 
    # verbose=0, 
    # epsilon=0.1, 
    # random_state=None, 
    learning_rate='constant', 
    # eta0=0.01, 
    # power_t=0.25, 
    # early_stopping=False, 
    # validation_fraction=0.1, 
    # n_iter_no_change=5, 
    # warm_start=False, 
    # average=False
)
lr = LinearRegression()


train_numeric = imp.fit_transform(train[NUMERICAL_FEATURES])

train_numeric = normalize_df(train_numeric)
# train_numeric = standardize_df(train_numeric)

train_numeric = pr2.fit_transform(train_numeric)
y = train[PREDICT].values
gdr.fit(train_numeric, y)


test_numeric = imp.fit_transform(test[NUMERICAL_FEATURES])

test_numeric = normalize_df(test_numeric)
# test_numeric = standardize_df(test_numeric)

test_numeric = pr2.fit_transform(test_numeric)

y_test = test[PREDICT].values
gdr.score(test_numeric, y_test)

0.46632168560846643

In [258]:
pr1 = PolynomialRegression(degree=4)
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

train_numeric = imp.fit_transform(train[NUMERICAL_FEATURES])

# train_numeric = normalize_df(train_numeric)
train_numeric = standardize_df(train_numeric)

y = train[PREDICT].values
pr1.fit(train_numeric, y, epochs=100, learning_rate=.01, gd=True)


test_numeric = imp.fit_transform(test[NUMERICAL_FEATURES])

# test_numeric = normalize_df(test_numeric)
test_numeric = standardize_df(test_numeric)

y_test = test[PREDICT].values
pr1.score(test_numeric, y_test)

100%|██████████| 100/100 [00:01<00:00, 92.99it/s]


np.float64(nan)

In [253]:
imp.fit_transform(train[NUMERICAL_FEATURES])

array([[ 2.0000000e+00,  1.7200000e+01,  2.0000000e+00, ...,
        -3.7820470e+01,  1.4519421e+02,  6.8710000e+03],
       [ 4.0000000e+00,  9.7000000e+00,  3.0000000e+00, ...,
        -3.7736700e+01,  1.4492040e+02,  3.2840000e+03],
       [ 3.0000000e+00,  1.0700000e+01,  3.0000000e+00, ...,
        -3.7918500e+01,  1.4501380e+02,  6.9380000e+03],
       ...,
       [ 4.0000000e+00,  3.4000000e+00,  4.0000000e+00, ...,
        -3.7784740e+01,  1.4493220e+02,  3.5930000e+03],
       [ 4.0000000e+00,  8.9000000e+00,  4.0000000e+00, ...,
        -3.7732260e+01,  1.4509855e+02,  2.6980000e+03],
       [ 3.0000000e+00,  2.1300000e+01,  3.0000000e+00, ...,
        -3.7827070e+01,  1.4522914e+02,  3.7940000e+03]])

In [254]:
standardize_df(imp.fit_transform(train[NUMERICAL_FEATURES]))

array([[-0.97763257,  1.20141904, -0.94131955, ..., -0.14284488,
         1.90989459, -0.12931496],
       [ 1.1276202 , -0.07171123,  0.09732514, ...,  0.91664815,
        -0.71053867, -0.94735935],
       [ 0.07499382,  0.09803947,  0.09732514, ..., -1.38269329,
         0.18332374, -0.11403507],
       ...,
       [ 1.1276202 , -1.14114066,  1.13596982, ...,  0.30905539,
        -0.59760959, -0.87688939],
       [ 1.1276202 , -0.2075118 ,  1.13596982, ...,  0.97280368,
         0.99440339, -1.08100139],
       [ 0.07499382,  1.89739692,  0.09732514, ..., -0.22631932,
         2.24418381, -0.83104972]])

In [218]:
import numpy as np

# Example matrix
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Z-score standardization
standardized_matrix = (matrix - np.mean(matrix, axis=0)) / np.std(matrix, axis=0)

print(standardized_matrix)


[[-1.22474487 -1.22474487 -1.22474487]
 [ 0.          0.          0.        ]
 [ 1.22474487  1.22474487  1.22474487]]
