In [1]:
import pandas as pd 
import numpy as np
import tqdm
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../data/1.csv')

In [3]:
class MyLinearRegression:
    def __init__(self):
        self.w = None
        
    def fit(self, X: np.ndarray, y: np.array, epochs=100, learning_rate=0.01, gd = True):
        rows, cols = X.shape
        if not gd:
            self.w = np.linalg.inv(X.T @ X) @ X.T @ y
        else:
            self.w = np.ones(cols)
            for _ in tqdm.tqdm(range(epochs)):
                predicts = X @ self.w
                errors = (predicts - y)
                self.w -= learning_rate * (X.T @ errors) / rows
            
        
    def predict(self, X: np.ndarray):
        return X @ self.w

    def score(self, X: np.ndarray, y: np.array):
        y_pred = self.predict(X)
        return  1 - ((y - y_pred)**2).sum() / ((y - y.mean())**2).sum()


class PolynomialRegression: 
    def __init__(self, degree=2, epochs = 100):
        self.degree = degree
        self.epochs = epochs
        self.powers = None
        self.learning_rate = 0.01
        self.lr = MyLinearRegression()
    
    def fit(self, X: np.ndarray, y: np.array, epochs=100, learning_rate=0.01, gd = True):
        X = self.add_bias(X)
        self.powers = self.get_power_combinations(X.shape[1], self.degree)
        X = self.get_pr_features(X, self.powers)
        self.lr.fit(X, y, epochs=epochs, learning_rate=learning_rate, gd=gd)
        
    def score(self, X: np.ndarray, y: np.array):
        X = self.add_bias(X)
        X = self.get_pr_features(X, self.powers)
        return self.lr.score(X, y)
    
    
    def predict(self, X: np.ndarray):
        X = self.add_bias(X)
        X = self.get_pr_features(X, self.powers)
        return X @ self.lr.w
        
    def add_bias(self, X: np.ndarray): 
        return np.column_stack((np.ones(X.shape[0]) , X))
        
    
    def get_pr_features(self, X: np.ndarray, powers: np.ndarray = None):
        if powers is None:
            powers = self.get_power_combinations(X.shape[1], self.degree)
        return np.prod(np.power(X[:, np.newaxis], powers), axis=-1)


    def get_power_combinations(self, n: int, m: int): 
        if n == 1:
            return [[m]]
        if m == 0:
            return [[0 for _ in range(n)]]
        sols = []
        for i in range(m+1):
            for sol in self.get_power_combinations(n-1, m - i):
                sols.append(
                    [i] + sol
                )
        return sols

In [4]:
def train_test_split_manual(df, test_size=0.2, random_state=None, shuffle=True):
    if shuffle:
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    if isinstance(test_size, float):
        test_size = int(len(df) * test_size)

    train_df = df.iloc[:-test_size].reset_index(drop=True)
    test_df = df.iloc[-test_size:].reset_index(drop=True)

    return train_df, test_df


In [5]:
NUMERICAL_FEATURES = [
    'Rooms', 'Distance', 'Bedroom2', 'Postcode', 'Bathroom', 'Car',
    'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount',
]
CATEGORICAL_FEATURES = [
    'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname', 'Date'
]
PREDICT = 'Price'

train, test = train_test_split_manual(df, test_size=.2, random_state=40, shuffle=True)

In [15]:
def normalize_df(matrix: np.ndarray):
    return (matrix - matrix.min(axis=0)) / (matrix.max(axis=0) - matrix.min(axis=0))
def standardize_df(matrix: np.ndarray):
    return (matrix - matrix.mean(axis=0)) / matrix.std(axis=0)

In [12]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor

In [13]:
pr2 = PolynomialFeatures(degree=1)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
gdr = SGDRegressor(
    loss='squared_error', 
    penalty='l2', 
    alpha=0.0001, 
    l1_ratio=0.15, 
    fit_intercept=True, 
    max_iter=1000, 
    tol=0.001, 
    shuffle=True, 
    verbose=0, 
    epsilon=0.1, 
    random_state=None, 
    learning_rate='constant', 
    eta0=0.01, 
    power_t=0.25, 
    early_stopping=False, 
    validation_fraction=0.1, 
    n_iter_no_change=5, 
    warm_start=False, 
    average=False
)
lr = LinearRegression()


train_numeric = imp.fit_transform(train[NUMERICAL_FEATURES])

train_numeric = normalize_df(train_numeric)
# train_numeric = standardize_df(train_numeric)

train_numeric = pr2.fit_transform(train_numeric)
y = train[PREDICT].values
gdr.fit(train_numeric, y)


test_numeric = imp.fit_transform(test[NUMERICAL_FEATURES])
test_numeric = normalize_df(test_numeric)

# test_numeric = standardize_df(test_numeric)
test_numeric = pr2.fit_transform(test_numeric)

y_test = test[PREDICT].values
gdr.score(test_numeric, y_test)

0.3255252439672187

In [16]:
pr1 = PolynomialRegression(degree=1)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

train_numeric = imp.fit_transform(train[NUMERICAL_FEATURES])

# train_numeric = normalize_df(train_numeric)
train_numeric = standardize_df(train_numeric)

y = train[PREDICT].values
pr1.fit(train_numeric, y, epochs=100, learning_rate=.15, gd=True)


test_numeric = imp.fit_transform(test[NUMERICAL_FEATURES])

# test_numeric = normalize_df(test_numeric)
test_numeric = standardize_df(test_numeric)

y_test = test[PREDICT].values
pr1.score(test_numeric, y_test)

100%|██████████| 100/100 [00:00<00:00, 47940.38it/s]


np.float64(0.49444759448310355)

-2912058.3894756055