In [129]:
import pandas as pd 
import numpy as np
import tqdm

In [130]:
df = pd.read_csv('./data/1.csv')

In [131]:
import matplotlib.pyplot as plt 
import seaborn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

In [138]:
class MyLinearRegression:
    def __init__(self):
        self.w = None
        
    def fit(self, X: np.ndarray, y: np.array, epochs=100, learning_rate=0.01, gd = True):
        rows, cols = X.shape
        if not gd:
            self.w = np.linalg.inv(X.T @ X) @ X.T @ y
        else:
            self.w = np.ones(cols)
            for _ in tqdm.tqdm(range(epochs)):
                predicts = X @ self.w
                errors = (predicts - y)
                self.w -= learning_rate * (X.T @ errors) / rows
            
        
    def predict(self, X: np.ndarray):
        return X @ self.w

    def score(self, X: np.ndarray, y: np.array):
        y_pred = self.predict(X)
        return ((y - y_pred)**2).sum() / ((y - y.mean())**2).sum()


class PolynomialRegression: 
    def __init__(self, degree=2, epochs = 100):
        self.degree = degree
        self.epochs = epochs
        self.powers = None
        self.learning_rate = 0.01
        self.lr = MyLinearRegression()
    
    def fit(self, X: np.ndarray, y: np.array, epochs=100, learning_rate=0.01, gd = True):
        X = self.add_bias(X)
        self.powers = self.get_power_combinations(X.shape[1], self.degree)
        X = self.get_pr_features(X, self.powers)
        self.lr.fit(X, y, epochs=epochs, learning_rate=learning_rate, gd=gd)
        
    def score(self, X: np.ndarray, y: np.array):
        X = self.add_bias(X)
        X = self.get_pr_features(X, self.powers)
        return self.lr.score(X, y)
    
    
    def predict(self, X: np.ndarray):
        X = self.add_bias(X)
        X = self.get_pr_features(X, self.powers)
        return X @ self.lr.w
        
    def add_bias(self, X: np.ndarray): 
        return np.column_stack((np.ones(X.shape[0]) , X))
        
    
    def get_pr_features(self, X: np.ndarray, powers: np.ndarray = None):
        if powers is None:
            powers = self.get_power_combinations(X.shape[1], self.degree)
        return np.prod(np.power(X[:, np.newaxis], powers), axis=-1)


    def get_power_combinations(self, n: int, m: int): 
        if n == 1:
            return [[m]]
        if m == 0:
            return [[0 for _ in range(n)]]
        sols = []
        for i in range(m+1):
            for sol in self.get_power_combinations(n-1, m - i):
                sols.append(
                    [i] + sol
                )
        return sols

In [139]:
NUMERICAL_FEATURES = [
    'Rooms', 'Distance', 'Bedroom2', 'Postcode', 'Bathroom', 'Car',
    'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount',
]
CATEGORICAL_FEATURES = [
    'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname', 'Date'
]
PREDICT = 'Price'

train, test = train_test_split(df, test_size=.2, random_state=40)

In [165]:
pr1 = PolynomialRegression(degree=2)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

train_numeric = imp.fit_transform(train[NUMERICAL_FEATURES])
train_numeric = (train_numeric - train_numeric.min(axis=0)) / (train_numeric.max(axis=0) - train_numeric.min(axis=0))
y = train[PREDICT].values
pr1.fit(train_numeric, y, epochs=100, learning_rate=.5, gd=True)


test_numeric = imp.fit_transform(test[NUMERICAL_FEATURES])
test_numeric = (test_numeric - test_numeric.min(axis=0)) / (test_numeric.max(axis=0) - test_numeric.min(axis=0))
y_test = test[PREDICT].values
pr1.score(test_numeric, y_test)

100%|██████████| 100/100 [00:00<00:00, 13408.04it/s]


np.float64(3.619971465654431e+22)

In [164]:
pr2 = PolynomialFeatures(degree=6)
lr = LinearRegression()

train_numeric = imp.fit_transform(train[NUMERICAL_FEATURES])
train_numeric = pr2.fit_transform(train_numeric)
y = train[PREDICT].values
lr.fit(train_numeric, y)


test_numeric = imp.fit_transform(test[NUMERICAL_FEATURES])
test_numeric = pr2.fit_transform(test_numeric)
y_test = test[PREDICT].values
lr.score(test_numeric, y_test)

-2912058.3894756055