In [1]:
import itertools

import numpy as np
import pandas as pd

In [2]:
def covar(
    X: pd.core.series.Series,
    y: pd.core.series.Series
) -> int:
    '''Population covariance of ``X`` and ``y``.

    Evaluates ``mean(X*y) - mean(X)*mean(y)`` with ``np.mean``. Passing the
    same object twice (``covar(X, X)``) therefore gives the population
    variance of ``X``.

    Parameters
    ----------
    X, y
        Inputs that can be multiplied element-wise with each other and
        reduced by ``np.mean``.

    Returns
    -------
    The covariance. NOTE(review): despite the ``int`` annotation, the value
    is whatever ``np.mean`` produces (normally a floating-point number).
    '''
    mean_of_product = np.mean(X*y)
    product_of_means = np.mean(X)*np.mean(y)
    return mean_of_product - product_of_means


def calc_corr_coeff(
    X: pd.core.series.Series,
    y: pd.core.series.Series
) -> int:
    '''Correlation coefficient of ``X`` and ``y``.

    Computed as ``covar(X, y) / (sqrt(covar(X, X)) * sqrt(covar(y, y)))``.

    Returns
    -------
    The coefficient. NOTE(review): the ``int`` annotation does not match the
    floating-point value that the division normally produces.
    '''
    # covar(v, v) is the variance of v, so its square root is the standard deviation.
    std_X = np.sqrt(covar(X, X))
    std_y = np.sqrt(covar(y, y))
    return covar(X, y)/(std_X*std_y)
    

def calc_correlation(
    X: pd.core.frame.DataFrame,
    y: pd.core.series.Series
) -> np.ndarray:
    '''Correlation of every column of ``X`` with ``y``.

    Iterates over ``X`` (for a DataFrame this yields the column labels) and
    calls ``calc_corr_coeff`` on each column.

    Returns
    -------
    np.ndarray
        One coefficient per column, in the column order of ``X``.
    '''
    return np.array([calc_corr_coeff(X[column], y) for column in X])


def calc_zscore(feature):
    '''Standardize ``feature``: subtract its mean and divide by its standard deviation.

    Both statistics come from ``np.mean`` / ``np.std`` applied to ``feature``
    itself; the result has the same shape (and, for pandas input, the same
    index) as ``feature``.
    '''
    centered = feature - np.mean(feature)
    return centered/np.std(feature)

In [3]:
def clean_data(X:pd.core.frame.DataFrame,
               y:pd.core.frame.DataFrame,
               threshold:float=3.5
              ) -> tuple:
    '''Remove every row in which at least one feature is a Z-score outlier.

    For each column of ``X`` the Z-scores are computed with ``calc_zscore``;
    rows whose absolute Z-score exceeds ``threshold`` are collected by index
    label and dropped from both ``X`` and ``y``. ``X`` and ``y`` are therefore
    expected to share the same index labels.

    NOTE(review): the default of 3.5 is the cut-off quoted in the NIST
    handbook, but NIST states it for the *modified* Z-score (median/MAD
    based), whereas this function uses the plain mean/std Z-score.

    Returns
    -------
    tuple
        ``(X_kept, y_kept)``, both with a fresh 0..n-1 index. The inputs are
        not modified.
    '''
    flagged_labels = np.empty(0)

    for column_name in X:
        column = X[column_name]
        is_outlier = np.abs(calc_zscore(column)) > threshold
        # Newly found labels are prepended to the ones collected so far.
        flagged_labels = np.append(column.loc[is_outlier].index, flagged_labels)

    X_kept = X.drop(flagged_labels).reset_index(drop=True)
    y_kept = y.drop(flagged_labels).reset_index(drop=True)
    return X_kept, y_kept
        
        

In [4]:
def train_test_split(X:pd.core.frame.DataFrame,
                     y:pd.core.frame.DataFrame,
                     train_size:float = None,
                     test_size:float = None
                    ) -> tuple[pd.core.frame.DataFrame]:
    '''Randomly split ``X`` and ``y`` into a training part and a testing part.

    Parameters
    ----------
    X : DataFrame of features.
    y : Series (or single-column DataFrame) of targets, row-aligned with ``X``.
    train_size, test_size : float, optional
        Fractions (0..1) of the rows that go to each part. At least one must
        be given; if only one is given the other part receives the remaining
        rows. If both are given, rows not claimed by either are left out.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``, each a DataFrame with a fresh
        0..n-1 index.

    Raises
    ------
    TypeError
        If neither ``train_size`` nor ``test_size`` is specified.
    ValueError
        If a size is outside 0..1, the two sizes claim more rows than exist,
        or ``X`` and ``y`` have different lengths.

    Notes
    -----
    The shuffle uses NumPy's global random state (``np.random.shuffle``);
    seed it beforehand for a reproducible split.
    '''
    if train_size is test_size is None:
        raise TypeError('You must specify either train_size or test_size')

    for label, fraction in (('train_size', train_size), ('test_size', test_size)):
        if fraction is not None and not 0 <= fraction <= 1:
            raise ValueError(f'{label} must be a fraction between 0 and 1, got {fraction!r}')

    n_rows = len(y)
    if len(X) != n_rows:
        raise ValueError(f'X has {len(X)} rows but y has {n_rows}')

    # Turn the requested fractions into row counts; round() avoids losing a row
    # to floating-point error (e.g. 10 * (1 - 0.8) == 1.9999999999999996).
    if test_size is None:
        n_train = int(round(n_rows*train_size))
        n_test = n_rows - n_train
    elif train_size is None:
        n_test = int(round(n_rows*test_size))
        n_train = n_rows - n_test
    else:
        n_test = int(round(n_rows*test_size))
        n_train = int(round(n_rows*train_size))
        if n_test + n_train > n_rows:
            raise ValueError('train_size and test_size together exceed the available rows')

    # Shuffle row *positions* (not index labels) so that .iloc is valid for any index.
    positions = np.arange(n_rows)
    np.random.shuffle(positions)

    test_positions = positions[:n_test]
    train_positions = positions[n_test:n_test + n_train]

    def as_frame(target):
        # Targets are returned as DataFrames so later matrix arithmetic sees 2-D data.
        return target if isinstance(target, pd.DataFrame) else target.to_frame()

    return (
        X.iloc[train_positions].reset_index(drop=True),
        as_frame(y.iloc[train_positions].reset_index(drop=True)),
        X.iloc[test_positions].reset_index(drop=True),
        as_frame(y.iloc[test_positions].reset_index(drop=True))
    )


In [5]:
class Standarize:
    '''Column-wise standardization: ``(X - mean) / std``.

    ``fit`` learns the per-column mean and standard deviation via
    ``X.mean(axis=0)`` / ``X.std(axis=0)``; ``transform`` applies them to any
    data with the same columns. The statistics are whatever those methods
    return for the given type of ``X``.

    Attributes
    ----------
    mean, std
        The learned statistics, or ``None`` until ``fit`` has been called.
    '''

    # Class-level defaults double as the "not fitted yet" marker.
    mean = std = None

    def fit(self, X):
        '''Learn the per-column mean and standard deviation of ``X``.'''
        self.mean = X.mean(axis=0)
        self.std = X.std(axis=0)


    def fit_transform(self, X):
        '''Fit on ``X`` and return ``X`` scaled with the freshly learned statistics.'''
        self.fit(X)
        return self.transform(X)


    def transform(self, X):
        '''Scale ``X`` with the statistics learned by ``fit``.

        Raises
        ------
        TypeError
            If ``fit`` has not been called yet. Any ``TypeError`` raised by
            the arithmetic itself (e.g. non-numeric data) is propagated
            unchanged instead of being reported as a missing fit.
        '''
        if self.mean is None or self.std is None:
            raise TypeError('No data has been provided to calculate mean and standard deviation')

        return (X - self.mean)/self.std



In [6]:
def grad_desc(X_poly, y_train, learning_rate=0.01, n_iterations=1000):
    '''Fit linear-model coefficients by batch gradient descent.

    Minimizes ``J(theta) = 1/(2m) * sum((X_poly @ theta - y_train)**2)``
    starting from ``theta = 0``.

    Parameters
    ----------
    X_poly : array-like, shape (m, n)
        Design matrix (one row per sample, one column per term).
    y_train : single-column DataFrame, (m, 1) array, or 1-D array-like
        Targets. 1-D input is reshaped to a column.
    learning_rate : float
        Step size applied to the gradient.
    n_iterations : int
        Number of update steps; 0 returns the initial state.

    Returns
    -------
    theta : np.ndarray, shape (n, 1)
        Learned coefficients.
    residuals
        ``X_poly @ theta - y_train`` for the *returned* theta. It is a
        DataFrame when ``y_train`` is a DataFrame, otherwise an ndarray.

    Side effects
    ------------
    Prints the current cost at iteration 0 and every 1000th iteration after.
    '''
    X_poly = np.asarray(X_poly)
    m, n = X_poly.shape[0], X_poly.shape[1]

    # A 1-D target would broadcast against the (m, 1) predictions into (m, m).
    if np.ndim(y_train) == 1:
        y_train = np.asarray(y_train).reshape(-1, 1)

    theta = np.zeros((n, 1))

    # Residuals of the current theta; defined up front so n_iterations == 0 works.
    residuals = X_poly.dot(theta) - y_train

    for i in range(n_iterations):
        if i%1000 == 0:
            # Cost before this iteration's update; works for DataFrame and ndarray residuals.
            cost = float(np.sum(np.asarray(residuals)**2))/(2*m)
            print(f'{i}th iteration done. Cost = {cost}')

        # dJ/dtheta = (1/m) * X^T (X theta - y): the 1/2 of the cost cancels the
        # factor 2 that comes from differentiating the square.
        gradients = (1/m)*X_poly.T.dot(np.asarray(residuals))

        theta -= learning_rate*gradients

        # Keep the residuals in sync with the updated theta.
        residuals = X_poly.dot(theta) - y_train

    return theta, residuals
        
        
        

In [7]:
class Linear_regression:
    '''
    Linear regression trained by gradient descent on a caller-supplied design matrix.

    The class does not expand features itself: for polynomial regression the
    caller passes the already expanded (and standardized) matrix to ``fit``
    and ``predict``.

    ...
    Attributes
    ----------

    degree: int
        Degree given at construction, by default degree = 1. It is stored for
        reference only; no method of this class reads it.

    theta: numpy.ndarray[float]
        Column vector of learned coefficients, one per column of the design
        matrix. Set by ``fit``.

    residuals:
        Difference between the predicted values and the actual target values
        of the training data, as returned by ``grad_desc``. Set by ``fit``.

    min_cost:
        Half the mean squared residual, ``1/2 * mean(residuals**2)``. When the
        residuals are a DataFrame this is a per-column Series rather than a
        plain float. Set by ``fit``.

    y_train_pred: numpy.ndarray[float]
        Predictions for the training data. Set by ``fit``.

    y_test_pred: numpy.ndarray[float]
        Predictions from the most recent ``predict`` call.


    Methods
    -------
    fit(X_train, y_train, learning_rate = 0.01, n_iterations = 1000)
        Will train the model.

    predict(X_test)
        Will return the predicted values of y.

    check_performance(y_train, y_test)
        Will return the R2 scores of the training and testing predictions.

    '''

    def __init__(self, degree=1):
        self.degree = degree


    def fit(self, X_train, y_train, learning_rate = 0.01, n_iterations = 1000):
        '''
        Learn the coefficients with gradient descent.

        The model is

        y_pred = theta_0*x_0 + theta_1*x_1 + ...

        where x_0, x_1, ... are the columns of ``X_train`` (include a column of
        ones for an intercept). All arguments are forwarded to ``grad_desc``.
        '''

        # Train on the matrix that was passed in, not on a notebook-global variable.
        self.theta, self.residuals = grad_desc(X_train, y_train, learning_rate=learning_rate, n_iterations=n_iterations)

        # Cost_Function(J) = 1/(2m) * summation((y_pred-y_actual)^2)
        self.min_cost = 1/2*((self.residuals)**2).mean()

        # Predicted values of already trained data
        self.y_train_pred = X_train.dot(self.theta)


    def predict(self, X_test):
        '''Predict y for ``X_test`` with the learned coefficients.

        ``X_test`` must have the same columns (in the same order) as the matrix
        used in ``fit``. The result is also stored in ``self.y_test_pred``.
        '''
        self.y_test_pred = X_test.dot(self.theta)

        return self.y_test_pred


    def check_performance(self, y_train, y_test):
        '''Return a one-row DataFrame with the R2 score of the training and testing predictions.

        Requires ``fit`` and ``predict`` to have been called first, because it
        reads ``self.y_train_pred`` and ``self.y_test_pred``.
        '''
        performance = {
            'Training Data':[r2_score(y_train, self.y_train_pred)],
            'Testing Data':[r2_score(y_test, self.y_test_pred)]
        }
        perf_df = pd.DataFrame(performance, index=['R2 Score'])
        return perf_df



In [8]:
def r2_score(
    y:np.ndarray[float],
    y_pred:np.ndarray[float]
) -> float:
    '''Coefficient of determination of ``y_pred`` with respect to ``y``.

    Computed as ``1 - mean((y - y_pred)**2) / variance(y)``. Both terms are
    means, so the ratio equals the usual ratio of the residual sum of squares
    to the total sum of squares.
    '''
    # Mean squared residual (twice the cost function used during training).
    mean_squared_residual = np.mean((y - y_pred)**2)

    # covar(y, y) is the population variance of y, i.e. the mean squared deviation from the mean.
    total_variance = covar(y, y)

    return 1 - mean_squared_residual/total_variance

In [9]:
 def poly_transformation(degree, n_features):
     '''List the exponent tuples of all monomials up to a total degree.

     Parameters
     ----------
     degree : int
         Maximum allowed sum of exponents.
     n_features : int
         Number of variables, i.e. the length of every tuple.

     Returns
     -------
     list[tuple[int, ...]]
         Every tuple of ``n_features`` exponents, each in ``0..degree``, whose
         sum is at most ``degree``. Tuples are in lexicographic order, so the
         all-zero tuple (constant term) comes first.
     '''
     # itertools.product enumerates the same candidates, in the same order, as
     # counting in base (degree + 1), but stays correct when degree + 1 > 10.
     return [
         exponents
         for exponents in itertools.product(range(degree + 1), repeat=n_features)
         if sum(exponents) <= degree
     ]

In [10]:
# Load the training CSV: column 0 is used as the target, every other column as a feature.
# NOTE(review): absolute, machine-specific path — the notebook cannot run elsewhere as-is.
df = pd.read_csv(r"C:\Users\anpar\Python\Pandas\Polynomialdata_train.csv")
X = df.iloc[:, 1:]
y = df.iloc[:, 0]
n_features = len(X.columns)
# Maximum total degree of the polynomial terms requested from poly_transformation.
degree = 6

In [11]:
# Drop rows flagged as outliers (default threshold 3.5). X and y are overwritten,
# so re-running this cell filters the already filtered data again.
X, y = clean_data(X, y)

In [12]:
# Exponent tuples (one exponent per feature) of every polynomial term with total degree <= `degree`.
powers = poly_transformation(degree, n_features)
powers

[(0, 0, 0),
 (0, 0, 1),
 (0, 0, 2),
 (0, 0, 3),
 (0, 0, 4),
 (0, 0, 5),
 (0, 0, 6),
 (0, 1, 0),
 (0, 1, 1),
 (0, 1, 2),
 (0, 1, 3),
 (0, 1, 4),
 (0, 1, 5),
 (0, 2, 0),
 (0, 2, 1),
 (0, 2, 2),
 (0, 2, 3),
 (0, 2, 4),
 (0, 3, 0),
 (0, 3, 1),
 (0, 3, 2),
 (0, 3, 3),
 (0, 4, 0),
 (0, 4, 1),
 (0, 4, 2),
 (0, 5, 0),
 (0, 5, 1),
 (0, 6, 0),
 (1, 0, 0),
 (1, 0, 1),
 (1, 0, 2),
 (1, 0, 3),
 (1, 0, 4),
 (1, 0, 5),
 (1, 1, 0),
 (1, 1, 1),
 (1, 1, 2),
 (1, 1, 3),
 (1, 1, 4),
 (1, 2, 0),
 (1, 2, 1),
 (1, 2, 2),
 (1, 2, 3),
 (1, 3, 0),
 (1, 3, 1),
 (1, 3, 2),
 (1, 4, 0),
 (1, 4, 1),
 (1, 5, 0),
 (2, 0, 0),
 (2, 0, 1),
 (2, 0, 2),
 (2, 0, 3),
 (2, 0, 4),
 (2, 1, 0),
 (2, 1, 1),
 (2, 1, 2),
 (2, 1, 3),
 (2, 2, 0),
 (2, 2, 1),
 (2, 2, 2),
 (2, 3, 0),
 (2, 3, 1),
 (2, 4, 0),
 (3, 0, 0),
 (3, 0, 1),
 (3, 0, 2),
 (3, 0, 3),
 (3, 1, 0),
 (3, 1, 1),
 (3, 1, 2),
 (3, 2, 0),
 (3, 2, 1),
 (3, 3, 0),
 (4, 0, 0),
 (4, 0, 1),
 (4, 0, 2),
 (4, 1, 0),
 (4, 1, 1),
 (4, 2, 0),
 (5, 0, 0),
 (5, 0, 1),
 (5, 1, 0),
 (6,

In [13]:
# Hold out part of the cleaned data for testing (80 % training share requested).
# The shuffle uses NumPy's global RNG and no seed is set in the visible cells, so the split differs between runs.
X_train, y_train, X_test, y_test = train_test_split(X, y, train_size=0.8)

In [14]:
# Learn the scaling statistics from the training features only, then reuse them for the test features.
scalar_X = Standarize()
X_train = scalar_X.fit_transform(X_train)
X_test = scalar_X.transform(X_test)
X_test

Unnamed: 0,feature 1,feature 2,feature 3
0,0.906362,0.698293,-0.946915
1,0.186399,-0.384501,0.666389
2,1.025478,-0.960435,-0.464492
3,-0.322520,0.004790,-1.097093
4,0.333605,0.325041,1.203920
...,...,...,...
9980,0.567375,0.422553,1.037949
9981,-0.488812,0.948969,-1.563696
9982,0.167209,0.228802,0.032339
9983,0.103243,0.199117,1.127694


In [15]:
# scalar_y = Standarize()
# y_train = scalar_y.fit_transform(y_train)
# y_test = scalar_y.transform(y_test)
# y_test

In [16]:
# Pull the three scaled training feature columns out by position (assumes exactly three features).
X1, X2, X3 = X_train.iloc[:, 0], X_train.iloc[:, 1], X_train.iloc[:, 2]

In [17]:
# Polynomial design matrix for the training set: one column per exponent triple in
# `powers`, in the same order as `powers`, built with a single stack instead of
# repeatedly prepending columns to a dummy zero column.
X_poly = np.column_stack([(X1**i)*(X2**j)*(X3**k) for i, j, k in powers])

In [18]:
# Train on the expanded training matrix; grad_desc prints the cost every 1000 iterations.
regressor = Linear_regression(degree=degree)
regressor.fit(X_poly, y_train, learning_rate=3e-4, n_iterations=20000)
# Display the learned coefficient column vector.
regressor.theta

0th iteration done. Cost = 41910747047905.555
1000th iteration done. Cost = 267820863.1242825
2000th iteration done. Cost = 90817581.90592209
3000th iteration done. Cost = 53982828.8016932
4000th iteration done. Cost = 38898010.02412281
5000th iteration done. Cost = 30008283.035663035
6000th iteration done. Cost = 23707545.19390737
7000th iteration done. Cost = 18933697.1586302
8000th iteration done. Cost = 15235277.736816315
9000th iteration done. Cost = 12344668.01698563
10000th iteration done. Cost = 10073831.847038811
11000th iteration done. Cost = 8282253.492302696
12000th iteration done. Cost = 6862789.1305120615
13000th iteration done. Cost = 5733146.773709778
14000th iteration done. Cost = 4829899.606439042
15000th iteration done. Cost = 4104038.7830409924
16000th iteration done. Cost = 3517609.186474799
17000th iteration done. Cost = 3041148.1666690605
18000th iteration done. Cost = 2651730.0090569905
19000th iteration done. Cost = 2331470.09933993


array([[ 1.01663548e+03],
       [ 1.98871580e+02],
       [-2.99714745e+03],
       [-6.53072776e+01],
       [ 6.49933157e+02],
       [-3.55285801e+02],
       [ 7.80712098e+04],
       [ 1.70592128e+02],
       [-9.96092368e+02],
       [-7.08037940e+01],
       [ 2.90358456e+02],
       [-1.24028433e+02],
       [ 3.34308940e+04],
       [ 7.19911099e+02],
       [-5.04136278e+01],
       [ 1.04672566e+02],
       [-7.84511045e-01],
       [ 5.94919604e+03],
       [ 1.27250671e+01],
       [ 2.04121874e+01],
       [ 4.05703964e-01],
       [ 5.61623146e+02],
       [-1.75085243e+02],
       [-5.68689813e+00],
       [ 4.22513763e+01],
       [-4.64318766e+00],
       [ 7.55230110e+00],
       [ 8.91687944e+00],
       [ 2.29100572e+02],
       [-3.93825562e+03],
       [ 2.61768280e+01],
       [ 1.86971016e+03],
       [-7.14313605e+02],
       [ 1.82854208e+05],
       [-5.83668670e+02],
       [-3.96737219e+02],
       [ 1.69915541e+03],
       [-1.29375709e+02],
       [ 6.5

In [19]:
# Same column extraction as for training, now on the scaled test features (X1, X2, X3 are overwritten).
X1, X2, X3 = X_test.iloc[:, 0], X_test.iloc[:, 1], X_test.iloc[:, 2]

In [20]:
# Polynomial design matrix for the test split (overwrites the training X_poly):
# one column per exponent triple in `powers`, in the same order as `powers`.
X_poly = np.column_stack([(X1**i)*(X2**j)*(X3**k) for i, j, k in powers])

In [21]:
# Predict on the test design matrix; predict also stores the result on regressor.y_test_pred.
y_pred = regressor.predict(X_poly)

In [22]:
# R2 score on the training and test splits (needs fit and predict to have run first).
regressor.check_performance(y_train, y_test)

Unnamed: 0,Training Data,Testing Data
R2 Score,1.0,1.0


In [23]:
# Half the mean squared training residual recorded by fit; shown as a one-entry Series
# because the residuals are a single-column DataFrame.
regressor.min_cost

target    2.066620e+06
dtype: float64

In [24]:
# Load the data set that is to be labeled with predictions.
# NOTE(review): absolute, machine-specific path — the notebook cannot run elsewhere as-is.
test_df = pd.read_csv(r"C:\Users\anpar\Python\ML_Bootcamp_Aadi\Algo\Test_Datasets\Polynomialdata_test.csv")

In [25]:
# Every column except the first is treated as a feature.
X_new = test_df.iloc[:, 1:]

In [26]:
# Scale with the statistics learned from the training split (no re-fitting).
X_new_scaled = scalar_X.transform(X_new)

In [27]:
# Same column extraction again, now on the scaled new data (X1, X2, X3 are overwritten).
X1, X2, X3 = X_new_scaled.iloc[:, 0], X_new_scaled.iloc[:, 1], X_new_scaled.iloc[:, 2]

In [28]:
# Polynomial design matrix for the new data: one column per exponent triple in
# `powers`, in the same order as `powers`, built with a single stack.
X_new_poly = np.column_stack([(X1**i)*(X2**j)*(X3**k) for i, j, k in powers])

In [29]:
# Sanity check: one row per sample, one column per entry in `powers`.
X_new_poly.shape

(10000, 84)

In [33]:
# Apply the learned coefficients directly to the new design matrix; regressor.y_test_pred is left untouched.
y_new = X_new_poly.dot(regressor.theta)

In [34]:
# Sanity check: a single prediction column with one row per sample.
y_new.shape

(10000, 1)

In [35]:
# Attach the predictions as a 'target' column (mutates test_df in place).
test_df['target'] = y_new

In [36]:
# Display the labeled frame.
test_df

Unnamed: 0,ID,feature 1,feature 2,feature 3,target
0,16588,36.247643,-2.372704,43.835947,564139.736807
1,17513,5.903779,5.460086,-96.149414,954752.146574
2,14448,-37.298157,6.869786,-14.084487,18658.218443
3,13972,-40.080077,2.395066,-43.432941,729886.192919
4,18321,-20.417799,-2.194459,5.737226,343.927439
...,...,...,...,...,...
9995,19691,-13.571554,2.931176,76.902716,216000.937083
9996,10512,-2.015904,-2.827700,4.999604,1239.928607
9997,17741,47.095192,-1.977461,39.044902,911914.449873
9998,18381,-1.808178,-0.693782,57.918628,73387.395018


In [37]:
# Write the labeled data to disk without the row index.
# NOTE(review): absolute, machine-specific output path.
test_df.to_csv(r"C:\Users\anpar\Python\ML_Bootcamp_Aadi\Algo\Test_Datasets\Labeled_Polynomialdata_test.csv", index=False)