In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training data.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where that exact file exists.
df = pd.read_csv(r"C:\Users\anpar\Python\Pandas\Lineardata_train.csv")
# Every column after the first is used as a feature ...
X: pd.core.frame.DataFrame = df.iloc[:, 1:]
# ... and the first column is used as the regression target.
y: pd.core.series.Series = df.iloc[:, 0]

In [3]:
def covar(
    X: pd.core.series.Series,
    y: pd.core.series.Series
) -> float:
    """Return the population covariance of ``X`` and ``y``.

    The value is ``mean((X - mean(X)) * (y - mean(y)))``.  This is
    algebraically the same as ``E[XY] - E[X]E[y]`` but centres the data first,
    so it does not subtract two huge, nearly equal numbers.  The uncentred
    form loses all precision when the mean is large compared with the spread
    and can even yield a zero or negative "variance".

    Parameters
    ----------
    X, y : array-like
        Equally shaped numeric data supporting ``-``, ``*`` and ``np.mean``
        (for example two pandas Series).

    Returns
    -------
    float
        The covariance.  ``covar(X, X)`` is the population variance of ``X``.
        The result is whatever ``np.mean`` returns for the input type, so
        non-Series inputs may produce a non-scalar result.
    """
    x_centered = X - np.mean(X)
    y_centered = y - np.mean(y)
    return np.mean(x_centered * y_centered)


def calc_corr_coeff(
    X: pd.core.series.Series,
    y: pd.core.series.Series
) -> float:
    """Return the Pearson correlation coefficient between ``X`` and ``y``.

    Computed as ``covar(X, y) / (sqrt(covar(X, X)) * sqrt(covar(y, y)))``.

    Parameters
    ----------
    X, y : pandas.Series
        Two numeric series of equal length.

    Returns
    -------
    float
        The correlation coefficient.  If either input has zero variance the
        denominator is zero, so the result is not a finite number.
    """
    # covariance(v, v) is the variance of v, so its square root is the
    # standard deviation.
    std_x = np.sqrt(covar(X, X))
    std_y = np.sqrt(covar(y, y))
    return covar(X, y) / (std_x * std_y)
    

def calc_correlation(
    X: pd.core.frame.DataFrame,
    y: pd.core.series.Series
) -> np.ndarray:
    """Correlate every column of ``X`` with ``y``.

    Iterating a DataFrame yields its column labels; each column is passed to
    ``calc_corr_coeff`` together with ``y``.

    Returns
    -------
    numpy.ndarray
        One correlation coefficient per column, in column order.
    """
    return np.array([calc_corr_coeff(X[column], y) for column in X])


def calc_zscore(feature):
    """Return the z-score of every value in ``feature``.

    Each value has the mean (``np.mean``) subtracted and is then divided by
    the standard deviation reported by ``np.std``.  The result has the same
    shape and container type as ``feature``.
    """
    centred = feature - np.mean(feature)
    return centred / np.std(feature)

In [4]:
def clean_data(X:pd.core.frame.DataFrame,
               y:pd.core.series.Series,
               threshold:float=3.5
              ) -> tuple:
    """Drop every row in which at least one feature is an outlier.

    A value counts as an outlier when the absolute z-score of its column
    (from ``calc_zscore``) is strictly greater than ``threshold``.

    NOTE(review): the 3.5 default is the cutoff the NIST/SEMATECH handbook
    quotes for *modified* (median/MAD based) z-scores.  This function applies
    it to ordinary mean/std z-scores, which is a looser test.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; every column is checked.
    y : pandas.Series or pandas.DataFrame
        Targets carrying the same index labels as ``X``.
    threshold : float, default 3.5
        Absolute z-score above which a value is treated as an outlier.

    Returns
    -------
    tuple
        ``(X_clean, y_clean)`` with the flagged rows removed and the index
        reset to ``0..n-1``.
    """
    # One boolean flag per row, OR-ed across features.  Keeping a mask avoids
    # round-tripping the index labels through a float64 array and re-copying
    # that array once per feature.
    outlier_rows = np.zeros(len(X), dtype=bool)

    for feature_name in X:
        z_score = calc_zscore(X[feature_name])
        outlier_rows |= np.asarray(np.abs(z_score) > threshold)

    # Translate the mask back to labels so X and y are filtered by label.
    drop_labels = X.index[outlier_rows]

    return (
        X.drop(drop_labels).reset_index(drop=True),
        y.drop(drop_labels).reset_index(drop=True),
    )
        
        

In [5]:
def train_test_split(X:pd.core.frame.DataFrame,
                     y:pd.core.frame.DataFrame,
                     train_size:float = None,
                     test_size:float = None
                    ) -> tuple[pd.core.frame.DataFrame, ...]:
    """Randomly split ``X`` and ``y`` into training and testing parts.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series or pandas.DataFrame
        Targets, row-aligned with ``X``.
    train_size, test_size : float, optional
        Fractions in ``[0, 1]``.  At least one must be given.  If only
        ``train_size`` is given the test fraction is ``1 - train_size``.  If
        both are given, ``test_size`` decides the split and the remaining rows
        form the training set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, y_train, X_test, y_test)``, each with a fresh ``0..n-1``
        index.  ``y_train`` and ``y_test`` are always DataFrames so they can be
        used directly in the later matrix arithmetic.

    Raises
    ------
    TypeError
        If neither ``train_size`` nor ``test_size`` is given.
    ValueError
        If the resulting test fraction lies outside ``[0, 1]``.

    Notes
    -----
    The shuffle uses NumPy's global random state; seed it with
    ``np.random.seed`` beforehand for a reproducible split.
    """
    if train_size is None and test_size is None:
        raise TypeError('You must specify either train_size or test_size')
    if test_size is None:
        test_size = 1 - train_size
    if not 0 <= test_size <= 1:
        raise ValueError('test_size (or 1 - train_size) must be between 0 and 1')

    # Shuffle row *positions*, not index labels, so the .iloc lookups below are
    # valid whatever index X and y carry.
    n_samples = len(y)
    positions = np.arange(n_samples)
    np.random.shuffle(positions)

    # Honour the requested fraction instead of a fixed 20 %.
    break_at = int(n_samples * test_size)
    test_positions = positions[:break_at]
    train_positions = positions[break_at:]

    def _as_frame(target):
        # A Series becomes a one-column frame; a DataFrame is returned as is.
        return target.to_frame() if isinstance(target, pd.Series) else target

    X_test, y_test = X.iloc[test_positions], y.iloc[test_positions]
    X_train, y_train = X.iloc[train_positions], y.iloc[train_positions]

    return (
        X_train.reset_index(drop=True),
        _as_frame(y_train.reset_index(drop=True)),
        X_test.reset_index(drop=True),
        _as_frame(y_test.reset_index(drop=True))
    )


In [6]:
class Standarize:
    """Scale data to zero mean and unit standard deviation.

    ``fit`` stores ``X.mean()`` and ``X.std()``; ``transform`` applies
    ``(X - mean) / std`` using those stored statistics.  The statistics are
    whatever the type of ``X`` defines them to be (for a pandas DataFrame:
    per-column mean and per-column sample standard deviation).

    Attributes
    ----------
    mean, std :
        Statistics learned by ``fit``; ``None`` until ``fit`` has been called.
    """

    mean = std = None

    def fit(self, X):
        """Learn the mean and standard deviation of ``X`` and return ``self``."""
        self.mean = X.mean()
        self.std = X.std()
        return self

    def fit_transform(self, X):
        """Fit on ``X`` and return the scaled ``X``."""
        return self.fit(X).transform(X)

    def transform(self, X):
        """Scale ``X`` with the statistics learned by ``fit``.

        Raises
        ------
        TypeError
            If ``fit`` (or ``fit_transform``) has not been called yet.
        """
        # Check the fitted state explicitly.  Catching TypeError around the
        # arithmetic would also relabel unrelated errors (e.g. non-numeric
        # columns) as "not fitted".
        if self.mean is None or self.std is None:
            raise TypeError('No data has been provided to calculate mean and standard deviation')
        return (X - self.mean)/self.std



In [7]:
def grad_desc(X_poly, y_train, learning_rate=0.01, n_iterations=1000):
    """Fit linear-model coefficients with batch gradient descent.

    Minimises ``J(theta) = 1/(2m) * sum((X_poly @ theta - y_train)**2)``,
    whose gradient is ``(1/m) * X_poly.T @ (X_poly @ theta - y_train)``.

    Parameters
    ----------
    X_poly : array-like of shape (m, n)
        Design matrix.  Any bias column must already be included.
    y_train : array-like of shape (m, 1)
        Target values as a column.
    learning_rate : float, default 0.01
        Step size applied to the gradient.
    n_iterations : int, default 1000
        Number of gradient steps.

    Returns
    -------
    theta : numpy.ndarray of shape (n, 1)
        Fitted coefficients, started from ``np.random.randn`` (NumPy's global
        random state).
    residuals :
        ``X_poly.dot(theta) - y_train`` evaluated with the *returned* theta.
    """
    m, n = X_poly.shape

    theta = np.random.randn(n, 1)

    for _ in range(n_iterations):
        # Prediction error of the current coefficients.
        residuals = X_poly.dot(theta) - y_train

        # Gradient of the cost function with respect to theta.
        gradients = (1/m)*X_poly.T.dot(residuals)

        theta -= gradients*learning_rate

    # Recompute the residuals after the last step so they describe the theta
    # that is returned.  This also defines them when n_iterations is 0.
    residuals = X_poly.dot(theta) - y_train

    return theta, residuals
        
        
        

In [8]:
class Linear_regression:
    '''
    Linear regression fitted with gradient descent.

    The methods work on a design matrix that the caller has already built
    (bias column, and any polynomial terms, included).  Features should be
    standardized before fitting and predicting.

    Attributes
    ----------
    degree: int
        Polynomial degree requested at construction (default 1).  It is stored
        only; the methods below do not read it.

    theta: numpy.ndarray[float]
        Bias intercept and coefficient terms; set by ``fit``.

    residuals: numpy.ndarray[float]
        Residuals (predicted minus actual) returned by ``grad_desc``; set by
        ``fit``.

    min_cost: float
        Half the mean of the squared residuals; set by ``fit``.

    y_train_pred: numpy.ndarray[float]
        Predictions for the training design matrix; set by ``fit``.

    y_test_pred: numpy.ndarray[float]
        Predictions from the most recent ``predict`` call.

    Methods
    -------
    fit(X_poly, y_train, learning_rate = 0.01, n_iterations = 1000)
        Train the model.

    predict(X_poly)
        Return predicted target values.

    check_performance(y_train, y_test)
        Return a one-row table of R2 scores for training and testing data.
    '''

    def __init__(self, degree=1):
        self.degree = degree

    def fit(self, X_poly, y_train, learning_rate = 0.01, n_iterations = 1000):
        '''
        Estimate theta for the model

            y_pred = theta_0 + (theta_1)x + (theta_2)x^2 + ...

        by running ``grad_desc`` on the given design matrix and targets.
        '''
        fitted = grad_desc(
            X_poly,
            y_train,
            learning_rate=learning_rate,
            n_iterations=n_iterations,
        )
        self.theta, self.residuals = fitted

        # Cost function J = 1/(2m) * sum(residual^2) = 0.5 * mean(residual^2)
        squared_residuals = self.residuals ** 2
        self.min_cost = 0.5 * squared_residuals.mean()

        # Fitted values for the data the model was trained on
        self.y_train_pred = X_poly.dot(self.theta)

    def predict(self, X_poly):
        '''Return X_poly.dot(theta); the result is also kept in y_test_pred.'''
        predictions = X_poly.dot(self.theta)
        self.y_test_pred = predictions
        return self.y_test_pred

    def check_performance(self, y_train, y_test):
        '''Return the R2 score of the stored training and testing predictions.'''
        train_r2 = r2_score(y_train, self.y_train_pred)
        test_r2 = r2_score(y_test, self.y_test_pred)
        return pd.DataFrame(
            {'Training Data': [train_r2], 'Testing Data': [test_r2]},
            index=['R2 Score'],
        )



In [9]:
def r2_score(
    y:np.ndarray[float],
    y_pred:np.ndarray[float]
) -> float:
    """Coefficient of determination of ``y_pred`` with respect to ``y``.

    Computed as ``1 - mean((y - y_pred)**2) / covar(y, y)``: one minus the
    ratio of the mean squared residual to the variance of the actual values.
    """
    # Mean squared residual (twice the gradient-descent cost value).
    squared_errors = (y - y_pred)**2
    mean_squared_error = np.mean(squared_errors)

    # The covariance of y with itself is its variance.
    return 1 - mean_squared_error/covar(y, y)

In [10]:
# Remove rows containing outliers (default z-score threshold of 3.5).
# NOTE: this rebinds X and y, so re-running the cell filters the already
# filtered data a second time.
X, y = clean_data(X, y)

In [11]:
# Shuffle and split into train/test parts, asking for a 20% test share.
# y_train / y_test come back as single-column DataFrames.
# NOTE(review): no random seed is set in the visible cells, so the split
# changes between runs.
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
X_train

Unnamed: 0,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,feature 10,feature 11,feature 12,feature 13,feature 14,feature 15,feature 16,feature 17,feature 18,feature 19,feature 20
0,-0.933866,-0.271743,4.297027,-2.689633,3.527118,8.493711,6.322748,7.119918,24.004558,2.048310,3.059854,-7.971563,25.261327,4.020216,77.999324,-219.431183,-22.881631,4.382672,-56.601876,-23.487795
1,0.974900,2.201608,-9.131526,-2.506906,-0.082792,-5.282314,-3.123291,-4.137219,39.058464,-2.631775,12.338929,-7.306256,27.753468,7.280983,-33.654993,-259.383275,25.481158,32.268814,13.061557,-4.518072
2,0.302104,-1.935321,7.002041,1.552446,1.978466,-0.756413,7.447980,2.872463,2.239483,4.647549,-9.825635,-0.918782,4.893204,0.818156,102.025298,253.957200,-8.899273,6.522940,-37.498308,-7.001502
3,-0.967965,0.439292,7.477737,-1.796307,1.788112,-3.108480,11.248671,-2.393599,-23.152514,-1.068504,-12.429368,8.028816,-13.611528,-5.199498,54.362539,0.536410,-26.625083,0.711405,-9.562017,34.214814
4,-0.039989,2.613206,-7.171607,-2.412543,9.296994,5.145716,-0.708243,11.112595,-3.598140,-4.212489,10.971123,4.439491,-9.772215,-9.164360,-3.737449,-180.218557,1.386503,15.081092,-53.440308,-37.141170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39622,0.403935,1.299692,-1.446872,1.307457,0.339518,1.114038,19.260068,-4.715849,26.506694,-11.774724,11.334822,-15.027058,13.321738,-7.668021,9.744370,-237.726724,29.244413,11.944964,-26.987630,-5.459840
39623,-1.228994,-2.821468,-2.736472,2.573112,-2.952344,8.838214,11.208278,34.095162,41.144354,2.043327,-8.963330,3.753341,-13.587860,27.383559,73.260788,230.199950,46.552958,-5.920517,56.745520,-1.755838
39624,-1.139966,3.103780,2.796686,-0.745948,1.885548,0.797683,4.964427,-8.781012,21.680760,-6.149210,4.199720,2.023388,-0.374744,3.050623,48.399043,-114.891651,10.191332,23.431341,-80.374871,16.038082
39625,1.144112,-1.729375,-1.018486,-0.724338,0.123934,-4.446755,7.131620,-0.352710,-3.605517,-6.093751,11.269558,-11.618580,22.319503,11.685977,25.434327,-140.339747,5.785966,26.318071,49.940362,-5.303336


In [13]:
# Learn the scaling statistics from the training features only, then reuse
# them for the test features so no test information enters the fit.
scalar_X = Standarize()
X_train = scalar_X.fit_transform(X_train)
X_test = scalar_X.transform(X_test)

In [14]:
# Prepend a column of ones so the first coefficient acts as the intercept.
X_train_poly = np.c_[np.ones(X_train.shape[0]), X_train]
X_test_poly = np.c_[np.ones(X_test.shape[0]), X_test]

In [15]:
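# Optional target scaling (disabled). If re-enabled, fit the scaler on y_train
# only and reuse it for y_test; refitting on y_test would scale the two sets
# with different statistics.
# scalar_y = Standarize()
# y_train = scalar_y.fit_transform(y_train)
# y_test = scalar_y.transform(y_test)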

In [16]:
# Plain linear model: degree is left at its default of 1.
regressor = Linear_regression()

In [17]:
# Train with the default learning rate (0.01) and iteration count (1000),
# then predict the held-out targets.
regressor.fit(X_train_poly, y_train)
y_pred = regressor.predict(X_test_poly)

In [18]:
# R2 on training and held-out data. This relies on the predictions stored by
# the fit() and predict() calls above.
regressor.check_performance(y_train, y_test)

Unnamed: 0,Training Data,Testing Data
R2 Score,1.0,1.0


In [19]:
y_test

Unnamed: 0,target
0,-6021.885989
1,-10179.879559
2,15507.634381
3,-13924.461079
4,9939.295508
...,...
9901,-24584.310947
9902,-13821.243159
9903,2165.503562
9904,-7426.001664


In [20]:
pd.DataFrame(y_pred)

Unnamed: 0,0
0,-6021.657801
1,-10179.115120
2,15506.902526
3,-13923.896765
4,9938.943468
...,...
9901,-24583.209784
9902,-13820.701977
9903,2165.665357
9904,-7425.688796


In [21]:
regressor.min_cost

target    0.131911
dtype: float64

In [87]:
# Load the unlabeled data to score.
# NOTE(review): absolute, machine-specific path; the execution count also jumps
# from 21 to 87 here, so this part relies on the kernel state built above.
test_df = pd.read_csv(r"C:\Users\anpar\Python\ML_Bootcamp_Aadi\Algo\Test_Datasets\Lineardata_test.csv")

In [88]:
# Skip the first column (the ID column in the output below) and keep the rest
# as features.
X_new = test_df.iloc[:, 1:]

In [89]:
# Scale with the statistics learned from the training features (no refit).
X_new_scaled = scalar_X.transform(X_new)

In [90]:
# Same bias column of ones as used for the training design matrix.
X_new_poly = np.c_[np.ones(X_new_scaled.shape[0]), X_new_scaled]

In [91]:
# NOTE: predict() also overwrites regressor.y_test_pred, so a later
# check_performance() call would use these predictions, not the hold-out ones.
y_new = regressor.predict(X_new_poly)

In [92]:
# Store the predictions next to the inputs as a new 'target' column.
test_df['target'] = y_new

In [93]:
test_df

Unnamed: 0,ID,feature 1,feature 2,feature 3,feature 4,feature 5,feature 6,feature 7,feature 8,feature 9,...,feature 12,feature 13,feature 14,feature 15,feature 16,feature 17,feature 18,feature 19,feature 20,target
0,14189,0.634260,-0.771727,4.297694,-2.022710,0.487001,2.316767,11.626397,-10.657142,9.254734,...,7.035966,5.828558,8.596358,-16.510202,-132.396813,12.213839,21.383123,-37.841857,18.892385,-5419.766244
1,14742,0.456128,3.154433,0.182006,-0.730634,-5.759530,1.448589,-10.292540,20.441249,9.943525,...,20.762134,-4.117988,-12.677107,-40.923007,367.557009,21.607952,-1.207279,-28.154185,-3.603744,17533.842685
2,13905,0.390815,-5.278284,1.625943,-1.775914,-2.461865,1.439888,6.213269,-17.212310,-17.308743,...,6.588928,27.017658,-1.806317,-18.574372,-39.807081,-18.389553,7.466803,49.904659,46.990062,4663.630202
3,10072,1.067949,4.851061,3.909243,2.005500,2.880720,1.054944,-1.652801,-4.267849,3.860784,...,5.748974,6.949329,-2.369054,-6.953435,23.932892,-1.575264,-34.352606,16.941476,5.104177,2675.303255
4,14879,-0.589611,-3.533016,-1.705736,2.063013,-0.738696,2.475443,-4.982819,-19.693204,0.660739,...,-11.731821,-8.069874,-5.881325,-26.060090,244.017215,-23.257490,9.819982,-34.672912,9.324556,3383.641048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,19851,1.967136,-6.819440,3.597360,-0.482508,4.679231,0.861442,8.810749,-20.384415,20.427585,...,12.767614,52.530303,20.517596,-42.428871,35.227784,-9.244004,-18.759314,-42.260514,-2.261597,4197.817956
9996,11818,-0.656285,-3.294604,12.222256,-0.184108,8.286414,-6.437312,-19.611691,-13.487207,23.986240,...,-15.720693,26.401396,11.421553,-104.558180,-318.607244,-7.446830,-6.457686,12.002813,-20.317446,-16011.348555
9997,15552,-0.349932,4.611955,-2.814735,1.851754,-2.354195,6.639817,-25.299515,-3.145723,-41.248652,...,-3.216492,-17.299449,-8.975811,59.531358,244.333157,28.656620,11.294024,1.441938,4.395482,9436.383773
9998,12161,0.295833,1.825552,-8.824731,2.119270,-4.629213,-2.354794,0.919091,52.204003,7.350877,...,11.686231,7.259193,7.733906,54.680591,-472.335102,-24.281308,-16.099344,10.744757,6.578695,-15188.569743


In [94]:
# Write the labeled rows to disk without the DataFrame index.
# NOTE(review): absolute, machine-specific output path.
test_df.to_csv(r"C:\Users\anpar\Python\ML_Bootcamp_Aadi\Algo\Test_Datasets\Labeled_Lineardata_test.csv", index=False)