In [None]:
# This Jupyter Notebook has the code to construct linear regression and polynomial regression models on the S & P
# data that was scraped in the previous Jupyter Notebook

In [None]:
# This code block imports python libraries

from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from collections import defaultdict
from bs4 import BeautifulSoup
import requests
from IPython.core.display import display, HTML
import pickle
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
%matplotlib inline

In [None]:
# This code block sets the display parameters for pandas dataframes in this notebook

# Show up to 63 columns and 505 rows, and print floats with 3 decimal places
for Option_Name, Option_Value in (
    ('display.max_columns', 63),
    ('display.max_rows', 505),
    ('display.precision', 3),
):
    pd.set_option(Option_Name, Option_Value)

In [None]:
# This code block reads the pickle file from the previous Jupyter notebook to recreate the full dataframe

# If you want to do regression analysis for multiple different time spans, you can run the scrape multiple times,
# and give each dataframe pkl file a different name (advisably a name indicating the start and end dates)

# Set the name of the file to match the dataframe you want to analyze

# NOTE(review): pickle.load can run arbitrary code while loading, so only point this at
# a pickle file written by the scraping notebook (or another source you trust)
Pickle_File_Name = 'Full_S_and_P_DF.pkl'

with open(Pickle_File_Name, mode='rb') as Pickle_File:
    Full_S_and_P_DF = pickle.load(Pickle_File)

In [None]:
# To view the dataframe, de-commentify the code line below

#display(Full_S_and_P_DF)

In [None]:
# This code block creates a function to remove commas from a string/object (will help convert it into a float)

def NoCommas(string):
    """Return the elements of `string` joined into one str with every "," left out.

    Elements are read by position (0 .. len(string) - 1) and each kept element is
    passed through str() before joining, e.g. "1,234.5" becomes "1234.5".
    """
    return "".join(
        str(string[Position])
        for Position in range(len(string))
        if not (string[Position] == ",")
    )

In [None]:
# This code block converts numbers saved as non-float values to float variables for the financial data
# The Industry one-hot encoded variable columns end at 'Materials' and the financial variables start right after

List_of_Columns = list(Full_S_and_P_DF.columns)

Row_Index = 0
# First column after the one-hot encoded industry columns (which end at 'Materials')
Column_Index_Start = (List_of_Columns.index('Materials') + 1)
Column_Index = Column_Index_Start
Number_Of_Dashes = 0
Number_Of_Empty_Strings = 0
Changed_To_Float = 0

for Column in List_of_Columns[Column_Index_Start:]:

    # Rows are addressed by the labels 0 .. len - 1, exactly as the original
    # Full_S_and_P_DF[Column][Row_Index] lookups did
    for Row_Index in range(len(Full_S_and_P_DF)):

        Value = Full_S_and_P_DF.at[Row_Index, Column]

        # Missing values and values that are already floats need no conversion
        # (isinstance also accepts numpy.float64, which `type(...) is float` rejected)
        if(pd.isna(Value) or isinstance(Value, float)):
            continue

        # "-" and "" are placeholders for "no value" and are left as they are
        if((Value == "-") or (Value == "")):
            continue

        # Write through .at: the chained form DF[Column][Row_Index] = ... may assign to a
        # temporary copy and leave the dataframe unchanged
        Full_S_and_P_DF.at[Row_Index, Column] = float(NoCommas(Value))
        #Changed_To_Float += 1
        #if(Changed_To_Float < 30):
            #print(Column, Row_Index, Full_S_and_P_DF.at[Row_Index, Column], type(Full_S_and_P_DF.at[Row_Index, Column]))

# If you want to see the first 30 values converted to float, de-commentify the commented lines above
    
# If you want to see the first 30 values converted to float, de-commentify the commented lines in the else statement

In [None]:
# This code block creates a dictionary to check how many float values are in each column of financial data

Row_Index = 0
Column_Index = Column_Index_Start

# Maps each financial column name to the number of float values it holds
Column_Dict = defaultdict(int)

for Column in List_of_Columns[Column_Index_Start:]:

    Floats_In_Column = 0

    for Row_Index in range(len(Full_S_and_P_DF)):

        Value = Full_S_and_P_DF.at[Row_Index, Column]

        # Missing values and the "-" / "" placeholders are not counted
        if(pd.isna(Value) or (Value == "-") or (Value == "")):
            continue

        # Anything else that is not yet a float is converted first; .at writes into the
        # dataframe itself, whereas DF[Column][Row_Index] = ... may only change a copy
        if(not isinstance(Value, float)):
            Full_S_and_P_DF.at[Row_Index, Column] = float(NoCommas(Value))

        Floats_In_Column += 1

    Column_Dict[Column] = Floats_In_Column

# To see the dictionary, de-commentify the print statement below
#print(Column_Dict)

In [None]:
# This code block drops columns with no float values

# Columns whose float count is zero carry no usable financial data
Columns_Without_Floats = [
    Column_Name
    for Column_Name, Float_Count in Column_Dict.items()
    if Float_Count == 0
]

Full_S_and_P_DF = Full_S_and_P_DF.drop(columns=Columns_Without_Floats)

In [None]:
# To view the information on the columns and data types for this dataframe, de-commentify the code line below

#Full_S_and_P_DF.info()

In [None]:
# The financial data features scraped from Income Statements inevitably contain a high degree of collinearity
# This is because some fields are used to calculate other fields
# For most companies, the format is:

# Total Revenue - Cost of Revenue = Gross Profit
# Gross Profit - Operating Expenses = Operating Income
# Operating Income + or - (Non-Operating Expenses/Income from Interest, Capital Gains/Losses, etc.) = Pre-Tax Income
# Pre-Tax Income - (Tax Provision or Income Tax Expense) = Net Income

# These fields are populated for the vast majority of companies in this data set
# Some companies have null Gross Profit or Operating Income fields (particularly financial or banking companies)
# The missing fields can be substituted with pseudo-values based on Total Revenue and Pre-Tax Income

In [None]:
# This code block creates a new dataframe with selected fields in order to perform regression analysis 

Regression_Columns_1 = [
    # S&P weight followed by the one-hot encoded industry columns
    'Weight',
    'Information Technology', 'Consumer Discretionary', 'Financials', 'Health Care',
    'Consumer Staples', 'Energy', 'Telecommunication Services', 'Industrials',
    'Utilities', 'Real Estate', 'Materials',
    # Income statement fields
    'Total Revenue', 'Gross Profit', 'Operating Income', 'Pretax Income',
    'Net Income Common Stockholders',
    # Regression target
    'Percent Increase',
]

# filter() keeps the listed columns in this order; copy() detaches the result from the full dataframe
Regression_DF_1 = Full_S_and_P_DF.filter(items=Regression_Columns_1).copy()

# Rescale the target by a factor of 100
Regression_DF_1['Percent Increase'] = (100 * Regression_DF_1['Percent Increase'])

# To view this dataframe, de-commentify the code line below

#display(Regression_DF_1)

In [None]:
# This code block drops rows that have no value for the target column (Percent Increase)
# It also drops rows that have no value for the Total Revenue, Pretax Income, or Net Income fields

# Every row must have a usable number in each of these columns
Required_Columns = ["Total Revenue", "Pretax Income", "Net Income Common Stockholders", "Percent Increase"]

# Anything that cannot be read as a number (NaN, None, "-", "") counts as missing.
# The mask is computed for all rows up front: dropping rows inside a loop bounded by
# len(Regression_DF_1) shrinks the bound and leaves the last rows unchecked
Is_Missing = (
    Regression_DF_1[Required_Columns]
    .apply(pd.to_numeric, errors='coerce')
    .isna()
    .any(axis=1)
)

# Drop by index label, in place, so the surviving rows keep their values and dtypes
Regression_DF_1.drop(index=Is_Missing[Is_Missing].index, inplace=True)
    
# To view how many rows are left after some are dropped, de-commentify the code line below

#print(len(Regression_DF_1))

In [None]:
# This code block resets the index so there are no index numbers skipped for the dropped rows

# Renumber the rows 0 .. n-1; the old index labels are discarded rather than kept as a column
Regression_DF_1 = Regression_DF_1.reset_index(drop=True)

In [None]:
# To view the dataframe, de-commentify the code line below

#display(Regression_DF_1)

In [None]:
# This code block builds lists holding, for each company, Cost of Revenue, Operating Expenses, and
# Other Expenses as a proportion of the total expenses going from Total Revenue to Pretax Income
# This will be used to fill in pseudo-values for "Gross Profit" and "Operating Income" in rows missing those fields

Cost_Of_Revenue_Proportion_List = []
Operating_Expenses_Proportion_List = []
Other_Proportion_List = []

for Row_Index in range(len(Regression_DF_1)):

    Gross_Profit = Regression_DF_1.at[Row_Index, "Gross Profit"]
    Operating_Income = Regression_DF_1.at[Row_Index, "Operating Income"]

    # Only rows that report both fields as real numbers contribute.
    # isinstance is used because `type(...) is float` rejects numpy.float64, which would
    # leave all three lists empty whenever the columns have a numeric dtype
    if(not (isinstance(Gross_Profit, float) and isinstance(Operating_Income, float))):
        continue
    if(np.isnan(Gross_Profit) or np.isnan(Operating_Income)):
        continue

    Total_Rev = Regression_DF_1.at[Row_Index, "Total Revenue"]
    Pretax_Income = Regression_DF_1.at[Row_Index, "Pretax Income"]

    Total_Pre_Tax_Costs = Total_Rev - Pretax_Income

    # A row with no pre-tax costs has no defined proportions (division by zero)
    if(Total_Pre_Tax_Costs == 0):
        continue

    # The three parts below add up to Total_Pre_Tax_Costs
    Total_Other = Operating_Income - Pretax_Income
    Total_Operating_Expenses = Gross_Profit - Operating_Income
    Total_Cost_Of_Revenue = Total_Rev - Gross_Profit

    Cost_Of_Revenue_Proportion_List.append((Total_Cost_Of_Revenue/Total_Pre_Tax_Costs))
    Operating_Expenses_Proportion_List.append((Total_Operating_Expenses/Total_Pre_Tax_Costs))
    Other_Proportion_List.append((Total_Other/Total_Pre_Tax_Costs))

In [None]:
# This code block checks how many rows had values for gross profit and operating expense
# All 3 printed values should be equal
# To check this, de-commentify the print statements below

#print(len(Cost_Of_Revenue_Proportion_List))
#print(len(Operating_Expenses_Proportion_List))
#print(len(Other_Proportion_List))

In [None]:
# This code block calculates the average of each cost/expense category as a percent of total costs/expenses
# To view each proportion de-commentify the print statements

# Arithmetic mean of each proportion list, computed in the same order as the lists were built
(
    Average_Cost_Of_Revenue_Proportion,
    Average_Operating_Expenses_Proportion,
    Average_Other_Proportion,
) = (
    (sum(Proportion_List)/len(Proportion_List))
    for Proportion_List in (
        Cost_Of_Revenue_Proportion_List,
        Operating_Expenses_Proportion_List,
        Other_Proportion_List,
    )
)

#print(Average_Cost_Of_Revenue_Proportion)
#print(Average_Operating_Expenses_Proportion)
#print(Average_Other_Proportion)

In [None]:
# This code block checks to make sure the average proportions add up to approximately 1
# To check this sum, and make sure it is approximately 1, de-commentify the print statement below

#print((Average_Cost_Of_Revenue_Proportion + Average_Operating_Expenses_Proportion + Average_Other_Proportion))

In [None]:
# This code block fills in pseudo-values for "Gross Profit" and "Operating Income" where those are missing

for Row_Index in range(len(Regression_DF_1)):

    Total_Revenue = Regression_DF_1.at[Row_Index, "Total Revenue"]
    Total_Pre_Tax_Cost = Total_Revenue - Regression_DF_1.at[Row_Index, "Pretax Income"]

    # A field is treated as missing when it is a leftover string or a null value
    # (pd.isna also covers None, on which np.isnan raises)
    Gross_Profit = Regression_DF_1.at[Row_Index, "Gross Profit"]
    if(isinstance(Gross_Profit, str) or pd.isna(Gross_Profit)):

        Pseudo_Cost_Of_Revenue = (Total_Pre_Tax_Cost * Average_Cost_Of_Revenue_Proportion)
        Gross_Profit = Total_Revenue - Pseudo_Cost_Of_Revenue
        # .at writes into the dataframe itself; DF[col][row] = ... may only change a copy
        Regression_DF_1.at[Row_Index, "Gross Profit"] = Gross_Profit

    Operating_Income = Regression_DF_1.at[Row_Index, "Operating Income"]
    if(isinstance(Operating_Income, str) or pd.isna(Operating_Income)):

        Pseudo_Operating_Expenses = (Total_Pre_Tax_Cost * Average_Operating_Expenses_Proportion)
        # Uses the Gross Profit just filled in above when that field was missing as well
        Regression_DF_1.at[Row_Index, "Operating Income"] = Gross_Profit - Pseudo_Operating_Expenses

In [None]:
# To view the dataframe, de-commentify the code line below

#display(Regression_DF_1)

In [None]:
# To view dataframe column and value type information, de-commentify the code line below
# If some columns have non-numeric objects data types, they must be converted to numeric

#Regression_DF_1.info()

In [None]:
# This code block converts object variables to floats

Regression_1_Columns = list(Regression_DF_1.columns)

# Convert every column from "Total Revenue" through the last one; values that cannot be
# parsed as numbers become NaN (errors='coerce')
First_Financial_Position = Regression_1_Columns.index("Total Revenue")

for Column in Regression_1_Columns[First_Financial_Position:]:
    Regression_DF_1[Column] = pd.to_numeric(Regression_DF_1[Column], errors='coerce')

In [None]:
# To view dataframe column and value type information, de-commentify the code line below
# All columns should now have numeric data types

#Regression_DF_1.info()

In [None]:
# To view correlation strength between variables, de-commentify the data frame below

#Regression_DF_1.corr()

In [None]:
# To view aggregate stats for each column, de-commentify the data frame below

#Regression_DF_1.describe()

In [None]:
# This section of code blocks begins to create linear regression models
# The following 5 code blocks are for first linear regression

In [None]:
# This code block identifies the features and target for a linear regression
# This code block also splits the data into test and train segments

# Features
# Target: the 'Percent Increase' column
Y_1 = Regression_DF_1['Percent Increase']

# Features: every column from 'Weight' through 'Net Income Common Stockholders' (label slice, both ends included)
X_1 = Regression_DF_1.loc[:, 'Weight':'Net Income Common Stockholders']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_Train_1, X_Test_1, Y_Train_1, Y_Test_1 = train_test_split(
    X_1,
    Y_1,
    test_size=0.2,
    random_state=1,
)

In [None]:
# This code block trains and scores a linear regression model on the training segment of the data
# It is scored using the R Squared metric

# Create an empty model
# fit() returns the estimator itself, so the model can be created and trained in one statement
lr_1 = LinearRegression().fit(X_Train_1, Y_Train_1)

# R^2 on the same rows the model was fitted on
lr_1.score(X_Train_1, Y_Train_1)

In [None]:
# This code block scores the model on the test data

# R^2 on the held-out 20% of rows that were not used for fitting
lr_1.score(X_Test_1, Y_Test_1)

In [None]:
# To view the coefficient array for this regression, de-commentify the line of code below

#lr_1.coef_

In [None]:
# To view the Y intercept array for this regression, de-commentify the line of code below

#lr_1.intercept_

In [None]:
# This section of code blocks use the same feature columns and target as the first linear regression
# This next regression however allows for polynomial features
# This allows for a potentially better R-Squared score, but increases the risk of overfitting to the train data
# Regularization can help with the overfitting problem

In [None]:
# This code block introduces polynomial features 

pf_1 = PolynomialFeatures(degree=2)

# fit_transform learns the feature combinations from X_1 and expands X_1 in one call
X_1_Transformed = pf_1.fit_transform(X_1)

In [None]:
# This code block performs a test-train split

# 80/20 split of the expanded features; random_state=2, so the rows differ from the linear model's split
(
    X_1_Transformed_Train,
    X_1_Transformed_Test,
    Y_Train_1_PF,
    Y_Test_1_PF,
) = train_test_split(X_1_Transformed, Y_1, test_size=0.2, random_state=2)

In [None]:
# This code block fits and scores a polynomial regression on the train data

# Ordinary least squares on the polynomial features (no regularization yet)
lr_1_Transformed = LinearRegression().fit(X_1_Transformed_Train, Y_Train_1_PF)

# R^2 on the training rows
lr_1_Transformed.score(X_1_Transformed_Train, Y_Train_1_PF)

In [None]:
# This code block scores the polynomial regression on the test data

# R^2 of the unregularized degree-2 polynomial model on its held-out test rows
lr_1_Transformed.score(X_1_Transformed_Test, Y_Test_1_PF)

# Non-regularized polynomial features fail to produce a better test R^2 score

In [None]:
# The next few code blocks apply regularization to the polynomial regression

In [None]:
# This code block standardizes the feature columns so that they are all on the same scale
# The scaler learns each column's mean and standard deviation from the training rows only,
# so the training columns end up with mean 0 and standard deviation 1 and no information
# about the test rows leaks into the fitted models

std = StandardScaler()
std.fit(X_1_Transformed_Train)
X_1_Transformed_Train_Scaled = std.transform(X_1_Transformed_Train)
X_1_Transformed_Test_Scaled = std.transform(X_1_Transformed_Test)

In [None]:
# The next 3 code blocks apply Ridge Regularization

In [None]:
# This code block finds a good value for the Alpha parameter for Ridge regularization
# To view the progress of optimizing alpha values, de-commentify the print statements

# NOTE(review): alpha is tuned against the test split, so the test R^2 reported for the final
# model is optimistically biased; RidgeCV or a separate validation split would avoid this

Max_Doublings = 60   # hard stop for the coarse search in case the test score never drops

Alpha = 1
Prior_Alpha = 0
Best_Alpha_Passed = False
Iterations = 0

Current_Test_Score = 0
Prior_Test_Score = 0

#print("Alpha   |      Test Score")
#print("________|________________")

# Coarse search: try alpha = 1, 2, 4, ... until the test score falls below the previous one
while(((Iterations < 2) or (Best_Alpha_Passed == False)) and (Iterations < Max_Doublings)):

    if(Iterations > 0):
        Prior_Test_Score = Current_Test_Score
        Prior_Alpha = Alpha
        Alpha = (2 * Alpha)

    lr_1_Transformed_Ridge = Ridge(alpha = Alpha)
    lr_1_Transformed_Ridge.fit(X_1_Transformed_Train_Scaled, Y_Train_1_PF)
    Train_Score = lr_1_Transformed_Ridge.score(X_1_Transformed_Train_Scaled, Y_Train_1_PF)

    Current_Test_Score = lr_1_Transformed_Ridge.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

    #print(Alpha, ":         ", Current_Test_Score)

    if(Iterations > 0):
        if(Current_Test_Score < Prior_Test_Score):
            Best_Alpha_Passed = True

    Iterations += 1

# Fine search: a window [Low_Alpha, High_Alpha] centred on Mid_Alpha is moved towards the
# better-scoring edge and halved whenever the centre scores at least as well as that edge
Low_Alpha = max(0, ((Prior_Alpha/2) - 1))
High_Alpha = Alpha
Mid_Alpha = ((Low_Alpha + High_Alpha)/2)
Increment = (High_Alpha - Mid_Alpha)

while(Increment > 0.01):

    lr_1_Transformed_Ridge = Ridge(alpha = Low_Alpha)
    lr_1_Transformed_Ridge.fit(X_1_Transformed_Train_Scaled, Y_Train_1_PF)
    Low_Alpha_Test_Score = lr_1_Transformed_Ridge.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

    lr_1_Transformed_Ridge = Ridge(alpha = Mid_Alpha)
    lr_1_Transformed_Ridge.fit(X_1_Transformed_Train_Scaled, Y_Train_1_PF)
    Mid_Alpha_Test_Score = lr_1_Transformed_Ridge.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

    #print(Mid_Alpha, ":         ", Mid_Alpha_Test_Score)

    lr_1_Transformed_Ridge = Ridge(alpha = High_Alpha)
    lr_1_Transformed_Ridge.fit(X_1_Transformed_Train_Scaled, Y_Train_1_PF)
    High_Alpha_Test_Score = lr_1_Transformed_Ridge.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

    if(High_Alpha_Test_Score > Low_Alpha_Test_Score):
        if(High_Alpha_Test_Score <= Mid_Alpha_Test_Score):
            Increment = (Increment/2)
        Mid_Alpha += Increment
    elif(High_Alpha_Test_Score < Low_Alpha_Test_Score):
        if(Low_Alpha_Test_Score <= Mid_Alpha_Test_Score):
            Increment = (Increment/2)
        Mid_Alpha -= Increment
    else:
        Increment = (Increment/2)

    # Keep the window at or above 0, matching the initial max(0, ...) clamp; a negative
    # alpha is rejected by the estimator
    Mid_Alpha = max(0, Mid_Alpha)
    Low_Alpha = max(0, (Mid_Alpha - Increment))
    High_Alpha = (Mid_Alpha + Increment)

Ridge_Alpha_1 = Mid_Alpha
#print("________________________________")
#print("Final Ridge Alpha:", Ridge_Alpha_1)

In [None]:
# This code block uses the Alpha parameter just found to apply Ridge regularization

# Refit with the selected alpha (fit() returns the estimator itself)
lr_1_Transformed_Ridge = Ridge(alpha=Ridge_Alpha_1).fit(
    X_1_Transformed_Train_Scaled,
    Y_Train_1_PF,
)

# R^2 on the training rows
lr_1_Transformed_Ridge.score(X_1_Transformed_Train_Scaled, Y_Train_1_PF)

In [None]:
# This code block scores the Ridge-regularized model on the test data
# NOTE(review): the alpha search used these same test rows, so this R^2 is not an unbiased estimate

lr_1_Transformed_Ridge.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

In [None]:
# The next 3 code blocks apply Lasso Regularization

In [None]:
# This code block finds a good value for the Alpha parameter for Lasso regularization
# To view the progress of optimizing alpha values, de-commentify the print statements

# NOTE(review): alpha is tuned against the test split, so the test R^2 reported for the final
# model is optimistically biased; LassoCV or a separate validation split would avoid this

Max_Doublings = 60   # hard stop for the coarse search in case the test score never drops

Alpha = 1
Prior_Alpha = 0
Best_Alpha_Passed = False
Iterations = 0

Current_Test_Score = 0
Prior_Test_Score = 0

#print("Alpha   |      Test Score")
#print("________|________________")

# Coarse search: try alpha = 1, 2, 4, ... until the test score falls below the previous one
while(((Iterations < 2) or (Best_Alpha_Passed == False)) and (Iterations < Max_Doublings)):

    if(Iterations > 0):
        Prior_Test_Score = Current_Test_Score
        Prior_Alpha = Alpha
        Alpha = (2 * Alpha)

    lr_1_Transformed_Lasso = Lasso(alpha = Alpha)
    lr_1_Transformed_Lasso.fit(X_1_Transformed_Train_Scaled, Y_Train_1_PF)
    Train_Score = lr_1_Transformed_Lasso.score(X_1_Transformed_Train_Scaled, Y_Train_1_PF)

    Current_Test_Score = lr_1_Transformed_Lasso.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

    #print(Alpha, ":         ", Current_Test_Score)

    if(Iterations > 0):
        if(Current_Test_Score < Prior_Test_Score):
            Best_Alpha_Passed = True

    Iterations += 1

# Fine search: a window [Low_Alpha, High_Alpha] centred on Mid_Alpha is moved towards the
# better-scoring edge and halved whenever the centre scores at least as well as that edge
Low_Alpha = max(0, ((Prior_Alpha/2) - 1))
High_Alpha = Alpha
Mid_Alpha = ((Low_Alpha + High_Alpha)/2)
Increment = (High_Alpha - Mid_Alpha)

while(Increment > 0.01):

    lr_1_Transformed_Lasso = Lasso(alpha = Low_Alpha)
    lr_1_Transformed_Lasso.fit(X_1_Transformed_Train_Scaled, Y_Train_1_PF)
    Low_Alpha_Test_Score = lr_1_Transformed_Lasso.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

    lr_1_Transformed_Lasso = Lasso(alpha = Mid_Alpha)
    lr_1_Transformed_Lasso.fit(X_1_Transformed_Train_Scaled, Y_Train_1_PF)
    Mid_Alpha_Test_Score = lr_1_Transformed_Lasso.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

    #print(Mid_Alpha, ":         ", Mid_Alpha_Test_Score)

    lr_1_Transformed_Lasso = Lasso(alpha = High_Alpha)
    lr_1_Transformed_Lasso.fit(X_1_Transformed_Train_Scaled, Y_Train_1_PF)
    High_Alpha_Test_Score = lr_1_Transformed_Lasso.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

    if(High_Alpha_Test_Score > Low_Alpha_Test_Score):
        if(High_Alpha_Test_Score <= Mid_Alpha_Test_Score):
            Increment = (Increment/2)
        Mid_Alpha += Increment
    elif(High_Alpha_Test_Score < Low_Alpha_Test_Score):
        if(Low_Alpha_Test_Score <= Mid_Alpha_Test_Score):
            Increment = (Increment/2)
        Mid_Alpha -= Increment
    else:
        Increment = (Increment/2)

    # Keep the window at or above 0, matching the initial max(0, ...) clamp; a negative
    # alpha is rejected by the estimator
    Mid_Alpha = max(0, Mid_Alpha)
    Low_Alpha = max(0, (Mid_Alpha - Increment))
    High_Alpha = (Mid_Alpha + Increment)


Lasso_Alpha_1 = Mid_Alpha
#print("________________________________")
#print("Final Lasso Alpha:", Lasso_Alpha_1)

In [None]:
# This code block uses the Alpha parameter just found to apply Lasso regularization

# Refit the Lasso model with the selected alpha (fit() returns the estimator itself)
lr_1_Transformed_Lasso = Lasso(alpha=Lasso_Alpha_1).fit(
    X_1_Transformed_Train_Scaled,
    Y_Train_1_PF,
)

# R^2 on the training rows
lr_1_Transformed_Lasso.score(X_1_Transformed_Train_Scaled, Y_Train_1_PF)

In [None]:
# This code block scores the Lasso-regularized model on the test data
# NOTE(review): the alpha search used these same test rows, so this R^2 is not an unbiased estimate

lr_1_Transformed_Lasso.score(X_1_Transformed_Test_Scaled, Y_Test_1_PF)

In [None]:
# This section of code blocks performs regression analysis on only the financial data
# Columns indicating (by one hot encoding) which industry or sector a company is in are excluded

In [None]:
# This code block creates a second dataframe with only the financial data and S&P weight

# Same rows as Regression_DF_1, without the one-hot encoded industry columns
Regression_Columns_2 = [
    'Weight',
    'Total Revenue',
    'Gross Profit',
    'Operating Income',
    'Pretax Income',
    'Net Income Common Stockholders',
    'Percent Increase',
]

Regression_DF_2 = Regression_DF_1.filter(items=Regression_Columns_2).copy()

#display(Regression_DF_2)

In [None]:
# Second Linear Regression without category variables

# Target: the 'Percent Increase' column
Y_2 = Regression_DF_2['Percent Increase']

# Features: every column from 'Weight' through 'Net Income Common Stockholders' (label slice, both ends included)
X_2 = Regression_DF_2.loc[:, 'Weight':'Net Income Common Stockholders']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_Train_2, X_Test_2, Y_Train_2, Y_Test_2 = train_test_split(
    X_2,
    Y_2,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Second Linear Regression without category variables, even worse when category data is removed

# fit() returns the estimator itself, so the model can be created and trained in one statement
lr_2 = LinearRegression().fit(X_Train_2, Y_Train_2)

# R^2 on the training rows
lr_2.score(X_Train_2, Y_Train_2)

In [None]:
# Second linear regression scored on its held-out test rows (R^2)
# The author observed an even worse score once the category data is removed

lr_2.score(X_Test_2, Y_Test_2)

In [None]:
# To view the coefficient array for this regression, de-commentify the line of code below

#lr_2.coef_

In [None]:
# To view the Y intercept array for this regression, de-commentify the line of code below

#lr_2.intercept_

In [None]:
# This code block applies polynomial features to the second dataframe/feature set

pf_2 = PolynomialFeatures(degree=2)

# fit_transform learns the feature combinations from X_2 and expands X_2 in one call
X_2_Transformed = pf_2.fit_transform(X_2)

In [None]:
# This code block performs a test train split

# 80/20 split of the expanded features; random_state=4, so the rows differ from the linear model's split
(
    X_2_Transformed_Train,
    X_2_Transformed_Test,
    Y_Train_2_PF,
    Y_Test_2_PF,
) = train_test_split(X_2_Transformed, Y_2, test_size=0.2, random_state=4)

In [None]:
# This code block fits and scores a regression on the second polynomial feature set train data

# Ordinary least squares on the polynomial features (no regularization yet)
lr_2_Transformed = LinearRegression().fit(X_2_Transformed_Train, Y_Train_2_PF)

# R^2 on the training rows
lr_2_Transformed.score(X_2_Transformed_Train, Y_Train_2_PF)

In [None]:
# This code block scores the regression on the test data (prior to regularization)
# The value shown is the R^2 of the unregularized degree-2 polynomial model on its held-out test rows

lr_2_Transformed.score(X_2_Transformed_Test, Y_Test_2_PF)

In [None]:
# This section of code blocks applies regularization

In [None]:
# This code block scales the features using the mean and standard deviation of the training
# rows only, so the training columns end up with mean 0 and standard deviation 1 and no
# information about the test rows leaks into the fitted models

std = StandardScaler()
std.fit(X_2_Transformed_Train)
X_2_Transformed_Train_Scaled = std.transform(X_2_Transformed_Train)
X_2_Transformed_Test_Scaled = std.transform(X_2_Transformed_Test)

In [None]:
# Ridge Regularization

In [None]:
# This code block finds a good value for the Alpha parameter for Ridge regularization
# To view the progress of optimizing alpha values, de-commentify the print statements

# NOTE(review): alpha is tuned against the test split, so the test R^2 reported for the final
# model is optimistically biased; RidgeCV or a separate validation split would avoid this

Max_Doublings = 60   # hard stop for the coarse search in case the test score never drops

Alpha = 1
Prior_Alpha = 0
Best_Alpha_Passed = False
Iterations = 0

Current_Test_Score = 0
Prior_Test_Score = 0

#print("Alpha   |      Test Score")
#print("________|________________")

# Coarse search: try alpha = 1, 2, 4, ... until the test score falls below the previous one
while(((Iterations < 2) or (Best_Alpha_Passed == False)) and (Iterations < Max_Doublings)):

    if(Iterations > 0):
        Prior_Test_Score = Current_Test_Score
        Prior_Alpha = Alpha
        Alpha = (2 * Alpha)

    lr_2_Transformed_Ridge = Ridge(alpha = Alpha)
    lr_2_Transformed_Ridge.fit(X_2_Transformed_Train_Scaled, Y_Train_2_PF)
    Train_Score = lr_2_Transformed_Ridge.score(X_2_Transformed_Train_Scaled, Y_Train_2_PF)

    Current_Test_Score = lr_2_Transformed_Ridge.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

    #print(Alpha, ":         ", Current_Test_Score)

    if(Iterations > 0):
        if(Current_Test_Score < Prior_Test_Score):
            Best_Alpha_Passed = True

    Iterations += 1

# Fine search: a window [Low_Alpha, High_Alpha] centred on Mid_Alpha is moved towards the
# better-scoring edge and halved whenever the centre scores at least as well as that edge
Low_Alpha = max(0, ((Prior_Alpha/2) - 1))
High_Alpha = Alpha
Mid_Alpha = ((Low_Alpha + High_Alpha)/2)
Increment = (High_Alpha - Mid_Alpha)

while(Increment > 0.01):

    lr_2_Transformed_Ridge = Ridge(alpha = Low_Alpha)
    lr_2_Transformed_Ridge.fit(X_2_Transformed_Train_Scaled, Y_Train_2_PF)
    Low_Alpha_Test_Score = lr_2_Transformed_Ridge.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

    lr_2_Transformed_Ridge = Ridge(alpha = Mid_Alpha)
    lr_2_Transformed_Ridge.fit(X_2_Transformed_Train_Scaled, Y_Train_2_PF)
    Mid_Alpha_Test_Score = lr_2_Transformed_Ridge.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

    #print(Mid_Alpha, ":         ", Mid_Alpha_Test_Score)

    lr_2_Transformed_Ridge = Ridge(alpha = High_Alpha)
    lr_2_Transformed_Ridge.fit(X_2_Transformed_Train_Scaled, Y_Train_2_PF)
    High_Alpha_Test_Score = lr_2_Transformed_Ridge.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

    if(High_Alpha_Test_Score > Low_Alpha_Test_Score):
        if(High_Alpha_Test_Score <= Mid_Alpha_Test_Score):
            Increment = (Increment/2)
        Mid_Alpha += Increment
    elif(High_Alpha_Test_Score < Low_Alpha_Test_Score):
        if(Low_Alpha_Test_Score <= Mid_Alpha_Test_Score):
            Increment = (Increment/2)
        Mid_Alpha -= Increment
    else:
        Increment = (Increment/2)

    # Keep the window at or above 0, matching the initial max(0, ...) clamp; a negative
    # alpha is rejected by the estimator
    Mid_Alpha = max(0, Mid_Alpha)
    Low_Alpha = max(0, (Mid_Alpha - Increment))
    High_Alpha = (Mid_Alpha + Increment)

Ridge_Alpha_2 = Mid_Alpha
#print("________________________________")
#print("Final Ridge Alpha:", Ridge_Alpha_2)

In [None]:
# This code block uses the Alpha parameter just found to apply Ridge regularization

# Refit with the selected alpha (fit() returns the estimator itself)
lr_2_Transformed_Ridge = Ridge(alpha=Ridge_Alpha_2).fit(
    X_2_Transformed_Train_Scaled,
    Y_Train_2_PF,
)

# R^2 on the training rows
lr_2_Transformed_Ridge.score(X_2_Transformed_Train_Scaled, Y_Train_2_PF)

In [None]:
# This code block scores the Ridge-regularized model on the test data
# NOTE(review): the alpha search used these same test rows, so this R^2 is not an unbiased estimate

lr_2_Transformed_Ridge.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

In [None]:
# Lasso Regularization

In [None]:
# This code block finds a good value for the Alpha parameter for Lasso regularization
# To view the progress of optimizing alpha values, de-commentify the print statements

# NOTE(review): alpha is tuned against the test split, so the test R^2 reported for the final
# model is optimistically biased; LassoCV or a separate validation split would avoid this

Max_Doublings = 60   # hard stop for the coarse search in case the test score never drops

Alpha = 1
Prior_Alpha = 0
Best_Alpha_Passed = False
Iterations = 0

Current_Test_Score = 0
Prior_Test_Score = 0

#print("Alpha   |      Test Score")
#print("________|________________")

# Coarse search: try alpha = 1, 2, 4, ... until the test score falls below the previous one
while(((Iterations < 2) or (Best_Alpha_Passed == False)) and (Iterations < Max_Doublings)):

    if(Iterations > 0):
        Prior_Test_Score = Current_Test_Score
        Prior_Alpha = Alpha
        Alpha = (2 * Alpha)

    lr_2_Transformed_Lasso = Lasso(alpha = Alpha)
    lr_2_Transformed_Lasso.fit(X_2_Transformed_Train_Scaled, Y_Train_2_PF)
    Train_Score = lr_2_Transformed_Lasso.score(X_2_Transformed_Train_Scaled, Y_Train_2_PF)

    Current_Test_Score = lr_2_Transformed_Lasso.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

    #print(Alpha, ":         ", Current_Test_Score)

    if(Iterations > 0):
        if(Current_Test_Score < Prior_Test_Score):
            Best_Alpha_Passed = True

    Iterations += 1

# Fine search: a window [Low_Alpha, High_Alpha] centred on Mid_Alpha is moved towards the
# better-scoring edge and halved whenever the centre scores at least as well as that edge
Low_Alpha = max(0, ((Prior_Alpha/2) - 1))
High_Alpha = Alpha
Mid_Alpha = ((Low_Alpha + High_Alpha)/2)
Increment = (High_Alpha - Mid_Alpha)

while(Increment > 0.01):

    lr_2_Transformed_Lasso = Lasso(alpha = Low_Alpha)
    lr_2_Transformed_Lasso.fit(X_2_Transformed_Train_Scaled, Y_Train_2_PF)
    Low_Alpha_Test_Score = lr_2_Transformed_Lasso.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

    lr_2_Transformed_Lasso = Lasso(alpha = Mid_Alpha)
    lr_2_Transformed_Lasso.fit(X_2_Transformed_Train_Scaled, Y_Train_2_PF)
    Mid_Alpha_Test_Score = lr_2_Transformed_Lasso.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

    #print(Mid_Alpha, ":         ", Mid_Alpha_Test_Score)

    lr_2_Transformed_Lasso = Lasso(alpha = High_Alpha)
    lr_2_Transformed_Lasso.fit(X_2_Transformed_Train_Scaled, Y_Train_2_PF)
    High_Alpha_Test_Score = lr_2_Transformed_Lasso.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

    if(High_Alpha_Test_Score > Low_Alpha_Test_Score):
        if(High_Alpha_Test_Score <= Mid_Alpha_Test_Score):
            Increment = (Increment/2)
        Mid_Alpha += Increment
    elif(High_Alpha_Test_Score < Low_Alpha_Test_Score):
        if(Low_Alpha_Test_Score <= Mid_Alpha_Test_Score):
            Increment = (Increment/2)
        Mid_Alpha -= Increment
    else:
        Increment = (Increment/2)

    # Keep the window at or above 0, matching the initial max(0, ...) clamp; a negative
    # alpha is rejected by the estimator
    Mid_Alpha = max(0, Mid_Alpha)
    Low_Alpha = max(0, (Mid_Alpha - Increment))
    High_Alpha = (Mid_Alpha + Increment)

Lasso_Alpha_2 = Mid_Alpha
#print("________________________________")
#print("Final Lasso Alpha:", Lasso_Alpha_2)

In [None]:
# This code block uses the Alpha parameter just found to apply Lasso regularization

# Refit the Lasso model with the selected alpha (fit() returns the estimator itself)
lr_2_Transformed_Lasso = Lasso(alpha=Lasso_Alpha_2).fit(
    X_2_Transformed_Train_Scaled,
    Y_Train_2_PF,
)

# R^2 on the training rows
lr_2_Transformed_Lasso.score(X_2_Transformed_Train_Scaled, Y_Train_2_PF)

In [None]:
# This code block scores the Lasso-regularized model on the test data
# NOTE(review): the alpha search used these same test rows, so this R^2 is not an unbiased estimate

lr_2_Transformed_Lasso.score(X_2_Transformed_Test_Scaled, Y_Test_2_PF)

In [None]:
# End of Notebook