In [415]:
import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as stats_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [416]:
# Load the startups data. Every column except the last is a feature; the
# last column is the value we want to predict. Both slices are expressed
# relative to the end of the frame so they cannot drift apart if the
# number of columns changes.
dataset = pd.read_csv('datasets/startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [417]:
# The feature column at index 3 is categorical, so it is replaced by one
# 0/1 indicator column per category (one-hot encoding). The indicator
# columns are placed first, in sorted category order, followed by the
# remaining columns in their original order, and the result is a float
# array.
# pandas.get_dummies is used instead of
# OneHotEncoder(categorical_features=[3]) because that argument was removed
# from scikit-learn, which makes the old call fail on current releases.
# NOTE(review): every indicator column is kept here, so together they are
# collinear with an intercept (the Dummy Variable Trap). One of them is
# dropped further down, before the statsmodels fit.
state_dummies = pd.get_dummies(X[:, 3]).values.astype(float)
numeric_features = np.delete(X, 3, axis=1).astype(float)
X = np.hstack((state_dummies, numeric_features))

In [418]:
# Hold out 20% of the rows as a test set. random_state is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [419]:
# The indicator columns are 0/1 while the other features can be very
# large, so the features are standardised to put them on a comparable
# scale. The scaler is fitted on the training split only and the same
# fitted transform is then applied to both splits.
# NOTE(review): ordinary least-squares predictions should not depend on
# feature scale, so this step is presumably optional here - confirm.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [420]:
# Fit a linear regression to the training split (fit returns the fitted
# estimator, so construction and fitting can be chained).
regressor = LinearRegression().fit(X_train, y_train)

# Predict the dependent variable for the held-out test split.
y_prediction = regressor.predict(X_test)

In [421]:
# The previous approach uses all features of the dataset, but what if not
# all features have a significant effect on the prediction? We can use
# backward elimination (among other methods) to find which features have a
# significant effect.

In [422]:
# In order to use statsmodels to find the significance (the p-value) of
# each feature, the feature set needs a column x0 that is always 1, so that
# the bias b0 appears in the linear model
# (y = b0*x0 + b1*x1 + ... + bn*xn). b0 is the intercept.
# First we take care of the Dummy Variable Trap by dropping the first
# indicator column.
# NOTE: this overwrites X, so running this cell a second time drops one
# more column; re-run from the encoding cell instead.
X = X[:, 1:]
# One row of ones per sample; the row count is read from X rather than
# hard-coded so the cell works for any dataset size.
X_appended = np.append(
    arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1
)

In [423]:
# Backward elimination. X_optimal starts as a copy of every column of
# X_appended (the column of ones included); on each pass the single column
# with the highest p-value is removed, until no remaining column has a
# p-value above the significance threshold.
X_optimal = X_appended.copy()

signifigance = 0.05
# columns[i] is the original column index (as a string) of X_optimal[:, i].
# The two are always edited at the same single position so they stay in
# step, even when several p-values are tied for the maximum.
columns = [str(index) for index in range(X_optimal.shape[1])]

# Stop if every column has been eliminated rather than fitting a model on
# an empty matrix; in that case `regressor` is the last fit made before the
# final column was removed.
while X_optimal.shape[1] > 0:
    # Ordinary Least Squares on the current candidate columns.
    # statsmodels.api is used because the OLS class is not exported by
    # statsmodels.formula.api in recent statsmodels releases.
    regressor = sm.OLS(endog=y, exog=X_optimal).fit()
    pvalues = np.asarray(regressor.pvalues)
    # argmax yields exactly one position (the first one on ties).
    highest_pvalue_index = int(np.argmax(pvalues))
    highest_pvalue = pvalues[highest_pvalue_index]
    if not highest_pvalue > signifigance:
        break
    columns.pop(highest_pvalue_index)
    X_optimal = np.delete(X_optimal, highest_pvalue_index, axis=1)

print('Columns with statistical signifigance: {}'.format(columns))
print('Values with statistical signifigance: {}'.format(X_optimal))

Columns with statistical signifigance: ['0', '3']
Values with statistical signifigance: [[  1.00000000e+00   1.65349200e+05]
 [  1.00000000e+00   1.62597700e+05]
 [  1.00000000e+00   1.53441510e+05]
 [  1.00000000e+00   1.44372410e+05]
 [  1.00000000e+00   1.42107340e+05]
 [  1.00000000e+00   1.31876900e+05]
 [  1.00000000e+00   1.34615460e+05]
 [  1.00000000e+00   1.30298130e+05]
 [  1.00000000e+00   1.20542520e+05]
 [  1.00000000e+00   1.23334880e+05]
 [  1.00000000e+00   1.01913080e+05]
 [  1.00000000e+00   1.00671960e+05]
 [  1.00000000e+00   9.38637500e+04]
 [  1.00000000e+00   9.19923900e+04]
 [  1.00000000e+00   1.19943240e+05]
 [  1.00000000e+00   1.14523610e+05]
 [  1.00000000e+00   7.80131100e+04]
 [  1.00000000e+00   9.46571600e+04]
 [  1.00000000e+00   9.17491600e+04]
 [  1.00000000e+00   8.64197000e+04]
 [  1.00000000e+00   7.62538600e+04]
 [  1.00000000e+00   7.83894700e+04]
 [  1.00000000e+00   7.39945600e+04]
 [  1.00000000e+00   6.75325300e+04]
 [  1.00000000e+00   7.7