In [1]:
import pandas as pd
import numpy as np

# People might need to pip install pydataset (I had to)
from pydataset import data

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression

Some information about the data we are gathering:

Daily readings of the following air quality values for May 1, 1973 (a Tuesday)
to September 30, 1973.

  * `Ozone`: Mean ozone in parts per billion from 1300 to 1500 hours at Roosevelt Island 

  * `Solar.R`: Solar radiation in Langleys in the frequency band 4000–7700 Angstroms from 0800 to 1200 hours at Central Park 

  * `Wind`: Average wind speed in miles per hour at 0700 and 1000 hours at LaGuardia Airport 

  * `Temp`: Maximum daily temperature in degrees Fahrenheit at La Guardia Airport. 

In [2]:
air_df = data("airquality")

In [3]:
air_df.describe()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
count,116.0,146.0,153.0,153.0,153.0,153.0
mean,42.12931,185.931507,9.957516,77.882353,6.993464,15.803922
std,32.987885,90.058422,3.523001,9.46527,1.416522,8.86452
min,1.0,7.0,1.7,56.0,5.0,1.0
25%,18.0,115.75,7.4,72.0,6.0,8.0
50%,31.5,205.0,9.7,79.0,7.0,16.0
75%,63.25,258.75,11.5,85.0,8.0,23.0
max,168.0,334.0,20.7,97.0,9.0,31.0


Hmm, it looks like quite a bit of data is missing for the Ozone, and some for the Solar.R, let's go ahead and replace the NaNs with the mean data for now: (This may not be the best, perhaps we can later think about replacing NaNs with the values that are close to them in Month/Day instead of just taking the average, plus because there are only 153 rows, the Ozone NaNs might show up in streaks)

In [4]:
air_df = air_df.fillna({"Ozone":air_df.Ozone.mean(), "Solar.R":air_df["Solar.R"].mean()})

In [5]:
air_df.describe()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
count,153.0,153.0,153.0,153.0,153.0,153.0
mean,42.12931,185.931507,9.957516,77.882353,6.993464,15.803922
std,28.693372,87.960267,3.523001,9.46527,1.416522,8.86452
min,1.0,7.0,1.7,56.0,5.0,1.0
25%,21.0,120.0,7.4,72.0,6.0,8.0
50%,42.12931,194.0,9.7,79.0,7.0,16.0
75%,46.0,256.0,11.5,85.0,8.0,23.0
max,168.0,334.0,20.7,97.0,9.0,31.0


In [6]:
def average_regression(seed, verbose=False):
    """
    Perform regression on several models, and return the average of the accuracies
    
    Parameters
    ----------
    seed : int
        The seed to use for train_test_split, for reproducability
    verbose : bool
        Whether or not to print things
        
    Returns
    -------
    avg_acc : float
        The average accuracy of all the models used
    """
    trainX, testX, trainY, testY = train_test_split(air_df.drop("Ozone", axis=1), air_df.Ozone, test_size=.2, train_size=.8, random_state=seed)

    keep = ["Solar.R", "Wind", "Temp"]
    lr = LinearRegression()
    lr.fit(trainX, trainY)
    lr_acc = lr.score(testX, testY)

    lr2 = LinearRegression()
    lr2.fit(trainX[keep], trainY)
    lr_acc2 = lr2.score(testX[keep], testY)

    abr = AdaBoostRegressor()
    abr.fit(trainX, trainY)
    abr_acc = abr.score(testX, testY)

    abr2 = AdaBoostRegressor()
    abr2.fit(trainX[keep], trainY)
    abr_acc2 = abr2.score(testX[keep], testY)
    
    if verbose:
        print("Performing regression using seed:", seed)
        print("Full LR R^2:",lr_acc)
        print("Reduced LR R^2", lr_acc2)
        # ABR uses other randomization, so the same seed can give different accs
        print("Full ABR R^2:", abr_acc)
        print("Reduced ABR R^2", abr_acc2)
    
    return np.mean([lr_acc, lr_acc2, abr_acc, abr_acc2])

In [7]:
N = 1000
seeds = set()
outputs = []
for seed in np.random.choice(10**5, size=N, replace=False):
    avg_acc = average_regression(seed)
    outputs.append((avg_acc, seed))

In [8]:
outputs = sorted(outputs)

In [9]:
# Show the 10 highest seeds and the average R^2 of them
outputs[-10:]

[(0.70017627203675958, 43289),
 (0.70095936157099314, 42382),
 (0.70301289236180131, 50947),
 (0.70382134545448882, 25083),
 (0.70622934137096705, 32385),
 (0.71041521161509036, 22261),
 (0.72412083728991394, 65884),
 (0.72932696106440043, 54264),
 (0.73572329930341573, 76296),
 (0.74622639214672659, 1822)]

In [12]:
average_regression(1822, True)

Performing regression using seed: 1822
Full LR R^2: 0.67125856537
Reduced LR R^2 0.653048669165
Full ABR R^2: 0.837476204751
Reduced ABR R^2 0.844480241302


0.75156592014678258