In [1]:
import pandas as pd

In [2]:
# Relative path to the published wait-time workbook.
data_path = "../DataOnly/WaitData.Published.xlsx"
# Read only the worksheet named 'F3' into a DataFrame.
data = pd.read_excel(
    data_path,
    sheet_name='F3',
)

In [3]:
# Discard every column whose name begins with 'x_'; what is left is the modelling table.
data_cleaned = data.drop(
    columns=[name for name in data.columns if name.startswith('x_')]
)

# 'Wait' is the regression target; all remaining columns are used as predictors.
y = data_cleaned['Wait']
X = data_cleaned.drop(columns='Wait')

## Task 5

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Fit a linear regression on every predictor; fit() returns the estimator itself.
model = LinearRegression().fit(X, y)

# Predictions are made on the same rows used for fitting, so the reported
# mean absolute error is an in-sample figure.
y_pred = model.predict(X)
error = mean_absolute_error(y, y_pred)
residuals = y - y_pred

print(error)

23.166288936348092


## Task 6

In [9]:
from sklearn.feature_selection import RFE

# Recursive feature elimination: for each target subset size (1 to 3), let RFE
# choose the columns, refit the regression on just those columns, and report the
# mean absolute error measured on the same data used for fitting.
for n_features in range(1, 4):
    rfe = RFE(model, n_features_to_select=n_features)
    X_rfe = rfe.fit_transform(X, y)
    model.fit(X_rfe, y)

    # X_rfe already contains exactly the selected columns, so predict on it
    # directly instead of transforming X a second time.
    y_pred_rfe = model.predict(X_rfe)
    # rfe.support_ is a boolean mask over the input columns. Indexing the column
    # Index with it yields the selected names without depending on
    # itertools.compress, which this notebook never imports.
    cols = list(X.columns[rfe.support_])
    error_rfe = mean_absolute_error(y, y_pred_rfe)

    print(f"Error with {n_features} features: {error_rfe}")
    print(*cols)

Error with 1 features: 32.50201098957221
FlowCount2
Error with 2 features: 31.776732528223665
LineCount2 FlowCount2
Error with 3 features: 31.663220627221026
LineCount2 FlowCount2 NumCustomersInLastW1


## Task 7

In [35]:
# Greedy forward selection of up to 3 features: in each round, try adding every
# unused column to the current subset and keep the one with the lowest
# in-sample mean absolute error.
selected_features = []
remaining_features = list(X.columns)
best_error = float('inf')

for _ in range(3):
    best_feature = None
    for feature in remaining_features:
        current_features = selected_features + [feature]
        X_current = X[current_features]
        model.fit(X_current, y)
        y_pred_greedy = model.predict(X_current)
        error_greedy = mean_absolute_error(y, y_pred_greedy)

        # best_error carries over between rounds, so a candidate is accepted
        # only if it also beats the subset chosen in the previous round.
        if error_greedy < best_error:
            best_error = error_greedy
            best_feature = feature

    # Compare against None explicitly: a falsy column label (e.g. 0 or '')
    # is still a valid selection and must not be skipped.
    if best_feature is None:
        # No candidate improved the error. Later rounds would evaluate the
        # same candidates against the same threshold, so stop here.
        break
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)

print(best_error)
print(selected_features)


28.696490632253543
['DelayedInLine', 'LineCount0Strict', 'LineCount0']


## Task 8

In [36]:
# Greedy forward selection for subset sizes 1 to 15, recording the best in-sample
# mean absolute error and the chosen columns after each successful round.
selected_features = []
remaining_features = list(X.columns)
best_error = float('inf')
best_errors = {}

for num_features in range(1, 16):
    best_feature = None
    for feature in remaining_features:
        current_features = selected_features + [feature]
        X_current = X[current_features]
        model.fit(X_current, y)
        y_pred_greedy = model.predict(X_current)
        error_greedy = mean_absolute_error(y, y_pred_greedy)

        # best_error carries over between rounds, so a candidate is accepted
        # only if it also beats the subset chosen in the previous round.
        if error_greedy < best_error:
            best_error = error_greedy
            best_feature = feature

    # Compare against None explicitly so a falsy column label (e.g. 0 or '')
    # is not mistaken for "nothing selected".
    if best_feature is None:
        # No remaining column lowers the error (or none are left). Stop rather
        # than record an entry under a size the subset never reached, which
        # would make best_errors[k] describe fewer than k features.
        break
    selected_features.append(best_feature)
    remaining_features.remove(best_feature)

    # At this point len(selected_features) == num_features.
    best_errors[num_features] = {
        'best_error': best_error,
        # Copy the list so later rounds do not mutate this snapshot.
        'selected_features': list(selected_features)
    }


In [34]:
# Keep only the subset sizes whose recorded mean absolute error is below 24.
filtered_best_errors = {
    size: entry
    for size, entry in best_errors.items()
    if entry['best_error'] < 24
}
# Display the surviving entries in increasing order of subset size.
sorted(filtered_best_errors.items())

[(10,
  {'best_error': 23.846509492065024,
   'selected_features': ['DelayedInLine',
    'LineCount0Strict',
    'LineCount0',
    'SumDelayWaitingInLine',
    'AheadCount',
    'NumCompletedToday',
    'NumScheduledNextW2',
    'DelayCount',
    'InProgressSize',
    'StartTime4']}),
 (11,
  {'best_error': 23.69261939854694,
   'selected_features': ['DelayedInLine',
    'LineCount0Strict',
    'LineCount0',
    'SumDelayWaitingInLine',
    'AheadCount',
    'NumCompletedToday',
    'NumScheduledNextW2',
    'DelayCount',
    'InProgressSize',
    'StartTime4',
    'AvgDelayForDay']}),
 (12,
  {'best_error': 23.627914296582397,
   'selected_features': ['DelayedInLine',
    'LineCount0Strict',
    'LineCount0',
    'SumDelayWaitingInLine',
    'AheadCount',
    'NumCompletedToday',
    'NumScheduledNextW2',
    'DelayCount',
    'InProgressSize',
    'StartTime4',
    'AvgDelayForDay',
    'AvgWaitLastK1Customers']}),
 (13,
  {'best_error': 23.57915967047964,
   'selected_features': ['D

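**Answer:** 10 — the smallest number of greedily selected features for which the mean absolute error drops below 24.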

## Task 9

In [37]:
from itertools import combinations
from tqdm.notebook import tqdm

# Exhaustive search over every 3-column subset of X, keeping the subset with the
# lowest in-sample mean absolute error.
best_error = float('inf')
best_combination = None

# The combinations are materialised into a list so tqdm can show a total.
for comb in tqdm(list(combinations(X.columns, 3))):
    X_comb = X[list(comb)]
    # fit() returns the estimator, so fitting and predicting can be chained.
    y_pred_brute = model.fit(X_comb, y).predict(X_comb)
    error_brute = mean_absolute_error(y, y_pred_brute)

    # Only a strictly lower error replaces the current best.
    if not error_brute < best_error:
        continue
    best_error, best_combination = error_brute, comb

print(best_error)
print(best_combination)


  0%|          | 0/98770 [00:00<?, ?it/s]

26.072745167091377
('AheadCount', 'SumDelayWaitingByExamCode', 'NumCompletedToday')
