In [1]:
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.read_csv("rental_data.csv")

In [3]:
data

Unnamed: 0,ID,Date,y,Hour,Temperature(�C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(�C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5755,5755,20/11/2018,1317,19,8.7,34,2.2,1857,-6.3,0.0,0.0,0.0,Autumn,No Holiday,Yes
5756,5756,20/11/2018,1078,20,8.5,32,2.6,1789,-7.2,0.0,0.0,0.0,Autumn,No Holiday,Yes
5757,5757,20/11/2018,1020,21,8.8,37,3.8,1750,-5.1,0.0,0.0,0.0,Autumn,No Holiday,Yes
5758,5758,20/11/2018,922,22,8.9,40,3.4,1810,-3.9,0.0,0.0,0.0,Autumn,No Holiday,Yes


In [4]:
le = LabelEncoder()
cat_features = data.select_dtypes(include=['object']).columns
data[cat_features] = data[cat_features].apply(lambda col: le.fit_transform(col))

In [5]:
y = data["y"]
X = data.drop(["ID", "Date", "y"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
def get_feature_combinations(features):
    features = set(features)
    subsets = [[]]
    for elem in features:
        subsets += [subset + [elem] for subset in subsets]
    return subsets[1:]

# Example usage:
my_set = {1, 2, 3}
print(get_feature_combinations(my_set))

[[1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3]]


In [7]:
get_feature_combinations(X_train.columns)

[['Rainfall(mm)'],
 ['Wind speed (m/s)'],
 ['Rainfall(mm)', 'Wind speed (m/s)'],
 ['Visibility (10m)'],
 ['Rainfall(mm)', 'Visibility (10m)'],
 ['Wind speed (m/s)', 'Visibility (10m)'],
 ['Rainfall(mm)', 'Wind speed (m/s)', 'Visibility (10m)'],
 ['Holiday'],
 ['Rainfall(mm)', 'Holiday'],
 ['Wind speed (m/s)', 'Holiday'],
 ['Rainfall(mm)', 'Wind speed (m/s)', 'Holiday'],
 ['Visibility (10m)', 'Holiday'],
 ['Rainfall(mm)', 'Visibility (10m)', 'Holiday'],
 ['Wind speed (m/s)', 'Visibility (10m)', 'Holiday'],
 ['Rainfall(mm)', 'Wind speed (m/s)', 'Visibility (10m)', 'Holiday'],
 ['Dew point temperature(�C)'],
 ['Rainfall(mm)', 'Dew point temperature(�C)'],
 ['Wind speed (m/s)', 'Dew point temperature(�C)'],
 ['Rainfall(mm)', 'Wind speed (m/s)', 'Dew point temperature(�C)'],
 ['Visibility (10m)', 'Dew point temperature(�C)'],
 ['Rainfall(mm)', 'Visibility (10m)', 'Dew point temperature(�C)'],
 ['Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(�C)'],
 ['Rainfall(mm)',
  'Wind s

In [8]:
len(get_feature_combinations(X_train.columns))

4095

In [9]:
def feature_grid_search(X_train, y_train, X_test, y_test):
    model = GradientBoostingRegressor()
    features = list(X_train.columns)
    best_score = 0
    best_features = []
    feature_combinations = get_feature_combinations(features)
    count = len(feature_combinations)
    counter = 0
    for combination in feature_combinations:
        model.fit(X_train[combination], y_train)
        counter += 1
        print("Testing combination", counter, "out of", count, "combination.")
        predictions = model.predict(X_test[combination])
        score = r2_score(y_test, predictions)
        if score > best_score:
            best_score = score
            best_features = combination
    return best_features

In [10]:
best_features = feature_grid_search(X_train, y_train, X_test, y_test)

Testing combination 1 out of 4095 combination.
Testing combination 2 out of 4095 combination.
Testing combination 3 out of 4095 combination.
Testing combination 4 out of 4095 combination.
Testing combination 5 out of 4095 combination.
Testing combination 6 out of 4095 combination.
Testing combination 7 out of 4095 combination.
Testing combination 8 out of 4095 combination.
Testing combination 9 out of 4095 combination.
Testing combination 10 out of 4095 combination.
Testing combination 11 out of 4095 combination.
Testing combination 12 out of 4095 combination.
Testing combination 13 out of 4095 combination.
Testing combination 14 out of 4095 combination.
Testing combination 15 out of 4095 combination.
Testing combination 16 out of 4095 combination.
Testing combination 17 out of 4095 combination.
Testing combination 18 out of 4095 combination.
Testing combination 19 out of 4095 combination.
Testing combination 20 out of 4095 combination.
Testing combination 21 out of 4095 combination.
T

In [11]:
best_features

['Rainfall(mm)',
 'Holiday',
 'Dew point temperature(�C)',
 'Functioning Day',
 'Snowfall (cm)',
 'Hour',
 'Seasons',
 'Solar Radiation (MJ/m2)',
 'Temperature(�C)']

## Another Code More Simplified:

In [6]:
def combinations(features):
    features = set(features)
    subsets = [[]]
    for elem in features:
        subsets += [subset + [elem] for subset in subsets]
    return subsets[1:]
combinations(X_train.columns)

[['Holiday'],
 ['Snowfall (cm)'],
 ['Holiday', 'Snowfall (cm)'],
 ['Humidity(%)'],
 ['Holiday', 'Humidity(%)'],
 ['Snowfall (cm)', 'Humidity(%)'],
 ['Holiday', 'Snowfall (cm)', 'Humidity(%)'],
 ['Temperature(�C)'],
 ['Holiday', 'Temperature(�C)'],
 ['Snowfall (cm)', 'Temperature(�C)'],
 ['Holiday', 'Snowfall (cm)', 'Temperature(�C)'],
 ['Humidity(%)', 'Temperature(�C)'],
 ['Holiday', 'Humidity(%)', 'Temperature(�C)'],
 ['Snowfall (cm)', 'Humidity(%)', 'Temperature(�C)'],
 ['Holiday', 'Snowfall (cm)', 'Humidity(%)', 'Temperature(�C)'],
 ['Solar Radiation (MJ/m2)'],
 ['Holiday', 'Solar Radiation (MJ/m2)'],
 ['Snowfall (cm)', 'Solar Radiation (MJ/m2)'],
 ['Holiday', 'Snowfall (cm)', 'Solar Radiation (MJ/m2)'],
 ['Humidity(%)', 'Solar Radiation (MJ/m2)'],
 ['Holiday', 'Humidity(%)', 'Solar Radiation (MJ/m2)'],
 ['Snowfall (cm)', 'Humidity(%)', 'Solar Radiation (MJ/m2)'],
 ['Holiday', 'Snowfall (cm)', 'Humidity(%)', 'Solar Radiation (MJ/m2)'],
 ['Temperature(�C)', 'Solar Radiation (MJ/m2)']

In [7]:
print(len(combinations(X_train.columns)))

4095


In [10]:

def feature_grid_search(X_train, y_train, X_test, y_test):
    model = GradientBoostingRegressor()
    best_score = 0
    best_features = []
    
    # Get all possible subsets of features -->> combinations(X_train.columns)
    
    for combo in combinations(X_train.columns):
        model.fit(X_train[list(combo)], y_train)
        predictions = model.predict(X_test[list(combo)])
        score = r2_score(y_test, predictions)
        
        if score > best_score:
            best_score = score
            best_features = combo
    
    return best_features, best_score


best_features=feature_grid_search(X_train, y_train, X_test, y_test)
print(best_features)



(['Holiday', 'Snowfall (cm)', 'Temperature(�C)', 'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Functioning Day', 'Dew point temperature(�C)', 'Hour', 'Seasons'], 0.8526179190509455)


## Assighment :

## Implement K-fold cross validation function :

In [12]:
import random
import numpy as np

def k_fold_cross_validation(data, k=5, seed=42):
    random.seed(seed)
    shuffled_data = data.copy()
    random.shuffle(shuffled_data)
    
    fold_size = len(shuffled_data) // k
    
    scores = []
    
    for i in range(k):
        test_start = i * fold_size
        test_end = (i + 1) * fold_size
        
        test_set = shuffled_data[test_start:test_end]
        
        train_set = [x for x in shuffled_data if x not in test_set]
        
        print(f"Fold {i+1}:")
        
        # Simulating predictions
        predictions = [random.randint(0, 100) for _ in test_set]
        
        # Calculate score (in this case, we're just using mean absolute error)
        mae = sum(abs(x - y) for x, y in zip(test_set, predictions)) / len(test_set)
        scores.append(mae)
    
    return np.mean(scores), np.std(scores)

# Example usage
data = [random.randint(0, 100) for _ in range(100)]  # Generating sample data
mean_score, std_score = k_fold_cross_validation(data)
print("\nK-fold Cross Validation Results:")
print(f"Mean Score: {mean_score}")
print(f"Standard Deviation: {std_score}")


Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:

K-fold Cross Validation Results:
Mean Score: 32.46
Standard Deviation: 4.520110618115446


## AlReady Built in Sciket learn :

In [15]:
import numpy as np
from sklearn.model_selection import KFold



def k_fold_cross_validation(data, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = []

    for fold, (train_index, val_index) in enumerate(kf.split(data), 1):
        X_train, X_val = data[train_index], data[val_index]
        y_train, y_val = np.zeros(len(X_train)), np.zeros(len(X_val))

        print(f"Fold {fold}:")

        # Simulating predictions
        X_pred = np.random.rand(len(X_val))
        
        # Calculate score (in this case, we're just using mean squared error)
        mse = ((X_pred - y_val) ** 2).mean()
        scores.append(mse)

    return np.mean(scores), np.std(scores)


# Example usage
if __name__ == "__main__":
    # Mock dataset (simulating features)
    data = np.random.rand(100)

    mean_score, std_score = k_fold_cross_validation(data)
    
    print(f"\nK-fold Cross Validation Results:")
    print(f"Mean Score: {mean_score}")
    print(f"Standard Deviation: {std_score}")

Fold 1:
Fold 2:
Fold 3:
Fold 4:
Fold 5:

K-fold Cross Validation Results:
Mean Score: 0.3418161001354485
Standard Deviation: 0.04875851650844805
