In [3]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV,ElasticNetCV
from sklearn.metrics import classification_report, f1_score

In [5]:
data = pd.read_csv("Energy_Efficiency_Overfit_Dataset_Updated.csv")

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Wall_Area                 200 non-null    float64
 1   Roof_Area                 200 non-null    float64
 2   Window_Area               200 non-null    float64
 3   Overall_Height            200 non-null    float64
 4   Outdoor_Temperature       200 non-null    float64
 5   Humidity                  200 non-null    float64
 6   Energy_Efficiency_Rating  200 non-null    float64
 7   Noise_Feature_1           200 non-null    float64
 8   Noise_Feature_2           200 non-null    float64
 9   Noise_Feature_3           200 non-null    float64
 10  Noise_Feature_4           200 non-null    float64
 11  Noise_Feature_5           200 non-null    float64
 12  Noise_Feature_6           200 non-null    float64
 13  Noise_Feature_7           200 non-null    float64
 14  Noise_Feat

In [7]:
#Calculate and print the median
rating_split = data['Energy_Efficiency_Rating'].median()

In [8]:
print("Threshold(Median):", rating_split)

Threshold(Median): 146.310167005691


In [10]:
data["Target"] =data['Energy_Efficiency_Rating'].apply(lambda x:1 if x>rating_split else 0)
data.drop(columns= ["Energy_Efficiency_Rating"],inplace= True)
data.head()

Unnamed: 0,Wall_Area,Roof_Area,Window_Area,Overall_Height,Outdoor_Temperature,Humidity,Noise_Feature_1,Noise_Feature_2,Noise_Feature_3,Noise_Feature_4,...,Noise_Feature_9,Noise_Feature_10,Orientation_East,Orientation_North,Orientation_South,Orientation_West,Glazing_Type_Type_A,Glazing_Type_Type_B,Glazing_Type_Type_C,Target
0,388.202617,188.924545,44.013461,3.449571,13.102177,51.125928,0.989088,0.904051,0.791454,0.339118,...,0.288525,0.518964,0,1,0,0,0,1,0,1
1,320.00786,192.818625,38.84103,5.417319,14.283884,53.690759,0.818101,0.03522,0.623867,0.277418,...,0.571153,0.052572,1,0,0,0,0,0,1,1
2,348.936899,232.989788,57.666632,4.055632,8.869296,48.129687,0.340605,0.180661,0.439745,0.96228,...,0.914694,0.682054,0,0,1,0,0,0,1,1
3,412.04466,219.657912,53.562928,5.238103,9.242672,50.771139,0.152047,0.338514,0.010586,0.352407,...,0.738639,0.268888,0,0,1,0,0,1,0,1
4,393.3779,219.203946,32.314615,3.594037,13.584071,79.116164,0.784059,0.577496,0.964928,0.894173,...,0.856666,0.106768,0,1,0,0,0,1,0,1


In [11]:
data.Target.value_counts()

Target
1    100
0    100
Name: count, dtype: int64

Splitting the Dataset into Train and Test Sets

In [12]:
X = data.drop('Target', axis =1)
y= data['Target']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size =0.3 , random_state=25)

In [15]:
# we would like to select random_state in which the difference between the mean of  y_train and y_test is minimal
#Experiment with the values of random_state
y_train.mean(), y_test.mean()

(np.float64(0.4928571428571429), np.float64(0.5166666666666667))

TRAINING and Evaluating the BASE Model (No Regularization)

In [18]:
logistic_no_reg = LogisticRegression(penalty=None, max_iter= 10000, n_jobs= -1)

logistic_no_reg.fit(X_train, y_train)

#keeping the value of max_iter as 10000, to avoid the convergence warning

Inspecting model coefficients

In [20]:
logistic_no_reg.coef_

array([[   365.05976322,     90.77115803,    374.88656729,
         -2514.35005559,   -262.92499461,    130.3277157 ,
         -4247.14303835,  -2280.2309136 ,   4214.27260119,
         -5983.33373246,  -1258.75612501,   3861.6609933 ,
         -9184.80542892,  12770.31347226,  -9833.89273445,
         -9087.35292017, -16181.05296823, -16424.60148426,
        -18359.55089936, -30323.82907772, -27231.99578965,
        -27006.96720901, -27050.07143047]])

Model Performance

In [22]:
y_train_pred_no_reg = logistic_no_reg.predict(X_train)
f1_no_reg = f1_score(y_train, y_train_pred_no_reg)
print("F1 score without regularization on train set:",f1_no_reg)

F1 score without regularization on train set: 1.0


In [24]:
# evaluate model wihtout regularization for test data
y_test_pred_no_reg = logistic_no_reg.predict(X_test)
f1_no_reg= f1_score(y_test,y_test_pred_no_reg)
print ("F1 score without Regularization on test set:",f1_no_reg)

F1 score without Regularization on test set: 0.8214285714285714


Apply Logistic Regression with L1 Regularization

In [25]:
# defining a range 
Cs = np.linspace(0.001,10,20)

In [26]:
logistic_l1_cv = LogisticRegressionCV(Cs=Cs, penalty ='l1',
                                      solver='liblinear',
                                      cv= 5,
                                      max_iter =10000,
                                      scoring= 'f1',
                                      n_jobs =-1)
logistic_l1_cv.fit(X_train, y_train)

In [27]:
# diff values of c

logistic_l1_cv.C_

array([2.63231579])

Model performance

In [28]:
y_train_pred_l1 = logistic_l1_cv.predict(X_train)
f1_l1 = f1_score(y_train ,y_train_pred_l1)
print("F1 score with L1 regularization on train set:", f1_l1)

F1 score with L1 regularization on train set: 0.9705882352941176


In [29]:
y_test_pred_l1 = logistic_l1_cv.predict(X_test)
f1_l1 = f1_score(y_test ,y_test_pred_l1)
print("F1 score with L1 regularization on test set:", f1_l1)

F1 score with L1 regularization on test set: 0.8813559322033898


In [None]:
# with l1 regularization overfitting has reduced .

In [30]:
coefficients_after_l1 = pd.DataFrame({
    'Feature' : X_train.columns,
    'After L1 regularization': logistic_l1_cv.coef_.flatten()
})
print(coefficients_after_l1.sort_values(by = 'After L1 regularization'))

                Feature  After L1 regularization
19     Orientation_West                -2.252550
7       Noise_Feature_2                -1.181863
14      Noise_Feature_9                -0.824575
6       Noise_Feature_1                -0.473168
3        Overall_Height                -0.394932
21  Glazing_Type_Type_B                -0.362118
20  Glazing_Type_Type_A                -0.181391
16     Orientation_East                -0.162036
4   Outdoor_Temperature                -0.074879
5              Humidity                -0.036870
17    Orientation_North                 0.000000
18    Orientation_South                 0.000000
15     Noise_Feature_10                 0.000000
11      Noise_Feature_6                 0.000000
9       Noise_Feature_4                 0.000000
12      Noise_Feature_7                 0.000000
22  Glazing_Type_Type_C                 0.000000
1             Roof_Area                 0.033674
2           Window_Area                 0.036375
0             Wall_A

In [31]:
# identify features with 0 from L1 model 
zero_mask = (logistic_l1_cv.coef_ ==0).flatten()
zero_features = X_train.columns[zero_mask]
zero_features.tolist()

['Noise_Feature_4',
 'Noise_Feature_6',
 'Noise_Feature_7',
 'Noise_Feature_10',
 'Orientation_North',
 'Orientation_South',
 'Glazing_Type_Type_C']

L2 regularization

In [32]:
Cs = np.linspace(0.001,10,20)

In [33]:
logistic_l2_cv = LogisticRegressionCV(Cs=Cs, penalty ='l2',
                                      solver='liblinear',
                                      cv= 5,
                                      max_iter =10000,
                                      n_jobs =-1)
logistic_l2_cv.fit(X_train, y_train)

In [34]:
# diff values of c

logistic_l2_cv.C_

array([5.78989474])

In [37]:
y_train_pred_l2 = logistic_l2_cv.predict(X_train)
f1_l2 = f1_score(y_train ,y_train_pred_l2)
print("F1 score with L1 regularization on train set:", f1_l2)

F1 score with L1 regularization on train set: 0.9154929577464789


In [36]:
y_test_pred_l2 = logistic_l2_cv.predict(X_test)
f1_l2 = f1_score(y_test ,y_test_pred_l2)
print("F1 score with L1 regularization on test set:", f1_l2)

F1 score with L1 regularization on test set: 0.819672131147541


In [39]:
coefficients_after_l2 = pd.DataFrame({
    'Feature' : X_train.columns,
    'After L2 regularization': logistic_l2_cv.coef_.flatten()
})
print(coefficients_after_l2.sort_values(by = 'After L2 regularization'))

                Feature  After L2 regularization
19     Orientation_West                -2.834166
21  Glazing_Type_Type_B                -2.542158
20  Glazing_Type_Type_A                -2.175287
7       Noise_Feature_2                -2.064920
14      Noise_Feature_9                -2.054501
22  Glazing_Type_Type_C                -1.375020
12      Noise_Feature_7                -1.199869
17    Orientation_North                -1.185290
18    Orientation_South                -1.070170
9       Noise_Feature_4                -1.007244
16     Orientation_East                -1.002839
3        Overall_Height                -0.667471
6       Noise_Feature_1                -0.595946
11      Noise_Feature_6                -0.234241
15     Noise_Feature_10                -0.200750
4   Outdoor_Temperature                -0.092878
5              Humidity                -0.060953
2           Window_Area                 0.010147
1             Roof_Area                 0.015993
0             Wall_A

In [40]:
zero_mask = (logistic_l2_cv.coef_ ==0).flatten()
zero_features = X_train.columns[zero_mask]
zero_features.tolist()

[]

                    ELASTIC NET


In [41]:
Cs= np.linspace(0.001, 10,20)

In [46]:
logistic_en_cv.Cs_

array([1.00000000e-03, 5.27263158e-01, 1.05352632e+00, 1.57978947e+00,
       2.10605263e+00, 2.63231579e+00, 3.15857895e+00, 3.68484211e+00,
       4.21110526e+00, 4.73736842e+00, 5.26363158e+00, 5.78989474e+00,
       6.31615789e+00, 6.84242105e+00, 7.36868421e+00, 7.89494737e+00,
       8.42121053e+00, 8.94747368e+00, 9.47373684e+00, 1.00000000e+01])

In [43]:
logistic_en_cv = LogisticRegressionCV(Cs=Cs, penalty ='elasticnet',
                                      solver='saga',
                                      l1_ratios=[0.0001,0.001,0.01,0.05,0.1,0.4,0.5,0.7,0.99,1],
                                      cv= 5,
                                      max_iter =1000000,
                                      n_jobs =-1)
logistic_en_cv.fit(X_train, y_train)

In [45]:
logistic_en_cv.l1_ratio_

# [0.0001] ratio suggested is very close to 0 which means it is drawing us to L2 penalty

array([0.0001])

In [47]:
logistic_en_cv.Cs_

array([1.00000000e-03, 5.27263158e-01, 1.05352632e+00, 1.57978947e+00,
       2.10605263e+00, 2.63231579e+00, 3.15857895e+00, 3.68484211e+00,
       4.21110526e+00, 4.73736842e+00, 5.26363158e+00, 5.78989474e+00,
       6.31615789e+00, 6.84242105e+00, 7.36868421e+00, 7.89494737e+00,
       8.42121053e+00, 8.94747368e+00, 9.47373684e+00, 1.00000000e+01])

In [48]:
logistic_en_cv.C_

array([0.001])

MODEL Performance

In [49]:
#Evaluate the model 

y_train_pred_elastic= logistic_en_cv.predict(X_train)
f1_elastic_train = f1_score(y_train, y_train_pred_elastic)
print("F1 score with Elastic net Regularization on Train set:",f1_elastic_train)

F1 score with Elastic net Regularization on Train set: 0.821917808219178


In [50]:
y_test_pred_elastic= logistic_en_cv.predict(X_test)
f1_elastic_test = f1_score(y_test, y_test_pred_elastic)
print("F1 score with Elastic net Regularization on Train set:",f1_elastic_test)

F1 score with Elastic net Regularization on Train set: 0.7540983606557377


Elastic net did not perform as good as L1 or L2 regularization
F1 score of train is .82 and F1 score of test is 0.75