### Members: Fareed Khan, Asad Rizvi
### MS DATA SCIENCE

Loading libraries

In [113]:
import pandas as pd
import numpy as np

Loading the dataset

In [167]:
# load train and test csv files in test train dataframes
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

printing the shape

In [168]:
print(train.shape, test.shape)

(25000, 11) (10040, 10)


Checking NAN values

In [169]:
# check nan values of all columns
print(train.isna().sum(), '\n', test.isna().sum())

date                                    4942
Usage_kWh                               5047
Lagging_Current_Reactive.Power_kVarh    5051
Leading_Current_Reactive_Power_kVarh    4998
CO2(tCO2)                               5017
Lagging_Current_Power_Factor            5042
Leading_Current_Power_Factor            5079
NSM                                     4952
WeekStatus                              5112
Day_of_week                             5006
Load_Type                                  0
dtype: int64 
 date                                    1991
Usage_kWh                               1991
Lagging_Current_Reactive.Power_kVarh    1984
Leading_Current_Reactive_Power_kVarh    1947
CO2(tCO2)                               1971
Lagging_Current_Power_Factor            1943
Leading_Current_Power_Factor            2070
NSM                                     2037
WeekStatus                              2032
Day_of_week                             1959
dtype: int64


Removing rows which are entirely NAN

In [170]:
# drop those rows contains nan values.
train_without_nan = train.dropna(axis=0, how='any') 
test_without_nan = test.dropna(axis=0, how='any') 

print(train_without_nan.shape, test_without_nan.shape)

(2686, 11) (1024, 10)


Merging data (Train/Test) to fill the missing values by mean/median/mode/backfill

In [171]:
# merge train and test data
merged_data = pd.concat([train, test], axis=0)
print(merged_data.shape)

(35040, 11)


In [172]:
# drop date column from merged_data
merged_data = merged_data.drop(['date'], axis=1)

In [173]:
merged_data.dtypes

Usage_kWh                               float64
Lagging_Current_Reactive.Power_kVarh    float64
Leading_Current_Reactive_Power_kVarh    float64
CO2(tCO2)                               float64
Lagging_Current_Power_Factor            float64
Leading_Current_Power_Factor            float64
NSM                                     float64
WeekStatus                               object
Day_of_week                              object
Load_Type                                object
dtype: object

Filling Numerical Columns using mean and categorical columns using mode

In [112]:
# loop through numerical columns
numeric_columns = merged_data.select_dtypes(include=[np.number]).columns.tolist()
for each in numeric_columns:
    mean_value = merged_data[each].mean()
    merged_data[each].fillna(mean_value, inplace=True)

# remove loadtype column from data
merged_data_X = merged_data.loc[:, merged_data.columns != 'Load_Type']
merged_data_y = merged_data[['Load_Type']]

# loop through non numeric columns
non_numeric_columns = merged_data_X.select_dtypes(exclude=[np.number]).columns.tolist()

for each in non_numeric_columns:
    mode_value = merged_data_X[each].mode()
    merged_data_X[each].fillna(mode_value[0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_X[each].fillna(mode_value[0], inplace=True)


One Hot Encoding

In [41]:
merged_data_X = pd.get_dummies(merged_data_X)
merged_data_X.shape

(35040, 16)

Applying Polynomial Featurs

In [12]:
# polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
merged_data_X_poly = poly.fit_transform(merged_data_X)


merged_data_X_poly = pd.DataFrame(merged_data_X_poly, columns = poly.get_feature_names_out())
# drop first column
merged_data_X_poly = merged_data_X_poly.drop(['1'], axis=1)

print(merged_data_X_poly.shape)

(35040, 152)


In [192]:
# split the dataframe into train and test again back to original
train_X = merged_data_X_poly.iloc[:train.shape[0], :]
test_X = merged_data_X_poly.iloc[train.shape[0]:, :]

print(train_X.shape, test_X.shape)

(25000, 152) (10040, 152)


In [193]:
# attach target variable of same shape as train_X to train_X dataframe
train_X['Load_Type'] = merged_data_y['Load_Type'].iloc[:train.shape[0]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X['Load_Type'] = merged_data_y['Load_Type'].iloc[:train.shape[0]]


Finding min max values of columns 

In [207]:
# finding min max of all columns in train.
min_max = {}
for each in train_X.columns:
    min_max[each] = [train_X[each].min(), train_X[each].max()]

converting the dataset to skewed form

In [42]:
# applying scip skewed on each numerical columns of merged_data_X
from scipy.stats import skew
for each in numeric_columns:
    skewed_value = skew(merged_data_X[each])
    if skewed_value > 0.5:
        merged_data_X[each] = np.log1p(merged_data_X[each])

# split the dataframe into train and test again back to original
train_X_with_skewed = merged_data_X.iloc[:train.shape[0], :]
test_X_with_skewed = merged_data_X.iloc[train.shape[0]:, :]

print(train_X_with_skewed.shape, test_X_with_skewed.shape)
# attach target variable of same shape as train_X to train_X dataframe
train_X_with_skewed['Load_Type'] = merged_data_y['Load_Type'].iloc[:train.shape[0]]

In [54]:
# label encode last column of train_X
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_X_with_skewed['Load_Type'] = le.fit_transform(train_X_with_skewed['Load_Type'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X_with_skewed['Load_Type'] = le.fit_transform(train_X_with_skewed['Load_Type'])


Best Gradient Boosting

In [60]:
# apply gradient boosting classifier on train_X and predict it using test_X
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(max_depth=5, n_estimators=150, learning_rate=0.05)
gbc.fit(train_X_with_skewed.iloc[:, :-1], train_X_with_skewed.iloc[:, -1])
y_pred = gbc.predict(test_X_with_skewed.iloc[:, :])

Gradient Boosting and skewed data with median

In [135]:
# loop through numerical columns
numeric_columns = merged_data.select_dtypes(include=[np.number]).columns.tolist()
for each in numeric_columns:
    mean_value = merged_data[each].median()
    merged_data[each].fillna(mean_value, inplace=True)

# remove loadtype column from data
merged_data_X = merged_data.loc[:, merged_data.columns != 'Load_Type']
merged_data_y = merged_data[['Load_Type']]

merged_data_X = pd.get_dummies(merged_data_X)

# loop through non numeric columns
non_numeric_columns = merged_data_X.select_dtypes(exclude=[np.number]).columns.tolist()

for each in non_numeric_columns:
    mode_value = merged_data_X[each].mode()
    merged_data_X[each].fillna(mode_value[0], inplace=True)

# split the dataframe into train and test again back to original
train_X = merged_data_X.iloc[:train.shape[0], :]
test_X = merged_data_X.iloc[train.shape[0]:, :]

# split the dataframe into train and test again back to original
train_X_with_skewed = merged_data_X.iloc[:train.shape[0], :]
test_X_with_skewed = merged_data_X.iloc[train.shape[0]:, :]

# attach target variable of same shape as train_X to train_X dataframe
train_X_with_skewed['Load_Type'] = merged_data_y['Load_Type'].iloc[:train.shape[0]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X_with_skewed['Load_Type'] = merged_data_y['Load_Type'].iloc[:train.shape[0]]


In [None]:
# label encode last column of train_X
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_X_with_skewed['Load_Type'] = le.fit_transform(train_X_with_skewed['Load_Type'])

In [102]:
# apply gradient boosting classifier on train_X and predict it using test_X
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(max_depth=10, n_estimators=200, learning_rate=0.1)
gbc.fit(train_X_with_skewed.iloc[:, :-1], train_X_with_skewed.iloc[:, -1])
y_pred = gbc.predict(test_X_with_skewed.iloc[:, :])

In [126]:
y_pred = pd.DataFrame(y_pred)
# rename column from 0 to load_type
y_pred.rename(columns={0: 'Load_Type'}, inplace=True)

# replace the ecoded value with original value of light_type using repalce
y_pred['Load_Type'] = y_pred['Load_Type'].replace({0.0: 'Light_Load', 2.0: 'Medium_Load', 1.0:"Maximum_Load"})

In [127]:
y_pred.to_csv('first_entry_with_stacking_lr_rf_dt_final_gb.csv')

Using Stacking (Final Model:Gradient Boosting while Logistic Regression, Decision Tree and Random Forest are base models)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

estimators = [
('lr', LogisticRegression()),
('dt', DecisionTreeClassifier(max_depth=5)),
('rf', RandomForestClassifier(max_depth=10, n_estimators=150))
]

reg_sr = StackingClassifier(estimators=estimators, final_estimator=GradientBoostingClassifier(max_depth=10, n_estimators=200))

reg_sr.fit(train_X_with_skewed.iloc[:, :-1], train_X_with_skewed.iloc[:, -1])

y_pred = reg_sr.predict(test_X_with_skewed.iloc[:, :])

The results of stacking were very bad

Applying HistGradientBoosting Classifier

In [139]:
from sklearn.ensemble import HistGradientBoostingClassifier

gbc = HistGradientBoostingClassifier(max_depth=20, max_iter=200, learning_rate=0.5)
gbc.fit(train_X_with_skewed.iloc[:, :-1], train_X_with_skewed.iloc[:, -1])
y_pred = gbc.predict(test_X_with_skewed.iloc[:, :])

The results of HisGradientBoosting were also very bad

Applying GradientBoosting Classifier with method bfill

In [174]:
numeric_columns = merged_data.select_dtypes(include=[np.number]).columns.tolist()
for each in numeric_columns:
    merged_data[each] = merged_data[each].fillna(method='bfill')

# remove loadtype column from data
merged_data_X = merged_data.loc[:, merged_data.columns != 'Load_Type']
merged_data_y = merged_data[['Load_Type']]

merged_data_X = pd.get_dummies(merged_data_X)

# loop through non numeric columns
non_numeric_columns = merged_data_X.select_dtypes(exclude=[np.number]).columns.tolist()

for each in non_numeric_columns:
    mode_value = merged_data_X[each].mode()
    merged_data_X[each].fillna(mode_value[0], inplace=True)

# split the dataframe into train and test again back to original
train_X = merged_data_X.iloc[:train.shape[0], :]
test_X = merged_data_X.iloc[train.shape[0]:, :]

# split the dataframe into train and test again back to original
train_X_with_skewed = merged_data_X.iloc[:train.shape[0], :]
test_X_with_skewed = merged_data_X.iloc[train.shape[0]:, :]

# attach target variable of same shape as train_X to train_X dataframe
train_X_with_skewed['Load_Type'] = merged_data_y['Load_Type'].iloc[:train.shape[0]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X_with_skewed['Load_Type'] = merged_data_y['Load_Type'].iloc[:train.shape[0]]


In [183]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(max_depth=10, n_estimators=150, learning_rate=0.1)
gbc.fit(train_X_with_skewed.iloc[:, :-1], train_X_with_skewed.iloc[:, -1])
y_pred = gbc.predict(test_X_with_skewed.iloc[:, :])

Best results achieved from Gradient Boosting Classifier with filling NAN Values using method = bfill (backward fill) 

Saving output data in CSV Formats

In [185]:
y_pred = pd.DataFrame(y_pred)
# rename column from 0 to load_type
y_pred.rename(columns={0: 'Load_Type'}, inplace=True)

# replace the ecoded value with original value of light_type using repalce
y_pred['Load_Type'] = y_pred['Load_Type'].replace({0.0: 'Light_Load', 2.0: 'Medium_Load', 1.0:"Maximum_Load"})

In [186]:
y_pred.to_csv('entry_with_gb_150_10_forwardfill.csv')