In [1]:
from google.colab import files
uploaded = files.upload()

Saving merged_df_cleaned.csv to merged_df_cleaned.csv


# Data Preparation

In [2]:
import pandas as pd

# Load the dataset
file_path = 'merged_df_cleaned.csv'
df = pd.read_csv(file_path)

# Display the first few rows to understand the structure and content
df.head()

Unnamed: 0,responder_id,birth_year,gender,industry_desc,occupation_desc,organization_size,manage_others,household,years_at_job,metro_or_regional,...,if_no_covid_i_would_have_choice_about_rw,productivity_remote_vs_office,inperson_hours__commuting,inperson_hours__working,inperson_hours__personal_family_time,inperson_hours_domestic_responsibilities,remote_hours_commuting,remote_hours_working,remote_hours_personal_family_time,remote_hours_domestic_responsibilities
0,0,1972,Female,Other,Clerical and administrative,Between 20 and 199,No,Couple with no dependent children,More than 5 years,Regional,...,Strongly disagree,much more productive,2.0,8.0,2.0,2.0,0.5,8.0,3.5,2.0
1,1,1972,Male,Other,Managers,Between 1 and 4,Yes,Couple with dependent children,More than 5 years,Metro,...,Somewhat agree,less productive,2.0,7.0,3.0,3.0,0.0,7.0,3.0,3.0
2,2,1982,Male,Other,Managers,More than 200,Yes,One parent family with dependent children,More than 5 years,Metro,...,Somewhat agree,less productive,6.0,1.0,6.0,5.0,5.0,2.0,7.0,7.0
3,3,1987,Female,"Professional, Scientific and Technical Services",Professionals,Between 20 and 199,No,Couple with dependent children,Between 1 and 5 years,Metro,...,Somewhat agree,same productivity,1.0,9.0,1.0,2.0,0.0,9.0,3.0,2.0
4,4,1991,Male,Other,Managers,Between 5 and 19,Yes,Couple with no dependent children,More than 5 years,Metro,...,Strongly disagree,more productive,1.0,8.0,3.5,2.0,0.0,6.0,4.0,3.0


In [4]:
# Data Cleaning

# Check for missing values
missing_values = df.isnull().sum()

# Summary of data types
data_types = df.dtypes

# Calculate the age of the respondents
df['age'] = 2021 - df['birth_year']

# Convert categorical columns to category type if they aren't already
categorical_columns = ['gender', 'industry_desc', 'occupation_desc', 'organization_size', 'manage_others',
                       'household', 'metro_or_regional', 'rw_percentage_2020', 'org_encouraged_rw',
                       'org_prepared_for_rw', 'rw_is_common_at_org', 'rw_permission_is_attainable',
                       'rw_collaboration_easy', 'if_no_covid_employer_encourage_rw',
                       'if_no_covid_employer_support_rw', 'if_no_covid_i_would_have_choice_about_rw',
                       'productivity_remote_vs_office']

# Adjust the list of categorical columns by removing the missing column
categorical_columns.remove('rw_percentage_2020')

# Convert the remaining categorical columns to category type
df[categorical_columns] = df[categorical_columns].astype('category')

# Re-check for missing values after converting to appropriate types
missing_values_after_conversion = df.isnull().sum()

# Display the first few rows of the cleaned dataframe and missing values summary
df_cleaned = df.head()

df_cleaned, missing_values, missing_values_after_conversion



(   responder_id  birth_year  gender  \
 0             0        1972  Female   
 1             1        1972    Male   
 2             2        1982    Male   
 3             3        1987  Female   
 4             4        1991    Male   
 
                                      industry_desc  \
 0                                            Other   
 1                                            Other   
 2                                            Other   
 3  Professional, Scientific and Technical Services   
 4                                            Other   
 
                 occupation_desc   organization_size manage_others  \
 0  Clerical and administrative   Between 20 and 199            No   
 1                      Managers     Between 1 and 4           Yes   
 2                      Managers       More than 200           Yes   
 3                 Professionals  Between 20 and 199            No   
 4                      Managers    Between 5 and 19           Yes   
 
    

 Implementing the ensemble algorithms, such as the Balanced Random Forest Classifier and Easy Ensemble AdaBoost classifier, to identify which features are most important in predicting productivity_remote_vs_office

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
from sklearn.metrics import classification_report
import numpy as np

# Encode the target variable
le = LabelEncoder()
df['productivity_remote_vs_office_encoded'] = le.fit_transform(df['productivity_remote_vs_office'])

# Prepare the feature set and target variable
X = df.drop(columns=['productivity_remote_vs_office', 'productivity_remote_vs_office_encoded', 'birth_year', 'responder_id'])
y = df['productivity_remote_vs_office_encoded']

# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Balanced Random Forest Classifier
brf = BalancedRandomForestClassifier(random_state=42, n_estimators=100)
brf.fit(X_train, y_train)

# Train an Easy Ensemble AdaBoost Classifier
ee = EasyEnsembleClassifier(random_state=42, n_estimators=100)
ee.fit(X_train, y_train)

# Predict and evaluate the models
y_pred_brf = brf.predict(X_test)
y_pred_ee = ee.predict(X_test)

report_brf = classification_report(y_test, y_pred_brf)
report_ee = classification_report(y_test, y_pred_ee)

# Extract and sort feature importances for Balanced Random Forest
importances_brf = brf.feature_importances_
sorted_brf_importances = sorted(zip(importances_brf, X.columns), reverse=True)[:7]

# Access and sort feature importances for Easy Ensemble AdaBoost
importances_ee = ee.estimators_[0].steps[-1][1].feature_importances_
sorted_ee_importances = sorted(zip(importances_ee, X.columns), reverse=True)[:7]

# Display the top 7 sorted feature importances for each algorithm
sorted_brf_importances, sorted_ee_importances



  warn(
  warn(
  warn(


([(0.050786925465934406, 'age'),
  (0.03868585048589016, 'remote_hours_personal_family_time'),
  (0.03771901650613855, 'remote_hours_working'),
  (0.03672205306274489, 'inperson_hours__working'),
  (0.0366803630038933, 'inperson_hours__personal_family_time'),
  (0.0348219846005409, 'inperson_hours__commuting'),
  (0.033243753408951505, 'remote_hours_domestic_responsibilities')],
 [(0.08622527427381553, 'preferred_rw_percentage_future_4-5 days'),
  (0.08232702105179819, 'rw_collaboration_easy_Strongly disagree'),
  (0.07906006260626616, 'rw_collaboration_easy_Somewhat disagree'),
  (0.059739873620779094, 'if_no_covid_employer_support_rw_Very likely'),
  (0.05801013503611908, 'inperson_hours__personal_family_time'),
  (0.0568674602116894, 'preferred_rw_percentage_2020_4-5 days'),
  (0.05669063037206387, 'remote_hours_commuting')])

In [28]:
# Install the ace module if you haven't already
!pip install ace --upgrade

# Convert classification reports to DataFrames for display
report_rf_df = pd.DataFrame(classification_report(y_test, y_pred_brf, output_dict=True)).transpose()
report_ab_df = pd.DataFrame(classification_report(y_test, y_pred_ee, output_dict=True)).transpose()

# Add a column to differentiate the algorithms
report_rf_df["Algorithm"] = "Random Forest"
report_ab_df["Algorithm"] = "AdaBoost"

# Combine the reports into one DataFrame for comparison
combined_reports_df = pd.concat([report_rf_df, report_ab_df])

# Display the combined table
combined_reports_df



Unnamed: 0,precision,recall,f1-score,support,Algorithm
0,0.232143,0.386139,0.289963,101.0,Random Forest
1,0.301587,0.308108,0.304813,185.0,Random Forest
2,0.081761,0.393939,0.135417,33.0,Random Forest
3,0.526549,0.351032,0.421239,339.0,Random Forest
4,0.365854,0.241935,0.291262,248.0,Random Forest
accuracy,0.317881,0.317881,0.317881,0.317881,Random Forest
macro avg,0.301579,0.336231,0.288539,906.0,Random Forest
weighted avg,0.387605,0.317881,0.336841,906.0,Random Forest
0,0.203488,0.346535,0.25641,101.0,AdaBoost
1,0.323944,0.248649,0.281346,185.0,AdaBoost


In [15]:
import pandas as pd

# Convert the top 5 sorted importances to DataFrames and add the algorithm type
df_brf_importances = pd.DataFrame(sorted_brf_importances, columns=["Importance", "Feature"])
df_brf_importances["Algorithm"] = "Balanced Random Forest"

df_ee_importances = pd.DataFrame(sorted_ee_importances, columns=["Importance", "Feature"])
df_ee_importances["Algorithm"] = "Easy Ensemble AdaBoost"

# Display the tables
df_brf_importances, df_ee_importances


(   Importance                                 Feature               Algorithm
 0    0.050787                                     age  Balanced Random Forest
 1    0.038686       remote_hours_personal_family_time  Balanced Random Forest
 2    0.037719                    remote_hours_working  Balanced Random Forest
 3    0.036722                 inperson_hours__working  Balanced Random Forest
 4    0.036680    inperson_hours__personal_family_time  Balanced Random Forest
 5    0.034822               inperson_hours__commuting  Balanced Random Forest
 6    0.033244  remote_hours_domestic_responsibilities  Balanced Random Forest,
    Importance                                      Feature  \
 0    0.086225      preferred_rw_percentage_future_4-5 days   
 1    0.082327      rw_collaboration_easy_Strongly disagree   
 2    0.079060      rw_collaboration_easy_Somewhat disagree   
 3    0.059740  if_no_covid_employer_support_rw_Very likely   
 4    0.058010         inperson_hours__personal_fami

In [16]:
df_brf_importances

Unnamed: 0,Importance,Feature,Algorithm
0,0.050787,age,Balanced Random Forest
1,0.038686,remote_hours_personal_family_time,Balanced Random Forest
2,0.037719,remote_hours_working,Balanced Random Forest
3,0.036722,inperson_hours__working,Balanced Random Forest
4,0.03668,inperson_hours__personal_family_time,Balanced Random Forest
5,0.034822,inperson_hours__commuting,Balanced Random Forest
6,0.033244,remote_hours_domestic_responsibilities,Balanced Random Forest


In [17]:
df_ee_importances

Unnamed: 0,Importance,Feature,Algorithm
0,0.086225,preferred_rw_percentage_future_4-5 days,Easy Ensemble AdaBoost
1,0.082327,rw_collaboration_easy_Strongly disagree,Easy Ensemble AdaBoost
2,0.07906,rw_collaboration_easy_Somewhat disagree,Easy Ensemble AdaBoost
3,0.05974,if_no_covid_employer_support_rw_Very likely,Easy Ensemble AdaBoost
4,0.05801,inperson_hours__personal_family_time,Easy Ensemble AdaBoost
5,0.056867,preferred_rw_percentage_2020_4-5 days,Easy Ensemble AdaBoost
6,0.056691,remote_hours_commuting,Easy Ensemble AdaBoost
