In [16]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Folder holding the contest CSVs. Set the ML_CONTEST_DATA_DIR environment
# variable to point somewhere else; when it is unset the original local folder
# is used, so existing runs keep working unchanged.
DATA_DIR = Path(
    os.environ.get(
        'ML_CONTEST_DATA_DIR',
        r'C:/Users/LENOVO/OneDrive/Desktop/ML_Contest/data',
    )
)

# Load the three contest files from the single configured folder.
train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')
sample_submission = pd.read_csv(DATA_DIR / 'sample_submission.csv')

# Quick sanity check: frame shapes, then a preview of the training rows.
print(train_df.shape, test_df.shape)
train_df.head()

(14000, 20) (6000, 19)


Unnamed: 0,ID,electricity_kwh_per_month,natural_gas_therms_per_month,vehicle_miles_per_month,house_area_sqft,water_usage_liters_per_day,public_transport_usage_per_week,household_size,home_insulation_quality,meat_consumption_kg_per_week,laundry_loads_per_week,recycles_regularly,composts_organic_waste,uses_solar_panels,energy_efficient_appliances,heating_type,diet_type,owns_pet,smart_thermostat_installed,carbon_footprint
0,0xd6c,759.7,55.95,944.55,2422.07,541.27,1,3,2,4.23,9,1.0,0.0,0,1.0,gas,vegetarian,1,,830.1
1,0x3fdf,387.06,70.59,1280.85,1995.3,280.39,1,2,1,3.27,8,0.0,0.0,0,0.0,electric,vegetarian,0,0.0,963.08
2,0x3b08,594.25,29.14,1005.72,2673.55,416.14,0,2,3,2.87,3,0.0,1.0,0,1.0,electric,omnivore,1,1.0,840.11
3,0x31e5,503.76,74.68,1049.46,2994.28,530.13,0,5,1,3.22,9,1.0,0.0,0,0.0,electric,omnivore,1,0.0,1252.42
4,0x397e,549.54,-77.0,756.49,2549.57,604.1,5,4,4,2.92,2,1.0,0.0,0,1.0,electric,vegetarian,0,1.0,580.74


In [17]:
# Name of the column the model is trained to predict
target = "carbon_footprint"

In [18]:
# Split the training frame into label and features.
# The features are every column except the row identifier and the label.
# Note: X and y are assigned again further down, after the type clean-up.
ID_col = 'ID'
y = train_df[target]
X = train_df.drop(columns=[ID_col, target])

In [19]:
# Column groups consumed by the preprocessing step.

# Columns routed to the scaler.
numerical_features = [
    'electricity_kwh_per_month',
    'natural_gas_therms_per_month',
    'vehicle_miles_per_month',
    'house_area_sqft',
    'water_usage_liters_per_day',
    'public_transport_usage_per_week',
    'household_size',
    'home_insulation_quality',
    'meat_consumption_kg_per_week',
    'laundry_loads_per_week',
]

# Columns routed to the one-hot encoder.
categorical_features = [
    'recycles_regularly',
    'composts_organic_waste',
    'uses_solar_panels',
    'energy_efficient_appliances',
    'heating_type',
    'diet_type',
    'owns_pet',
    'smart_thermostat_installed',
]

Creating pipeline

In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

In [21]:
# Column-wise preprocessing:
#   'num' -> standardise the numerical columns
#   'cat' -> one-hot encode the categorical columns; categories not seen
#            during fit are ignored at transform time, and the output is dense
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    (
        'cat',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
        categorical_features,
    ),
])

In [22]:
# End-to-end model: preprocessing followed by a random forest regressor.
# The fixed random_state keeps the forest reproducible between runs.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

In [23]:
# Show the distinct values of every object-typed column so that columns which
# should be numeric but contain stray strings can be spotted.
object_columns = [name for name in train_df.columns if train_df[name].dtype == object]
for name in object_columns:
    print(f"{name}:")
    print(train_df[name].unique())
    print()


ID:
['0xd6c' '0x3fdf' '0x3b08' ... '0x3846' '0x4ba4' '0x2dc0']

house_area_sqft:
['2422.07' '1995.3' '2673.55' ... '3357.9' '939.08' '1956.39']

household_size:
['3' '2' '5' '4' '6' '1' '7' '?G_m7' '8' 'u]_o~' '!=GtK' 'n?%W*' '4l]Qf'
 'U8opb' '{d~Ul' '?%W:p' '=w\\b}' 'w}1=^' 'Qc#+L' "+v'<n" 'UYH<}' '&GQD!'
 '/3+J,' 'T}@*m' '9' ',6j1}' ';#G1b' '75@Aa' '&}%ZS' 'cuz\\^' 'AUJ"o'
 'uQ_k+' 'Mmr%u' 'q/IrZ' 'H~c)|' '?cX:}' ']1)q(' 'V5MG0' 'Q{gp@' 'r}\\!v'
 'q0mSo' 'CG`57' 'T%cA[' "+'#p~" '_D/S1' '_mBeM' '&^V7z' 'J%%9N' '28O.o'
 '}L@5{' 'bdvX#' '2KkO2' 'y=NoP' 'adq8.' 'LGoru' 'V5!=T' "I'(P'" 'CW(#w'
 'aJQ/U' '|JxeV' 'SaP}!' 'fi]Ob' '10' '2v.\\O' 'o-<B]' 'R]*K>' 'Y68o5'
 'AR(=~' 'Rnt^Z' '!NX?_' 'H&BPy' 'j!hxR' 'N_uFv' '-VPUo' '=Xn\\e' 'Snt_~'
 '`ybnH' 'KMIqi' 'M5X8w' 'U(+rW' 'HKUV_' 'O;&iz' '5~|L$' '1Ab"^' '$!u[-'
 "ySy'2" 't`{&4' '`|f"(' 'iemn%' 'i$>bP' 'q=T=c' 'lYU7b' '2Mbo=' ',X)dx'
 'u"_23' '.g1/+' 'tnF!J' '[0<-f' 'AWwr;' 'EE|k)' '7:0Qm' "/XXN'" 'J!y(j'
 ':@lIk' '<J2"b' '$qR,$' '@|&A}' 'Or`C

In [24]:
# Columns that should hold numbers but were read as object dtype.
# NOTE(review): 'smart_thermostat_installed' is also listed in
# categorical_features — confirm that coercing it here is intended.
numeric_object_columns = [
    'house_area_sqft',
    'household_size',
    'electricity_kwh_per_month',
    'natural_gas_therms_per_month',
    'vehicle_miles_per_month',
    'water_usage_liters_per_day',
    'meat_consumption_kg_per_week',
    'laundry_loads_per_week',
    'smart_thermostat_installed',
]

# Coerce each listed column in both frames; anything that cannot be parsed
# as a number becomes NaN (errors='coerce').
for frame in (train_df, test_df):
    for column in numeric_object_columns:
        frame[column] = pd.to_numeric(frame[column], errors='coerce')


In [25]:
# Impute missing values in the coerced columns with the per-column medians of
# the training frame. The same training medians are applied to the test frame.
train_medians = train_df[numeric_object_columns].median()
train_df[numeric_object_columns] = train_df[numeric_object_columns].fillna(train_medians)
test_df[numeric_object_columns] = test_df[numeric_object_columns].fillna(train_medians)


Train model


In [27]:
# Rebuild the training matrix from the cleaned frame: every column except the
# row identifier and the label is a feature.
X = train_df.drop(columns=[ID_col, target])
y = train_df[target]

# Fit before any prediction. Calling predict() on an unfitted pipeline raises
# NotFittedError on a fresh kernel, so the premature predict() call that used
# to precede this line has been removed; predictions are made after fitting.
pipeline.fit(X, y)


In [28]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

from sklearn.model_selection import cross_val_predict

# In-sample predictions: the pipeline was fitted on this same X, so the three
# metrics printed first describe fit to the training data, not generalisation.
y_pred = pipeline.predict(X)

# Evaluate (training data)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")

# Out-of-fold predictions: each row is predicted by a clone of the pipeline
# fitted on the other folds, giving an estimate of error on unseen data.
# The already-fitted `pipeline` object itself is left untouched.
y_pred_cv = cross_val_predict(pipeline, X, y, cv=5)

cv_mse = mean_squared_error(y, y_pred_cv)
cv_rmse = np.sqrt(cv_mse)
cv_mae = mean_absolute_error(y, y_pred_cv)

print(f"CV RMSE: {cv_rmse:.2f}")
print(f"CV MAE: {cv_mae:.2f}")
print(f"CV MSE: {cv_mse:.2f}")


RMSE: 27.45
MAE: 16.58
MSE: 753.30


In [29]:
# Feature matrix for the test set: everything except the identifier column
X_test = test_df.drop(columns='ID')
predictions = pipeline.predict(X_test)

# Submission layout: the test IDs next to their predicted footprint
submission = test_df[['ID']].assign(carbon_footprint=predictions)

# Write without the index so the file has exactly the two columns above
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


Submission file saved as 'submission.csv'


In [30]:
import joblib
# Persist the fitted pipeline (preprocessing + regressor) to disk
joblib.dump(value=pipeline, filename='carbon_footprint_model.pkl')


['carbon_footprint_model.pkl']

In [31]:
# R² computed by hand from the mse produced in the evaluation cell, which was
# measured on the training rows — so this is an in-sample R².

# Total sum of squares: population variance of the targets times their count
sst = len(y) * np.var(y)

# Residual sum of squares: mean squared error times the number of targets
sse = len(y) * mse

# R² = 1 - SSE / SST
r2 = 1 - sse / sst

print(f"R²: {r2:.2f}")


R²: 0.98
