In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

### Data Generation (synthetic)

In [2]:
# Candidate values for every column of the synthetic gas-usage dataset.
names = ["Abdulbasit", "Aisha", "Ahmed", "Joy", "Ngozi", "Chukwudi", "Fatima", "Segun", "Amaka", "Bola"]
locations = ["Lagos", "Abuja", "Kano", "Onitsha", "Kaduna", "Owerri", "Yaba, Lagos", "Minna, Niger State", "Aba", "Benin City"]
cylinder_sizes = ["5kg", "6kg", "8kg", "10kg", "12.5kg"]
refill_quantities = ["4kg", "5kg", "7kg", "8kg", "12kg"]
meal_types = ["light meals", "heavy meals", "a mix of both"]
household_sizes = ["single-person home", "family of 2", "family of 3", "family of 4", "family of 5", "family of 6"]
price_per_kg = ["₦800", "₦850", "₦900", "₦950", "₦1000"]
refill_frequencies = ["weekly", "bi-weekly", "monthly"]
distances_to_refill = ["1km", "2.5km", "3km", "5km"]
weather_conditions = ["sunny", "rainy", "dry season", "harmattan"]
special_events = ["wedding", "birthday party", "Christmas", "Ramadan celebration", "No special event"]
stove_types = ["single-burner", "double-burner", "electric stove", "gas cooker with oven"]
depletion_cases = ["running out of gas", "refilling on time", "delayed refills"]
shared_cylinder_cases = ["shared with neighbors", "used by multiple households", "individual use"]
timestamps = ["2024-12-01 08:30:00", "2024-12-02 14:45:00", "2024-12-03 20:15:00", "2024-12-04 10:00:00"]

# Number of synthetic rows to generate.
N_ROWS = 500_000

# Output column name -> candidate values, listed in the column order of the final frame.
column_values = {
    "name": names,
    "location": locations,
    "cylinder_size": cylinder_sizes,
    "refill_quantity": refill_quantities,
    "meal_type": meal_types,
    "household_size": household_sizes,
    "price_per_kg": price_per_kg,
    "refill_frequency": refill_frequencies,
    "distance_to_refill": distances_to_refill,
    "weather_condition": weather_conditions,
    "special_event": special_events,
    "stove_type": stove_types,
    "depletion_case": depletion_cases,
    "shared_cylinder_case": shared_cylinder_cases,
    "timestamp": timestamps,
}

# Generate enhanced scenarios: row i takes element `i % len(values)` of each list.
# The cyclic pick is done with one vectorised index per column instead of
# building one Python dict per row.
# NOTE: every column is a deterministic function of the row index, so the frame
# repeats with period lcm(10, 5, 3, 6, 4) = 60 and holds only 60 distinct rows;
# columns whose lists have the same length always move together.
row_index = np.arange(N_ROWS)
df = pd.DataFrame({
    column: np.array(values, dtype=object)[row_index % len(values)]
    for column, values in column_values.items()
})
df.head()

Unnamed: 0,name,location,cylinder_size,refill_quantity,meal_type,household_size,price_per_kg,refill_frequency,distance_to_refill,weather_condition,special_event,stove_type,depletion_case,shared_cylinder_case,timestamp
0,Abdulbasit,Lagos,5kg,4kg,light meals,single-person home,₦800,weekly,1km,sunny,wedding,single-burner,running out of gas,shared with neighbors,2024-12-01 08:30:00
1,Aisha,Abuja,6kg,5kg,heavy meals,family of 2,₦850,bi-weekly,2.5km,rainy,birthday party,double-burner,refilling on time,used by multiple households,2024-12-02 14:45:00
2,Ahmed,Kano,8kg,7kg,a mix of both,family of 3,₦900,monthly,3km,dry season,Christmas,electric stove,delayed refills,individual use,2024-12-03 20:15:00
3,Joy,Onitsha,10kg,8kg,light meals,family of 4,₦950,weekly,5km,harmattan,Ramadan celebration,gas cooker with oven,running out of gas,shared with neighbors,2024-12-04 10:00:00
4,Ngozi,Kaduna,12.5kg,12kg,heavy meals,family of 5,₦1000,bi-weekly,1km,sunny,No special event,single-burner,refilling on time,used by multiple households,2024-12-01 08:30:00


In [3]:
# Sanity-check the generated frame's dimensions as (rows, columns).
df.shape

(500000, 15)

In [24]:
# Save the generated frame to a CSV file; index=False leaves the row index out of the file.
df.to_csv("Kike_AI_Data.csv", index=False)

## PROJECT OBJECTIVES

1. To predict the level of LPG remaining in the gas cylinder after a certain period of time
2. To predict the likely time the gas will finish (and to be able to send reminders)

### Data Preprocessing

In [4]:
# Read the saved CSV into `data` (the frame every later cell works on) and preview the first rows.
data = pd.read_csv('Kike_AI_Data.csv')
data.head()

Unnamed: 0,name,location,cylinder_size,refill_quantity,meal_type,household_size,price_per_kg,refill_frequency,distance_to_refill,weather_condition,special_event,stove_type,depletion_case,shared_cylinder_case,timestamp
0,Abdulbasit,Lagos,5kg,4kg,light meals,single-person home,₦800,weekly,1km,sunny,wedding,single-burner,running out of gas,shared with neighbors,2024-12-01 08:30:00
1,Aisha,Abuja,6kg,5kg,heavy meals,family of 2,₦850,bi-weekly,2.5km,rainy,birthday party,double-burner,refilling on time,used by multiple households,2024-12-02 14:45:00
2,Ahmed,Kano,8kg,7kg,a mix of both,family of 3,₦900,monthly,3km,dry season,Christmas,electric stove,delayed refills,individual use,2024-12-03 20:15:00
3,Joy,Onitsha,10kg,8kg,light meals,family of 4,₦950,weekly,5km,harmattan,Ramadan celebration,gas cooker with oven,running out of gas,shared with neighbors,2024-12-04 10:00:00
4,Ngozi,Kaduna,12.5kg,12kg,heavy meals,family of 5,₦1000,bi-weekly,1km,sunny,No special event,single-burner,refilling on time,used by multiple households,2024-12-01 08:30:00


In [5]:
# Preprocessing: remove the unit text from each measurement column, then cast what is left to float.
unit_text_by_column = {
    'cylinder_size': 'kg',
    'refill_quantity': 'kg',
    'price_per_kg': '₦',
    'distance_to_refill': 'km',
}
for column, unit_text in unit_text_by_column.items():
    data[column] = data[column].str.replace(unit_text, '').astype(float)


In [6]:
# Preview the frame after the unit text was stripped from the four measurement columns.
data.head()

Unnamed: 0,name,location,cylinder_size,refill_quantity,meal_type,household_size,price_per_kg,refill_frequency,distance_to_refill,weather_condition,special_event,stove_type,depletion_case,shared_cylinder_case,timestamp
0,Abdulbasit,Lagos,5.0,4.0,light meals,single-person home,800.0,weekly,1.0,sunny,wedding,single-burner,running out of gas,shared with neighbors,2024-12-01 08:30:00
1,Aisha,Abuja,6.0,5.0,heavy meals,family of 2,850.0,bi-weekly,2.5,rainy,birthday party,double-burner,refilling on time,used by multiple households,2024-12-02 14:45:00
2,Ahmed,Kano,8.0,7.0,a mix of both,family of 3,900.0,monthly,3.0,dry season,Christmas,electric stove,delayed refills,individual use,2024-12-03 20:15:00
3,Joy,Onitsha,10.0,8.0,light meals,family of 4,950.0,weekly,5.0,harmattan,Ramadan celebration,gas cooker with oven,running out of gas,shared with neighbors,2024-12-04 10:00:00
4,Ngozi,Kaduna,12.5,12.0,heavy meals,family of 5,1000.0,bi-weekly,1.0,sunny,No special event,single-burner,refilling on time,used by multiple households,2024-12-01 08:30:00


In [7]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 15 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   name                  500000 non-null  object 
 1   location              500000 non-null  object 
 2   cylinder_size         500000 non-null  float64
 3   refill_quantity       500000 non-null  float64
 4   meal_type             500000 non-null  object 
 5   household_size        500000 non-null  object 
 6   price_per_kg          500000 non-null  float64
 7   refill_frequency      500000 non-null  object 
 8   distance_to_refill    500000 non-null  float64
 9   weather_condition     500000 non-null  object 
 10  special_event         500000 non-null  object 
 11  stove_type            500000 non-null  object 
 12  depletion_case        500000 non-null  object 
 13  shared_cylinder_case  500000 non-null  object 
 14  timestamp             500000 non-null  object 
dtype

### Encoding Categorical variables

In [8]:
# Integer-encode each categorical column with its OWN LabelEncoder and keep the
# fitted encoders, so the label <-> code mapping of every column stays available
# (e.g. `label_encoders[col].inverse_transform(...)`, or encoding new inputs).
# Refitting a single shared encoder would retain only the last column's mapping.
# NOTE: LabelEncoder numbers labels in sorted order, so the codes carry no
# ordinal meaning (for household_size, 'single-person home' gets the highest code).
categorical_columns = ['meal_type', 'household_size', 'weather_condition', 'special_event', 'stove_type', 'depletion_case', 'shared_cylinder_case']
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

In [9]:
# Replace each refill-frequency label with its interval length in days.
days_between_refills = {'weekly': 7, 'bi-weekly': 14, 'monthly': 30}
data['refill_frequency'] = data['refill_frequency'].map(days_between_refills)

In [10]:
# Preview the frame after encoding the categorical columns and mapping refill_frequency to days.
data.head()

Unnamed: 0,name,location,cylinder_size,refill_quantity,meal_type,household_size,price_per_kg,refill_frequency,distance_to_refill,weather_condition,special_event,stove_type,depletion_case,shared_cylinder_case,timestamp
0,Abdulbasit,Lagos,5.0,4.0,2,5,800.0,7,1.0,3,4,3,2,1,2024-12-01 08:30:00
1,Aisha,Abuja,6.0,5.0,1,0,850.0,14,2.5,2,3,0,1,2,2024-12-02 14:45:00
2,Ahmed,Kano,8.0,7.0,0,1,900.0,30,3.0,0,0,1,0,0,2024-12-03 20:15:00
3,Joy,Onitsha,10.0,8.0,2,2,950.0,7,5.0,1,2,2,2,1,2024-12-04 10:00:00
4,Ngozi,Kaduna,12.5,12.0,1,3,1000.0,14,1.0,3,1,3,1,2,2024-12-01 08:30:00


### Feature Engineering

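Create the derived columns the models need: the average daily gas consumption and, from it, the three prediction targets (remaining gas, remaining gas percentage, and days to empty).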

In [11]:
# Average consumption per day: the refill quantity divided by the number of days between refills.
data['daily_consumption'] = data['refill_quantity'].div(data['refill_frequency'])

In [12]:
# Preview of the subtraction that the next cell stores as 'remaining_gas'; the result is only displayed here, not assigned.
data['cylinder_size'] - data['daily_consumption']

0          4.428571
1          5.642857
2          7.766667
3          8.857143
4         11.642857
            ...    
499995     4.428571
499996     5.642857
499997     7.766667
499998     8.857143
499999    11.642857
Length: 500000, dtype: float64

In [13]:
# Target Variables
# All three targets derive from the same difference, so compute it once.
gas_left = data['cylinder_size'].sub(data['daily_consumption'])
data['remaining_gas'] = gas_left
# The remaining amount as a percentage of the cylinder size.
data['remaining_gas_percentage'] = gas_left.div(data['cylinder_size']).mul(100)
# Days the remaining amount lasts at the daily consumption rate.
data['days_to_empty'] = gas_left.div(data['daily_consumption'])


In [14]:
# Preview the frame with the engineered columns (daily_consumption and the three targets).
data.head()

Unnamed: 0,name,location,cylinder_size,refill_quantity,meal_type,household_size,price_per_kg,refill_frequency,distance_to_refill,weather_condition,special_event,stove_type,depletion_case,shared_cylinder_case,timestamp,daily_consumption,remaining_gas,remaining_gas_percentage,days_to_empty
0,Abdulbasit,Lagos,5.0,4.0,2,5,800.0,7,1.0,3,4,3,2,1,2024-12-01 08:30:00,0.571429,4.428571,88.571429,7.75
1,Aisha,Abuja,6.0,5.0,1,0,850.0,14,2.5,2,3,0,1,2,2024-12-02 14:45:00,0.357143,5.642857,94.047619,15.8
2,Ahmed,Kano,8.0,7.0,0,1,900.0,30,3.0,0,0,1,0,0,2024-12-03 20:15:00,0.233333,7.766667,97.083333,33.285714
3,Joy,Onitsha,10.0,8.0,2,2,950.0,7,5.0,1,2,2,2,1,2024-12-04 10:00:00,1.142857,8.857143,88.571429,7.75
4,Ngozi,Kaduna,12.5,12.0,1,3,1000.0,14,1.0,3,1,3,1,2,2024-12-01 08:30:00,0.857143,11.642857,93.142857,13.583333


In [15]:
# Drop the 'name' column. Rebinding `data` (rather than dropping in place) together
# with errors='ignore' keeps this cell safe to re-run: a second run is a no-op
# instead of raising KeyError because the column is already gone.
data = data.drop(columns='name', errors='ignore')

In [16]:
# Preview the frame after the 'name' column was dropped.
data.head()

Unnamed: 0,location,cylinder_size,refill_quantity,meal_type,household_size,price_per_kg,refill_frequency,distance_to_refill,weather_condition,special_event,stove_type,depletion_case,shared_cylinder_case,timestamp,daily_consumption,remaining_gas,remaining_gas_percentage,days_to_empty
0,Lagos,5.0,4.0,2,5,800.0,7,1.0,3,4,3,2,1,2024-12-01 08:30:00,0.571429,4.428571,88.571429,7.75
1,Abuja,6.0,5.0,1,0,850.0,14,2.5,2,3,0,1,2,2024-12-02 14:45:00,0.357143,5.642857,94.047619,15.8
2,Kano,8.0,7.0,0,1,900.0,30,3.0,0,0,1,0,0,2024-12-03 20:15:00,0.233333,7.766667,97.083333,33.285714
3,Onitsha,10.0,8.0,2,2,950.0,7,5.0,1,2,2,2,1,2024-12-04 10:00:00,1.142857,8.857143,88.571429,7.75
4,Kaduna,12.5,12.0,1,3,1000.0,14,1.0,3,1,3,1,2,2024-12-01 08:30:00,0.857143,11.642857,93.142857,13.583333


### Model Building

In [17]:
# Splitting data
# Feature matrix shared by all three models, plus one target series per model.
# NOTE(review): 'remaining_gas' is computed as cylinder_size - daily_consumption
# and the other two targets are computed from the same columns, while both
# 'cylinder_size' and 'daily_consumption' are included in X. The targets are
# therefore exact functions of the features, so a near-zero MAE is expected and
# says little about real-world predictive power.
feature_columns = ['cylinder_size', 'refill_quantity', 'meal_type', 'household_size', 'daily_consumption','special_event','stove_type','shared_cylinder_case']
X = data[feature_columns]
y_gas_level = data['remaining_gas']
y_gas_percentage = data['remaining_gas_percentage']
y_days_to_empty = data['days_to_empty']

# A single call splits X and all three targets on the same shuffled row indices,
# so X_train / X_test are guaranteed to line up with every y_* split. Separate
# calls would stay aligned only as long as their random_state values matched.
(
    X_train, X_test,
    y_train_gas, y_test_gas,
    y_train_percentage, y_test_percentage,
    y_train_days, y_test_days,
) = train_test_split(
    X, y_gas_level, y_gas_percentage, y_days_to_empty,
    test_size=0.2, random_state=42,
)

In [18]:
def fit_and_report(label, y_train, y_test):
    """Fit a random forest on X_train and print its test-set MAE.

    Parameters
    ----------
    label : str
        Text placed before " MAE:" in the printed line.
    y_train, y_test : array-like
        Target values aligned with X_train and X_test respectively.

    Returns
    -------
    tuple
        (fitted RandomForestRegressor, predictions for X_test).
    """
    # A fixed random_state makes the fitted forest, its MAE and the pickled
    # model reproducible from one run to the next.
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"{label} MAE:", mean_absolute_error(y_test, predictions))
    return model, predictions


# Model 1: Predict Remaining Gas
model_gas, pred_gas = fit_and_report("Remaining Gas Level", y_train_gas, y_test_gas)

# Model 2: Predict Remaining Gas Percentage
model_percentage, pred_percentage = fit_and_report("Remaining Gas Percentage", y_train_percentage, y_test_percentage)

# Model 3: Predict Days to Depletion
model_days, pred_days = fit_and_report("Days to Depletion", y_train_days, y_test_days)

Remaining Gas Level MAE: 3.751816013419784e-13
Remaining Gas Percentage MAE: 3.07029822010918e-12
Days to Depletion MAE: 2.788724184910052e-13


In [None]:
# prediction

### Performance Evaluation

In [19]:
from sklearn.model_selection import cross_val_score


# Function to display feature importance
def display_feature_importance(model, feature_names, model_name):
    """Print and return a fitted model's feature importances, largest first.

    Parameters
    ----------
    model : fitted estimator
        Must expose `feature_importances_`.
    feature_names : sequence of str
        Names of the features, in the column order the model was fitted on.
    model_name : str
        Label used in the printed heading.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature' and 'Importance', sorted by descending importance.
    """
    importance = model.feature_importances_
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': importance
    }).sort_values(by='Importance', ascending=False)
    print(f"\nFeature Importance for {model_name}:")
    print(feature_importance)
    return feature_importance


def cross_validate_and_rank(model, target, model_name):
    """Print the 5-fold cross-validated MAE of `model` on (X, target), then its feature importances.

    Returns
    -------
    tuple
        (array of per-fold negated-MAE scores, feature-importance DataFrame).
    """
    # The scorer returns negated MAE (higher is better), hence the sign flip below.
    fold_scores = cross_val_score(model, X, target, cv=5, scoring='neg_mean_absolute_error')
    print(f"Cross-Validation MAE for {model_name}: {np.mean(-fold_scores):.4f}")
    importance_table = display_feature_importance(model, feature_names, model_name)
    return fold_scores, importance_table


# Perform Cross-Validation and Feature Importance for each model.
# The names are read from X itself so they cannot drift from the columns the
# models were actually trained on.
feature_names = list(X.columns)

# Remaining Gas Level
cv_gas, gas_feature_importance = cross_validate_and_rank(model_gas, y_gas_level, "Remaining Gas Level")

# Remaining Gas Percentage
cv_percentage, percentage_feature_importance = cross_validate_and_rank(model_percentage, y_gas_percentage, "Remaining Gas Percentage")

# Days to Depletion
cv_days, days_feature_importance = cross_validate_and_rank(model_days, y_days_to_empty, "Days to Depletion")


Cross-Validation MAE for Remaining Gas Level: 0.0000

Feature Importance for Remaining Gas Level:
                Feature    Importance
0         cylinder_size  4.802493e-01
1       refill_quantity  4.221742e-01
5         special_event  7.811570e-02
2             meal_type  9.766602e-03
4     daily_consumption  9.104784e-03
7  shared_cylinder_case  5.893978e-04
6            stove_type  3.081740e-12
3        household_size  1.117970e-12
Cross-Validation MAE for Remaining Gas Percentage: 0.0000

Feature Importance for Remaining Gas Percentage:
                Feature    Importance
2             meal_type  9.205387e-01
7  shared_cylinder_case  5.865954e-02
0         cylinder_size  6.303604e-03
4     daily_consumption  6.054785e-03
1       refill_quantity  4.427432e-03
5         special_event  4.015979e-03
6            stove_type  2.985446e-10
3        household_size  1.391743e-10
Cross-Validation MAE for Days to Depletion: 0.0000

Feature Importance for Days to Depletion:
                

## Pickling the models

In [20]:
import pickle

# Save the models to pickle files: output file name -> fitted model to write there.
model_files = {
    "Remaining_Gas_Model.pkl": model_gas,
    "Remaining_Gas_Percentage_Model.pkl": model_percentage,
    "Days_to_Depletion_Model.pkl": model_days,
}
for file_name, fitted_model in model_files.items():
    # Binary write mode is required by pickle; the `with` block closes each file.
    with open(file_name, "wb") as model_file:
        pickle.dump(fitted_model, model_file)

print("Models successfully saved to pickle files.")


Models successfully saved to pickle files.
