# Packages

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set_theme(style="white", context="notebook", palette="deep")
import joblib

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_validate, KFold

# Functions

In [7]:
def random_SCV(pipe = [],
               grid_param = [],
               n_iter = 10,
               cv = 5,
               scoring = "neg_mean_squared_error",
               rnd_state = 42,
               file_name = "",
               training = []):
    """Run a randomized hyper-parameter search and persist the fitted search.

    Parameters
    ----------
    pipe : estimator
        Estimator (typically a Pipeline) to tune.
    grid_param : dict or list of dict
        Parameter distributions sampled by ``RandomizedSearchCV``.
    n_iter : int
        Number of parameter settings to sample.
    cv : int or CV splitter
        Cross-validation strategy forwarded to the search.
    scoring : str
        Scoring name forwarded to the search.
    rnd_state : int
        Seed forwarded to the search so the sampled candidates are
        reproducible between runs.
    file_name : str
        File name handed to ``wr_pkl_file``; an empty name skips saving.
    training : sequence
        ``training[0]`` is the feature matrix, ``training[1]`` the target.

    Returns
    -------
    None
        The fitted search object is only persisted via ``wr_pkl_file``.
    """
    rnd_search = RandomizedSearchCV(
        pipe,
        param_distributions=grid_param,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        # Previously the seed was accepted but never used, so every run
        # sampled a different set of candidates.
        random_state=rnd_state,
        verbose=100000,
    )
    rnd_search.fit(training[0], training[1])
    wr_pkl_file(file_name, rnd_search, False)

def grid_SCV(pipe = [], grid_param = [], cv = 5, scoring = "neg_mean_squared_error", file_name = "", training = []):
    """Exhaustively search ``grid_param`` for ``pipe`` and persist the fitted search.

    ``training[0]`` is used as the feature matrix and ``training[1]`` as the
    target. The fitted ``GridSearchCV`` object is passed to ``wr_pkl_file``
    under ``file_name``; nothing is returned.
    """
    search = GridSearchCV(
        estimator=pipe,
        param_grid=grid_param,
        scoring=scoring,
        cv=cv,
        verbose=100,
    )
    inputs, targets = training[0], training[1]
    search.fit(inputs, targets)
    wr_pkl_file(file_name=file_name, content=search, read=False)

def wr_pkl_file(file_name = "", content = "", read = False):
    """Read or write a joblib file under ``./param_tuning/``.

    Parameters
    ----------
    file_name : str
        File name inside ``./param_tuning/``. An empty name makes the call a
        no-op that returns ``None``.
    content : object
        Object to persist when ``read`` is false. ``None`` and the default
        empty string mean "nothing to write".
    read : bool
        When true, load and return the stored object instead of writing.

    Returns
    -------
    object or None
        The loaded object when ``read`` is true; otherwise ``None``.
    """
    import os

    if not file_name:
        return None

    directory = "./param_tuning/"
    path_ = directory + file_name

    if read:
        # NOTE(security): joblib.load unpickles and can execute arbitrary
        # code; only load files this notebook produced itself.
        return joblib.load(path_)

    # Compare against the "nothing to write" sentinels explicitly. A plain
    # truthiness test raises ValueError for NumPy arrays / DataFrames and
    # silently drops legitimate falsy payloads such as an empty dict.
    if content is None or (isinstance(content, str) and content == ""):
        return None

    # joblib.dump does not create missing parent directories.
    os.makedirs(directory, exist_ok=True)
    joblib.dump(content, path_)
    return None

def print_results(labels = [], est = [], plt_num = 50, testing = []):
    """Print test metrics for each estimator and plot predictions vs. truth.

    Both the predictions and ``testing[1]`` are passed through ``np.exp``
    before scoring and plotting.
    NOTE(review): this presumes the estimators were trained on a
    log-transformed target — confirm against the training code.

    Parameters
    ----------
    labels : sequence of str
        Display name per estimator (any ":" is stripped for the plot title).
    est : sequence of fitted estimators
        Objects exposing ``predict``; paired with ``labels`` by position.
    plt_num : int
        Maximum number of leading test samples drawn in each subplot.
    testing : sequence
        ``testing[0]`` is the feature matrix, ``testing[1]`` the target.

    Returns
    -------
    None
    """
    plt.rcParams['figure.figsize'] = (30,15)

    pairs = list(zip(labels, est))
    if not pairs:
        return

    # Keep the historical 3-row layout, but grow it when more than three
    # estimators are passed (a fixed 3x1 grid fails on the fourth subplot).
    n_rows = max(3, len(pairs))

    # The ground truth does not depend on the estimator: transform it once.
    actual = np.exp(testing[1])
    actual_head = np.asarray(actual)[:plt_num]

    for position, (name, clf) in enumerate(pairs, start=1):
        print(name)

        predicted = np.exp(clf.predict(testing[0]))

        print("R-Squared: {:.3f}".format(r2_score(actual, predicted)))
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        mae = mean_absolute_error(actual, predicted)

        print("Root Mean Squared Error: {:,.3f}".format(rmse))
        print("Mean Absolute Error: {:,.3f}".format(mae))
        print()

        plt.subplot(n_rows, 1, position)

        plt.title(name.replace(":", ""), fontsize= 18)
        plt.xticks(fontsize = 18)
        plt.yticks(fontsize = 18)

        # Size the x axis from the data actually available so test sets
        # with fewer than plt_num rows do not cause a shape mismatch.
        predicted_head = np.asarray(predicted)[:plt_num]
        plt.plot(range(len(predicted_head)), predicted_head, "*-", label="model prediction")
        plt.plot(range(len(actual_head)), actual_head, "o--", label="true value")
        plt.legend()

def validation(models = [], estimators = [], training = [], cv = 5, train_score = False):
    """Cross-validate each estimator and print its averaged metrics.

    Parameters
    ----------
    models : sequence of str
        Display name per estimator.
    estimators : sequence of estimators
        Estimators to evaluate; must have the same length as ``models``.
    training : sequence
        ``training[0]`` is the feature matrix, ``training[1]`` the target.
    cv : int or CV splitter
        Cross-validation strategy forwarded to ``cross_validate``.
    train_score : bool
        Forwarded as ``return_train_score``.

    Returns
    -------
    None
        Results are printed. On a length mismatch an error message is
        printed and nothing is evaluated.
    """
    if len(models) != len(estimators):
        print("Error: Model Names And Estimator Must Have The Same Length")
        return

    metrics = ("r2", "neg_mean_squared_error", "neg_mean_absolute_error")

    for model, estimator in zip(models, estimators):
        scores = cross_validate(
            estimator,
            training[0],
            training[1],
            cv=cv,
            scoring=metrics,
            return_train_score=train_score,
        )

        # scikit-learn reports errors negated ("higher is better"); flip the
        # sign back before reporting. RMSE is averaged per fold.
        fold_rmse = np.sqrt(-scores["test_neg_mean_squared_error"])
        # Index the per-fold array first, then negate: negating the whole
        # scores dict raises TypeError.
        fold_mae = -scores["test_neg_mean_absolute_error"]

        print(model)
        print("R-Squared: {:,.3f}".format(np.mean(scores["test_r2"])))
        print("Root Mean Squared Error: {:,.3f}".format(np.mean(fold_rmse)))
        print("Mean Absolute Error: {:,.3f}".format(np.mean(fold_mae)))
        print()

# Data Cleaning

In [12]:
# Google Drive share link of the dataset CSV.
url = "https://drive.google.com/file/d/1Q3_zadThHAeiEJNU-ieZJy3oIFs5QXdp/view?usp=sharing"

# The file id is the second-to-last "/"-separated segment of the share link;
# it is appended to the direct-download endpoint so pandas can read the CSV.
file_id = url.split("/")[-2]
path = f"https://drive.google.com/uc?export=download&id={file_id}"

df = pd.read_csv(path)

# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41544 entries, 0 to 41543
Data columns (total 50 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   Total Household Income                   41544 non-null  int64 
 1   Region                                   41544 non-null  object
 2   Staple Food Expenditure                  41544 non-null  int64 
 3   Source of Income                         41544 non-null  object
 4   Agricultural Household indicator         41544 non-null  int64 
 5   Meat Expenditure                         41544 non-null  int64 
 6   Seafood Expenditure                      41544 non-null  int64 
 7   Leisure Expenditure                      41544 non-null  int64 
 8   Alcohol Expenditure                      41544 non-null  int64 
 9   Tobacco Expenditure                      41544 non-null  int64 
 10  Medical Expenditure                      41544 non-null  i

In [13]:
# Preview the first ten rows of the loaded frame.
df.head(10)

Unnamed: 0,Total Household Income,Region,Staple Food Expenditure,Source of Income,Agricultural Household indicator,Meat Expenditure,Seafood Expenditure,Leisure Expenditure,Alcohol Expenditure,Tobacco Expenditure,...,Number of Refrigerator/Freezer,Number of Washing Machine,Number of Airconditioner,"Number of Car, Jeep, Van",Number of Landline/wireless telephones,Number of Cellular phone,Number of Personal Computer,Number of Stove with Oven/Gas Range,Number of Motorized Banca,Number of Motorcycle/Tricycle
0,9370,CAR,1736,Wage,0,441,300,129,0,0,...,1,1,0,0,0,2,1,0,0,1
1,3540,CAR,718,Wage,0,311,198,27,17,38,...,0,1,0,0,0,3,1,0,0,2
2,1478,CAR,1281,Wage,1,139,46,9,5,81,...,0,0,0,0,0,0,0,0,0,0
3,2199,CAR,1273,Wage,0,195,193,9,9,0,...,0,0,0,0,0,1,0,0,0,0
4,4732,CAR,1386,Wage,0,328,202,18,19,0,...,1,0,0,0,0,3,0,0,0,1
5,2730,CAR,1177,Wage,0,274,153,11,3,4,...,0,1,0,0,0,4,0,0,0,1
6,4838,CAR,1705,Wage,0,232,220,114,34,0,...,0,0,0,0,0,2,0,0,0,1
7,3336,CAR,1414,Other sources of Income,1,261,284,27,9,0,...,0,1,0,0,0,2,0,0,0,1
8,3056,CAR,814,Other sources of Income,0,205,119,9,0,0,...,0,0,0,0,0,2,0,0,0,0
9,16763,CAR,2044,Enterpreneurial Activities,0,473,459,71,0,0,...,1,0,0,1,0,4,1,0,0,0


In [19]:
# Column to predict and the hand-picked predictor columns.
target = "Total Household Income"
features = [
    'Household Head Gender',
    'Household Head Age',
    'Household Head Marital Status',
    'Household Head Highest Grade Completed',
    'Household Head Occupation',
    'Household Head Class of Worker',
    'House Floor Area',
    'Tenure Status',
    'Staple Food Expenditure',
    'Meat Expenditure',
    'Seafood Expenditure',
    'Medical Expenditure',
    'Transportation Expenditure',
    'Utilities Expenditure',
    'Education Expenditure',
    'Number of Car, Jeep, Van',
    'Number of Personal Computer',
]

x, y = df[features], df[target]

# Split the predictors by dtype: numeric columns are scaled, object columns
# are one-hot encoded.
numerical_features = list(x.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(x.select_dtypes(include=['object']).columns)

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Preprocessing and the regressor live in one pipeline so the same
# transformations are applied at fit and predict time.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Hold out 20% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

model.fit(x_train, y_train)

# Importances follow the ColumnTransformer output order: the numeric columns
# first, then the one-hot columns produced by the fitted 'cat' encoder.
importances = model.named_steps['regressor'].feature_importances_
feature_names = numerical_features + list(
    model.named_steps['preprocessor']
    .named_transformers_['cat']
    .get_feature_names_out(categorical_features)
)
feature_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)

# Ten most important features.
print(feature_importances.head(10))

# Hold-out evaluation; Pipeline.score delegates to the regressor's score.
y_pred = model.predict(x_test)
print("Model score:", model.score(x_test, y_test))

Utilities Expenditure                                                              0.325462
Number of Car, Jeep, Van                                                           0.132219
Transportation Expenditure                                                         0.088139
Seafood Expenditure                                                                0.068062
Medical Expenditure                                                                0.051709
Staple Food Expenditure                                                            0.051085
Meat Expenditure                                                                   0.037360
Education Expenditure                                                              0.036943
Household Head Occupation_Street ambulant vendors                                  0.025072
Household Head Class of Worker_Employer in own family-operated farm or business    0.020256
dtype: float64
Model score: 0.5794407934904326


In [None]:
# Hyper-parameter grid for the pipeline's 'regressor' step.
# NOTE: 3 * 4 * 3 * 3 * 2 = 216 candidates x 3 folds = 648 forest fits, so
# this cell is expensive to run.
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__bootstrap': [True, False]
}

# Exhaustive search over the grid; with no `scoring` given, the pipeline's
# own score method is used.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the search on the training split only.
grid_search.fit(x_train, y_train)

# Best parameter combination and its mean cross-validated score.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Evaluate the refit best model on the hold-out split. Scoring x_train
# against y_test mixes splits of different lengths and raises ValueError,
# so both the prediction and the score must use the test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)
print("Model score with best parameters:", best_model.score(x_test, y_test))