# Model selection 
For the classification of 'bank deposit sales calls'the model should suit teh following characterstiscs
1-Binary Classification: Outcome call yes=deposit sales call successful, no==deposit sales call failed
2-Features 15, both categorical and numerical
3-no sequence in data, all instances are indipendent of eachother
4-a high Recall takes priority over  ‘Precision’ (True Positive as high as possible , while accepting some False Positives , which will result in high recall and a lower Precision). This is due to the nature of the classification, we want to reach as many ‘Success; outcome as possible, accepting that some will eventually be ‘Failures’ 
5-Hyperparamets should be present and tunable

# Model selected: Random forest

Data exploration and cleaning: removing and replacing NAN values and dropping one column with sparse values. Several feature-distributions are right skewed and needs log transformation before fitting a model. Outliers are rare and contains logical values and are mostly kept in the dataset. The feature ‘duration ‘of a call is dropped, since it cannot be known before making a call

Log transform improves the base modes Accuracy from 0.88 to 0.89. Log transform is kept before tuning the base model

Hyperparameter n-estimators (number of nodes in the random tree) is kept low=10, in the base model for tuning-training purposes. 

In [6]:
import pandas as pd

# Load the combined dataset
csv_path = 'data/processed/train_clean.csv'
df_train = pd.read_csv(csv_path)

# Split into features (X) and target (y)
X= df_train.drop(columns=['y'])  # Assuming the target column is named 'target'
y= df_train['y']  # Extract the target column

print("Features (X):", X.head())
print("Target (y):", y.head())

Features (X):    age          job  marital  education default  balance housing loan  \
0   41  blue-collar  married    primary      no      849     yes   no   
1   49   technician  married    primary      no     1415     yes   no   
2   42       admin.  married  secondary      no     3842      no   no   
3   37   management   single   tertiary      no     -119     yes   no   
4   56  blue-collar  married    primary      no     3498      no   no   

    contact  day_of_week month  campaign  pdays  previous  
0   unknown           15   may         1     -1         0  
1  cellular           30   jul         2     -1         0  
2  cellular           31   jul         4     -1         0  
3   unknown           11   jun        11     -1         0  
4  cellular           15   apr         2     -1         0  
Target (y): 0     no
1     no
2     no
3     no
4    yes
Name: y, dtype: object


In [7]:
print(type(X))
print(type(y))
print(y.shape)
print(X.shape)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(34547,)
(34547, 14)


In [None]:
#Model random forest without log transformation
# Import necessary libraries
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder




# Initialize LabelEncoder
le = LabelEncoder()

# Fit and transform the target series
y = le.fit_transform(y)



# One-hot encode categorical variables (like 'island' and 'sex')
X = pd.get_dummies(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")



In [None]:
#Model random forest with log transformation
# Import necessary libraries
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


# X = your_dataframe_with_features
# y = your_target_variable

# Add log transformation for  'campaign', and 'balance'
for feature in ['campaign', 'balance']:
    X[feature] = np.log1p(X[feature])  # log1p(x) = log(1 + x), safely handles zero values

# One-hot encode categorical variables 
X = pd.get_dummies(X)

# Initialize LabelEncoder for the target variable (y)
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


In [8]:
print(type(X))
print(type(y))
print(y.shape)
print(X.shape)


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(34547,)
(34547, 14)


# model tuning

Add gin.config file to the code


In [None]:
!rye install gin-config


# Hyperparameters for RandomForestClassifier
RandomForestClassifier.n_estimators = 100
RandomForestClassifier.max_depth = 10
RandomForestClassifier.min_samples_split = 2
RandomForestClassifier.min_samples_leaf = 1
RandomForestClassifier.max_features = 'auto'
RandomForestClassifier.bootstrap = True
RandomForestClassifier.random_state = 42

# Features to apply log transformation
log_transform_features = ['campaign', 'balance']


In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import gin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Enter interactive mode to allow re-registration of configurables
gin.enter_interactive_mode()

# Register RandomForestClassifier as a gin-configurable object
@gin.configurable
class ConfigurableRandomForestClassifier(RandomForestClassifier):
    pass

# Load the Gin config file
gin.parse_config_file('telemarketing.gin')

@gin.configurable
def load_random_forest_model(
    n_estimators=100, 
    max_depth=None, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    max_features='sqrt',  # Updated 'auto' to 'sqrt'
    bootstrap=True, 
    random_state=42):
    """
    Configurable RandomForestClassifier using gin.
    """
    return ConfigurableRandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=random_state
    )

# Assume telemarketing dataset is already defined
# X = telemarketing_dataframe
# y = target_variable

# Add log transformation for 'campaign', and 'balance'
for feature in ['campaign', 'balance']:
    X[feature] = np.log1p(X[feature])  # log1p(x) = log(1 + x), safely handles zero values

# One-hot encode categorical variables
X = pd.get_dummies(X)

# Initialize LabelEncoder for the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load RandomForest model with gin-configured hyperparameters
clf = load_random_forest_model()

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")




  result = getattr(ufunc, method)(*inputs, **kwargs)


Accuracy: 0.89


In [9]:
import pandas as pd
import numpy as np
import gin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Enter interactive mode to allow re-registration of configurables
gin.enter_interactive_mode()

# Register RandomForestClassifier as a gin-configurable object
@gin.configurable
class ConfigurableRandomForestClassifier(RandomForestClassifier):
    pass

# Load the Gin config file
gin.parse_config_file('telemarketing.gin')

@gin.configurable
def load_random_forest_model(
    n_estimators=100, 
    max_depth=None, 
    min_samples_split=2, 
    min_samples_leaf=1, 
    max_features='sqrt',  
    bootstrap=True, 
    random_state=42):
    return ConfigurableRandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=random_state
    )


# Step 1: Check and handle negative values before applying log transformation
X['campaign'] = X['campaign'].apply(lambda x: np.nan if x <= 0 else np.log1p(x))
X['balance'] = X['balance'].apply(lambda x: np.nan if x <= 0 else np.log1p(x))

# Step 2: Handle any remaining NaN or infinity values
# Option 1: Fill NaN values with 0 (or a specific value like the column median)
X.fillna(0, inplace=True)

# Option 2: Remove rows with NaN or infinity values (if desired)
# X.replace([np.inf, -np.inf], np.nan, inplace=True)
# X.dropna(inplace=True)

# One-hot encode categorical variables
X = pd.get_dummies(X)

# Initialize LabelEncoder for the target variable
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load RandomForest model with gin-configured hyperparameters
clf = load_random_forest_model()

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.89
