In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix



Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters: {'max_depth': None, 'n_estimators': 300}
Accuracy: 0.9045813586097946
Confusion matrix:
 [[5432  167]
 [ 437  294]]


In [2]:
# Read the labelled training set and the unlabelled test set straight from GitHub.
train_url = 'https://raw.githubusercontent.com/dsrscientist/dataset5/main/termdeposit_train.csv'
test_url = 'https://raw.githubusercontent.com/dsrscientist/dataset5/main/termdeposit_test.csv'
train_data, test_data = pd.read_csv(train_url), pd.read_csv(test_url)


In [3]:
# Preprocessing: the ID column carries no signal, and the yes/no target
# is recoded to 1/0.
target_codes = {'yes': 1, 'no': 0}
train_data = train_data.drop(columns=['ID'])
train_data['subscribed'] = train_data['subscribed'].map(target_codes)

In [4]:
# Separate the target column from the predictor columns.
y = train_data['subscribed']
X = train_data.drop(columns=['subscribed'])

In [5]:
# Scaler for the numeric columns (fitted in a later cell).
scaler = StandardScaler()
# Expand every categorical column into indicator columns, dropping the first
# level of each so the indicators are not perfectly collinear.
X = pd.get_dummies(data=X, drop_first=True)

In [6]:
# Columns that get standardized (zero mean, unit variance).
# NOTE(review): the scaler is fitted on ALL rows here, i.e. before the
# train/test split performed in the following cell, so hold-out statistics
# leak into the transform. Fitting on the training split only would be cleaner.
# NOTE(review): 'balance' and 'day' also look numeric in the raw data but are
# not listed here - confirm whether that is intentional.
num_cols = ['age', 'duration', 'campaign', 'pdays', 'previous']
scaled_values = scaler.fit_transform(X[num_cols])
X[num_cols] = scaled_values


In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [8]:
# Hyper-parameter grid (3 forest sizes x 4 depth limits) and the base
# estimator that the grid search will clone for each candidate.
params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20, None],
}
rf = RandomForestClassifier(random_state=42)

In [9]:
# Exhaustive 5-fold cross-validated search over `params`, using all CPU cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [10]:
# Keep the winning estimator from the search and report its hyper-parameters.
best_params = grid_search.best_params_
rf_best = grid_search.best_estimator_
print('Best parameters:', best_params)


Best parameters: {'max_depth': None, 'n_estimators': 300}


In [11]:
# Score the tuned forest on the held-out split.
y_pred = rf_best.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', acc)

Accuracy: 0.9045813586097946


In [12]:
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix:\n', cm)


Confusion matrix:
 [[5432  167]
 [ 437  294]]


In [13]:
# Predict on new data
# Build the feature matrix in a separate variable instead of mutating
# `test_data` in place: the raw frame stays intact, so re-running this cell
# neither fails on the missing 'ID' column nor scales the numeric columns twice.
test_features = test_data.drop(columns=['ID'])

# Encode all category levels (no drop_first) and then align to the columns the
# model was trained on. Calling get_dummies(drop_first=True) independently on
# the test frame would drop a different baseline level, or yield a different
# column set, whenever a category is absent from (or new in) the test file.
# Reindexing discards the training baseline levels and any unseen levels, and
# adds all-zero columns for levels that never occur in the test data.
test_features = pd.get_dummies(test_features)
test_features = test_features.reindex(columns=X_train.columns, fill_value=0)

# Apply the scaler fitted on the training features (transform only, no refit).
test_features[num_cols] = scaler.transform(test_features[num_cols])

# Predictions are 1/0, matching the recoded 'subscribed' target used in training.
test_pred = rf_best.predict(test_features)
test_pred = pd.DataFrame(test_pred, columns=['subscribed'])
test_pred.to_csv('termdeposit_predictions.csv', index=False)


In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [15]:
# Fetch a fresh, unprocessed copy of the training data for the second experiment.
train_url = 'https://raw.githubusercontent.com/dsrscientist/dataset5/main/termdeposit_train.csv'
train_data = pd.read_csv(train_url)


In [16]:
# Exploratory data analysis: plain-text preview of the first five rows.
first_rows = train_data.head(n=5)
print(first_rows)


      ID  age         job   marital  education default  balance housing loan  \
0  26110   56      admin.   married    unknown      no     1933      no   no   
1  40576   31     unknown   married  secondary      no        3      no   no   
2  15320   27    services   married  secondary      no      891     yes   no   
3  43962   57  management  divorced   tertiary      no     3287      no   no   
4  29842   31  technician   married  secondary      no      119     yes   no   

     contact  day month  duration  campaign  pdays  previous poutcome  \
0  telephone   19   nov        44         2     -1         0  unknown   
1   cellular   20   jul        91         2     -1         0  unknown   
2   cellular   18   jul       240         1     -1         0  unknown   
3   cellular   22   jun       867         1     84         3  success   
4   cellular    4   feb       380         1     -1         0  unknown   

  subscribed  
0         no  
1         no  
2         no  
3        yes  
4    

In [17]:
# Count missing entries per column (isna is the alias of isnull).
missing_per_column = train_data.isna().sum()
print(missing_per_column)


ID            0
age           0
job           0
marital       0
education     0
default       0
balance       0
housing       0
loan          0
contact       0
day           0
month         0
duration      0
campaign      0
pdays         0
previous      0
poutcome      0
subscribed    0
dtype: int64


In [18]:
# Encode categorical features
# Each string column is replaced by integer codes from its own LabelEncoder.
# The fitted encoders are kept in `label_encoders` (keyed by column name) so
# the same mapping can later be applied to unseen data or inverted; refitting
# a single shared encoder would keep only the last column's mapping.
categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
                    'loan', 'contact', 'month', 'poutcome']
label_encoders = {}
for col in categorical_cols:
    # `le` ends up bound to the 'poutcome' encoder, as it was before.
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le


In [19]:
# Target is the raw yes/no label; predictors are everything except it and the row ID.
y = train_data['subscribed']
X = train_data.drop(columns=['ID', 'subscribed'])

In [20]:
# 80/20 train/test partition with a fixed seed for reproducibility.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [21]:
# Tune the depth of a single decision tree with 5-fold cross-validation.
dt_params = {'max_depth': [3, 5, 7, 9]}
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_grid = GridSearchCV(estimator=dt_classifier, param_grid=dt_params, cv=5)
dt_grid.fit(X_train, y_train)
dt_best_params = dt_grid.best_params_
print("Best parameters for decision tree:", dt_best_params)


Best parameters for decision tree: {'max_depth': 7}


In [22]:
# Tune forest size and tree depth for a random forest with 5-fold cross-validation.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
}
rf_classifier = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(estimator=rf_classifier, param_grid=rf_params, cv=5)
rf_grid.fit(X_train, y_train)
rf_best_params = rf_grid.best_params_
print("Best parameters for random forest:", rf_best_params)


Best parameters for random forest: {'max_depth': 7, 'n_estimators': 200}


In [23]:
# Held-out performance of the tuned decision tree: accuracy, confusion
# matrix, and per-class precision/recall.
y_pred_dt = dt_grid.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_confusion = confusion_matrix(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)
print("Decision tree classifier accuracy:", dt_accuracy)
print("Decision tree classifier confusion matrix:\n", dt_confusion)
print("Decision tree classifier classification report:\n", dt_report)


Decision tree classifier accuracy: 0.9007898894154819
Decision tree classifier confusion matrix:
 [[5392  207]
 [ 421  310]]
Decision tree classifier classification report:
               precision    recall  f1-score   support

          no       0.93      0.96      0.94      5599
         yes       0.60      0.42      0.50       731

    accuracy                           0.90      6330
   macro avg       0.76      0.69      0.72      6330
weighted avg       0.89      0.90      0.89      6330



In [24]:
# Held-out performance of the tuned random forest: accuracy, confusion
# matrix, and per-class precision/recall.
y_pred_rf = rf_grid.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random forest classifier accuracy:", rf_accuracy)
print("Random forest classifier confusion matrix:\n", rf_confusion)
print("Random forest classifier classification report:\n", rf_report)


Random forest classifier accuracy: 0.9012638230647709
Random forest classifier confusion matrix:
 [[5531   68]
 [ 557  174]]
Random forest classifier classification report:
               precision    recall  f1-score   support

          no       0.91      0.99      0.95      5599
         yes       0.72      0.24      0.36       731

    accuracy                           0.90      6330
   macro avg       0.81      0.61      0.65      6330
weighted avg       0.89      0.90      0.88      6330

