In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc

# Step 1: Load the dataset
# Single source of truth for the input location; update this path as per your file location.
file_path = "C:/Users/LENOVO/Desktop/Final_Dataset.csv"
data = pd.read_csv(file_path, encoding='latin1')

# Assume the last column is the target variable; adjust if needed
target_column = data.columns[-1]

X = data.drop(columns=[target_column])  # Features
y = data[target_column]  # Target variable

# Step 2: Split the dataset (30% of the rows are held out for testing, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Preprocessing
# NOTE(review): mean imputation and scaling are applied to every feature column,
# which presumes all features are numeric -- TODO confirm against the dataset.
preprocessing_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),  # Handle missing values
    ("scaler", StandardScaler())                 # Standardize the data
])

# Fit on the training split only, then reuse the learned statistics on the test split.
X_train = preprocessing_pipeline.fit_transform(X_train)
X_test = preprocessing_pipeline.transform(X_test)

# Step 4: Define Ensemble Models
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
gb_model = GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.1)

ensemble_model = VotingClassifier(
    estimators=[('rf', rf_model), ('gb', gb_model)],
    voting='soft'
)

# Step 5: Train the Model
ensemble_model.fit(X_train, y_train)

# Step 6: Predictions and Evaluation
y_pred = ensemble_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 7: Confusion Matrix
# Passing the fitted class labels keeps rows/columns aligned with the tick labels.
class_labels = ensemble_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

# Step 8: Feature Importance (Random Forest only, for interpretability)
# VotingClassifier fits *clones* of its estimators, so `rf_model` itself stays
# unfitted; the fitted forest has to be read back from the ensemble.
fitted_rf = ensemble_model.named_estimators_['rf']
feature_importances = fitted_rf.feature_importances_
feature_names = list(X.columns)
if len(feature_names) != len(feature_importances):
    # The imputer can drop columns that are entirely missing; fall back to
    # positional names rather than mislabel the bars.
    feature_names = [f"feature_{i}" for i in range(len(feature_importances))]

plt.figure(figsize=(10, 6))
plt.barh(feature_names, feature_importances, color='skyblue')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.title('Feature Importance - Random Forest')
plt.show()

# Step 9: ROC Curve (only for binary classification)
if len(class_labels) == 2:
    # Column 1 of predict_proba corresponds to class_labels[1]; name it explicitly
    # as the positive class so non-numeric labels work as well.
    y_prob = ensemble_model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=class_labels[1])
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()
else:
    print("ROC Curve is not applicable for multiclass classification.")


In [None]:
# Step 1: Load the dataset
# All imports used by this cell are gathered here so its dependencies are visible up front.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

file_path = "C:/Users/LENOVO/Desktop/bank.csv"  # Path to your dataset
data = pd.read_csv(file_path)

# Step 2: Inspect columns
print(data.columns)

# Step 3: Data Preprocessing
target_column = 'deposit'  # The target variable column name
X = data.drop(columns=[target_column])  # Features
y = data[target_column]  # Target variable

numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Split BEFORE any statistic is learned so the test rows cannot influence the
# imputation values or the scaler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Imputation, scaling and encoding (all fitted on the training split only)
numerical_imputer = SimpleImputer(strategy='mean')               # numeric gaps -> column mean
categorical_imputer = SimpleImputer(strategy='most_frequent')    # text gaps -> most common value

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', numerical_imputer),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline([
            ('imputer', categorical_imputer),
            # Categories that only occur in the test split are ignored instead of raising.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Step 5: Build the Ensemble Model (majority vote over three classifiers)
ensemble_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('lr', LogisticRegression(max_iter=300, random_state=42))  # same settings as the later cells
    ], voting='hard'))
])

# Step 6: Train the model
ensemble_model.fit(X_train, y_train)

# Step 7: Predictions and Evaluation
y_pred = ensemble_model.predict(X_test)

# Accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

# Load dataset
file_path = r"C:/Users/LENOVO/Desktop/bank.csv"
data = pd.read_csv(file_path)

# Step 3: Data Preprocessing
target_column = 'deposit'  # Adjusted target column based on the dataset
X = data.drop(columns=[target_column])  # Features
y = data[target_column]  # Target variable

# Strip surrounding whitespace from text columns; numeric columns pass through unchanged.
X = X.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Column groups used by the preprocessing pipeline
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing pipeline for numerical and categorical features.
# Missing numeric values are imputed inside the pipeline: a frame-wide
# `X.mean()` fails on text columns in recent pandas, and it would also take
# the means from rows that later end up in the test split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        # Categories seen only at prediction time are ignored instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Step 5: Build and Train Ensemble Model
# Soft voting is required because the ROC section below calls predict_proba,
# which a hard-voting VotingClassifier does not provide.
ensemble_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('lr', LogisticRegression(max_iter=300, random_state=42))  # Increased iterations
    ], voting='soft'))
])

# Step 6: Train the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
ensemble_model.fit(X_train, y_train)

# Step 7: Predictions and Evaluation
y_pred = ensemble_model.predict(X_test)

# Accuracy and Classification Report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve
# Column 1 of predict_proba belongs to classes_[1]; that label is passed as
# pos_label because roc_curve cannot infer the positive class from text labels.
positive_label = ensemble_model.classes_[1]
y_pred_prob = ensemble_model.predict_proba(X_test)[:, 1]  # Probabilities for the 'yes' class
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=positive_label)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

# Step 1: Load the dataset
file_path = r"C:/Users/LENOVO/Desktop/bank.csv"  # Adjust this path
data = pd.read_csv(file_path)

# Step 2: Check for missing values or any other issues
print("Missing values per column:\n", data.isnull().sum())

# Clean the data: strip whitespace from string columns (numeric columns are untouched).
# Missing numeric values are handled by the imputer inside the pipeline below;
# a frame-wide `data.mean()` fails on text columns in recent pandas and would
# also use test rows to compute the fill values.
data = data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Step 3: Preprocessing
target_column = 'deposit'  # This is the column to predict

# Features and target variable
X = data.drop(columns=[target_column])
y = data[target_column]

# Split into categorical and numerical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Step 4: Build the Ensemble Model
# Soft voting is required because the ROC section below calls predict_proba,
# which a hard-voting VotingClassifier does not provide.
ensemble_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('lr', LogisticRegression(max_iter=300, random_state=42))  # Increased iterations for LR
    ], voting='soft'))
])

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train the Ensemble Model
ensemble_model.fit(X_train, y_train)

# Step 7: Model Evaluation
y_pred = ensemble_model.predict(X_test)

# Accuracy and Classification Report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix Visualization
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve and AUC
# Column 1 of predict_proba belongs to classes_[1]; that label is passed as
# pos_label because roc_curve cannot infer the positive class from text labels.
positive_label = ensemble_model.classes_[1]
y_pred_prob = ensemble_model.predict_proba(X_test)[:, 1]  # Probabilities for the 'yes' class
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=positive_label)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc

# Step 1: Load the dataset
file_path = r"C:/Users/LENOVO/Desktop/bank.csv"  # Adjust this path
data = pd.read_csv(file_path)

# Step 2: Check for missing values or any other issues
print("Missing values per column:\n", data.isnull().sum())

# Step 3: Preprocessing
target_column = 'deposit'  # This is the column to predict

# Features and target variable
X = data.drop(columns=[target_column])
y = data[target_column]

# Split into categorical and numerical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Step 4: Build the Ensemble Model
# Soft voting (averaged class probabilities) is required here: the ROC section
# below calls predict_proba, which is unavailable when voting='hard'.
ensemble_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('lr', LogisticRegression(max_iter=300, random_state=42))  # Increased iterations for LR
    ], voting='soft'))
])

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train the Ensemble Model
ensemble_model.fit(X_train, y_train)

# Step 7: Model Evaluation
y_pred = ensemble_model.predict(X_test)

# Accuracy and Classification Report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix Visualization
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve and AUC
# Column 1 of predict_proba belongs to classes_[1]; that label is passed as
# pos_label because roc_curve cannot infer the positive class from text labels.
positive_label = ensemble_model.classes_[1]
y_pred_prob = ensemble_model.predict_proba(X_test)[:, 1]  # Probabilities for the 'yes' class
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=positive_label)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc

# Step 1: Load the dataset
file_path = r"C:/Users/LENOVO/Desktop/bank.csv"  # Adjust this path
data = pd.read_csv(file_path)

# Step 2: Check for missing values or any other issues
print("Missing values per column:\n", data.isnull().sum())

# Step 3: Preprocessing
target_column = 'deposit'  # This is the column to predict

# Features and target variable
X = data.drop(columns=[target_column])
y = data[target_column]

# Split into categorical and numerical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Step 4: Build the Ensemble Model
ensemble_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('lr', LogisticRegression(max_iter=300, random_state=42))  # Increased iterations for LR
    ], voting='soft'))  # Use soft voting
])

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train the Ensemble Model
ensemble_model.fit(X_train, y_train)

# Step 7: Model Evaluation
y_pred = ensemble_model.predict(X_test)

# Accuracy and Classification Report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix Visualization
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve and AUC
# Column 1 of predict_proba belongs to classes_[1]; that label is passed as
# pos_label because roc_curve cannot infer the positive class from text labels.
positive_label = ensemble_model.classes_[1]
y_pred_prob = ensemble_model.predict_proba(X_test)[:, 1]  # Probabilities for the 'yes' class
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=positive_label)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load the bank dataset from disk
file_path = r"C:/Users/LENOVO/Desktop/bank.csv"  # Adjust this path
data = pd.read_csv(file_path)

# Report the number of missing entries in each column
print("Missing values per column:\n", data.isnull().sum())

# Separate the label column from the feature columns
target_column = 'deposit'  # This is the column to predict
y = data[target_column]
X = data.drop(columns=[target_column])

# Column groups, selected by dtype
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns are standardized; text columns are one-hot encoded
column_steps = [
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Three classifiers combined by soft voting
voters = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('lr', LogisticRegression(max_iter=300, random_state=42)),  # Increased iterations for LR
]
soft_voter = VotingClassifier(estimators=voters, voting='soft')
ensemble_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', soft_voter),
])

# Hold out 20% of the rows, fit on the rest, then predict the held-out rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
ensemble_model.fit(X_train, y_train)
y_pred = ensemble_model.predict(X_test)

# Text metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)
class_ticks = ['No', 'Yes']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


In [None]:
# Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Step 1: Load the dataset
file_path = r"C:/Users/LENOVO/Desktop/bank.csv"  # Adjust this path
data = pd.read_csv(file_path)

# Step 2: Check for missing values or any other issues
missing_values = data.isnull().sum()
print("Missing values per column before handling:\n", missing_values)

# Print columns that had missing values
columns_with_missing = missing_values[missing_values > 0]
if not columns_with_missing.empty:
    print("\nColumns with missing values before setting to zero:")
    print(columns_with_missing)

# Step 3: Preprocessing
target_column = 'deposit'  # This is the column to predict

# Features and target variable
X = data.drop(columns=[target_column])
y = data[target_column]

# Split into categorical and numerical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Step 4: Handle missing values in the features.
# The fill is applied to X itself: X is a separate frame produced by drop(),
# so filling `data` after X was created would leave X unchanged.
# Numeric gaps become 0; text gaps get a string placeholder so the column stays
# uniformly string-typed for the one-hot encoder.
fill_values = {column: 0 for column in numerical_features}
fill_values.update({column: 'missing' for column in categorical_features})
X = X.fillna(value=fill_values)

# Preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Step 5: Build the Ensemble Model
ensemble_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('lr', LogisticRegression(max_iter=300, random_state=42))  # Increased iterations for LR
    ], voting='soft'))  # Use soft voting
])

# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Train the Ensemble Model
ensemble_model.fit(X_train, y_train)

# Step 8: Model Evaluation
y_pred = ensemble_model.predict(X_test)

# Accuracy and Classification Report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix Visualization
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.utils import resample

# Load the dataset
file_path = r"C:/Users/LENOVO/Desktop/bank.csv"  # Adjust this path
data = pd.read_csv(file_path)

# Check for missing values
print("Missing values per column:\n", data.isnull().sum())

# Preprocessing
target_column = 'deposit'  # This is the column to predict

# Features and target variable
X = data.drop(columns=[target_column])
y = data[target_column]

# Split into categorical and numerical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features),
                  ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Ensemble Model Setup
ensemble_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42, class_weight='balanced')),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('lr', LogisticRegression(max_iter=300, random_state=42))], voting='soft'))
])

# Step 1: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Hyperparameter Tuning using GridSearchCV
# Keys follow the <pipeline step>__<voter name>__<parameter> convention.
# 2 * 3 * 2 * 2 * 3 = 72 combinations, each evaluated with 5-fold CV.
param_grid = {
    'classifier__rf__n_estimators': [100, 200],
    'classifier__rf__max_depth': [10, 20, None],
    'classifier__gb__learning_rate': [0.01, 0.1],
    'classifier__gb__n_estimators': [100, 200],
    'classifier__lr__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(ensemble_model, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Best parameters from grid search
print("Best parameters from GridSearchCV:\n", grid_search.best_params_)

# Step 3: Model Evaluation
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Accuracy and Classification Report
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix Visualization
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# ROC Curve and AUC
# Column 1 of predict_proba belongs to classes_[1]; that label is passed as
# pos_label because roc_curve cannot infer the positive class from text labels.
positive_label = best_model.classes_[1]
y_pred_prob = best_model.predict_proba(X_test)[:, 1]  # Probabilities for the 'yes' class
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob, pos_label=positive_label)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Cross-Validation Score
# NOTE(review): this runs on the full data set, i.e. it includes the rows held
# out as the test split above -- read it as a stability check, not as an
# independent estimate.
cv_score = cross_val_score(best_model, X, y, cv=5)
print("Cross-validation Accuracy:", cv_score.mean())


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc,
)

# Step 1: Load the dataset
file_path = r"C:/Users/LENOVO/Desktop/Final_Dataset.csv"  # Update this path as per your file location
# The other cells that read this file pass latin1; do the same here for consistency.
data = pd.read_csv(file_path, encoding="latin1")

# Assume the last column is the target variable; adjust if needed
target_column = data.columns[-1]

X = data.drop(columns=[target_column])  # Features
y = data[target_column]  # Target variable

# Step 2: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Preprocessing for numeric and categorical features
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler())
        ]), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# Step 4: Define Models
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
gb_model = GradientBoostingClassifier(random_state=42, n_estimators=100, learning_rate=0.1)

ensemble_model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", VotingClassifier(
        estimators=[("rf", rf_model), ("gb", gb_model)],
        voting="soft"
    ))
])

# Step 5: Train the Model
ensemble_model.fit(X_train, y_train)

# Step 6: Predictions and Evaluation
y_pred = ensemble_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
class_labels = ensemble_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

# Feature Importance (Random Forest only, for interpretability)
# Read the forest that was actually trained inside the ensemble instead of
# refitting the preprocessor and a second forest just for this plot.
fitted_preprocessor = ensemble_model.named_steps["preprocessor"]
fitted_rf = ensemble_model.named_steps["classifier"].named_estimators_["rf"]
feature_importances = fitted_rf.feature_importances_

# Output columns are the numeric features followed by the one-hot columns.
feature_names = list(numeric_features)
if len(categorical_features) > 0:
    feature_names += list(
        fitted_preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_features)
    )

plt.figure(figsize=(10, 6))
plt.barh(feature_names, feature_importances, color="skyblue")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.title("Feature Importance - Random Forest")
plt.show()

# ROC Curve (only for binary classification)
if len(class_labels) == 2:
    # Go through the whole pipeline so X_test is preprocessed first; calling the
    # classifier step directly would hand it the raw, unencoded columns.
    y_prob = ensemble_model.predict_proba(X_test)[:, 1]
    # Column 1 corresponds to class_labels[1]; name it as the positive class so
    # non-numeric labels are handled too.
    fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=class_labels[1])
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()
else:
    print("ROC Curve is not applicable for multiclass classification.")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc,
)

# Step 1: Load the dataset
file_path = r"C:/Users/LENOVO/Desktop/Final_Dataset.csv"  # Update this path as per your file location

# Read the CSV as Latin-1 first. Latin-1 assigns a character to every possible
# byte value, so decoding is not expected to fail here; the handler below is a
# defensive fallback only.
try:
    data = pd.read_csv(file_path, encoding="latin1")  # Adjust encoding if needed
except UnicodeDecodeError:
    print("UnicodeDecodeError encountered. Trying another encoding...")
    # read_csv has no `errors` keyword (passing it raises TypeError). The
    # decoder's error policy is selected with `encoding_errors`
    # (available in pandas >= 1.3); "replace" substitutes undecodable bytes.
    data = pd.read_csv(file_path, encoding="utf-8", encoding_errors="replace")

# The label is taken to be whichever column comes last in the file; change
# `target_column` if your dataset is laid out differently.
target_column = data.columns[-1]
y = data[target_column]                 # labels
X = data.drop(columns=[target_column])  # every other column is a feature

# Step 2: Hold out 30% of the rows for testing (seeded so the split repeats)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 3: Column-wise preprocessing
# Columns are routed by dtype: int64/float64 go through imputation + scaling,
# object columns are one-hot encoded.
categorical_features = X.select_dtypes(include=["object"]).columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),  # fill gaps with the column mean
    ("scaler", StandardScaler()),                 # zero mean, unit variance
])
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_encoder, categorical_features),
])

# Step 4: Base learners and the soft-voting ensemble that combines them
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
soft_voter = VotingClassifier(
    estimators=[("rf", rf_model), ("gb", gb_model)],
    voting="soft",
)

ensemble_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", soft_voter),
])

# Step 5: Fit preprocessing and ensemble together on the training rows
ensemble_model.fit(X_train, y_train)

# Step 6: Predictions and Evaluation
y_pred = ensemble_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
# from_predictions computes the matrix and takes its tick labels from the
# labels found in y_test / y_pred, so the axes show the real class names
# instead of positional indices 0..n-1 (requires scikit-learn >= 1.0).
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap="Blues")
cm = disp.confusion_matrix  # raw counts, kept available under the usual name
disp.ax_.set_title("Confusion Matrix")
plt.show()

# Feature Importance (Random Forest only, for interpretability)
# A stand-alone forest is fitted on the preprocessed training rows.
# preprocessor.fit_transform re-fits the ColumnTransformer on X_train, which
# also makes its fitted sub-transformers available for naming below.
X_train_encoded = preprocessor.fit_transform(X_train)
rf_model.fit(X_train_encoded, y_train)
feature_importances = rf_model.feature_importances_

# Output column order follows the transformer order: numeric columns first,
# then the one-hot columns.
feature_names = list(numeric_features)
if len(categorical_features) > 0:
    # With no categorical columns the "cat" encoder is never fitted, and asking
    # it for feature names would raise NotFittedError -- hence the guard.
    feature_names += list(
        preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_features)
    )

# Fail with a readable message if names and importances are out of step (for
# instance when the imputer discards a column that is entirely missing).
if len(feature_names) != len(feature_importances):
    raise ValueError(
        f"Feature name count ({len(feature_names)}) does not match the number "
        f"of importances ({len(feature_importances)})."
    )

plt.figure(figsize=(10, 6))
plt.barh(feature_names, feature_importances, color="skyblue")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.title("Feature Importance - Random Forest")
plt.show()

# ROC Curve (only for binary classification)
if len(np.unique(y)) == 2:
    # Score through the full pipeline: X_test holds raw, un-encoded columns, so
    # calling the "classifier" step directly would skip the preprocessor and
    # hand it data it was not trained on.
    y_prob = ensemble_model.predict_proba(X_test)[:, 1]
    # Column 1 of predict_proba corresponds to classes_[1]; naming it as the
    # positive label lets roc_curve work with non-numeric labels too.
    positive_class = ensemble_model.classes_[1]
    fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=positive_class)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance line
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()
else:
    print("ROC Curve is not applicable for multiclass classification.")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    auc,
)

# Step 1: Load the dataset
file_path = r"C:/Users/LENOVO/Desktop/Final_Dataset.csv"  # Update this path as per your file location

# Try pandas' default (UTF-8) decoding first. If the file is not valid UTF-8,
# fall back to Latin-1, the encoding the other cells of this notebook use when
# reading this same file.
try:
    data = pd.read_csv(file_path)
except UnicodeDecodeError:
    data = pd.read_csv(file_path, encoding="latin1")

# Assume the last column is the target variable; adjust if needed
target_column = data.columns[-1]

X = data.drop(columns=[target_column])  # Features
y = data[target_column]  # Target variable

# Step 2: Check if the target variable is categorical (for classification)
# Only a genuinely numeric target with more distinct values than there are bins
# is treated as continuous and discretised. Anything else is used as-is:
#   * non-numeric dtypes (object, category, string) and booleans are already
#     class labels, and pd.cut cannot bin them;
#   * a numeric target with only a few distinct values (e.g. 0/1) is already a
#     set of class labels, so re-binning it would only rename or merge classes.
TARGET_BIN_LABELS = ['Low', 'Medium', 'High']
target_is_numeric = (
    pd.api.types.is_numeric_dtype(y) and not pd.api.types.is_bool_dtype(y)
)
if target_is_numeric and y.nunique() > len(TARGET_BIN_LABELS):
    print("Target is continuous, binning into categories.")
    # Equal-width bins over the observed range, one per label.
    y = pd.cut(y, bins=len(TARGET_BIN_LABELS), labels=TARGET_BIN_LABELS)

# Step 3: Hold out 30% of the rows for testing (seeded so the split repeats)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Step 4: Column-wise preprocessing
# Columns are routed by dtype: int64/float64 go through imputation + scaling,
# object columns are one-hot encoded.
categorical_features = X.select_dtypes(include=["object"]).columns
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),  # fill gaps with the column mean
    ("scaler", StandardScaler()),                 # zero mean, unit variance
])
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_features),
    ("cat", categorical_encoder, categorical_features),
])

# Step 5: Base classifiers and the soft-voting ensemble that combines them
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
soft_voter = VotingClassifier(
    estimators=[("rf", rf_model), ("gb", gb_model)],
    voting="soft",
)

ensemble_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", soft_voter),
])

# Step 6: Fit preprocessing and ensemble together on the training rows
ensemble_model.fit(X_train, y_train)

# Step 7: Predictions and Evaluation
y_pred = ensemble_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
# from_predictions computes the matrix and takes its tick labels from the
# labels found in y_test / y_pred, so the axes show the real class names
# instead of positional indices 0..n-1 (requires scikit-learn >= 1.0).
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap="Blues")
cm = disp.confusion_matrix  # raw counts, kept available under the usual name
disp.ax_.set_title("Confusion Matrix")
plt.show()

# Feature Importance (Random Forest only, for interpretability)
# A stand-alone forest is fitted on the preprocessed training rows.
# preprocessor.fit_transform re-fits the ColumnTransformer on X_train, which
# also makes its fitted sub-transformers available for naming below.
X_train_encoded = preprocessor.fit_transform(X_train)
rf_model.fit(X_train_encoded, y_train)
feature_importances = rf_model.feature_importances_

# Output column order follows the transformer order: numeric columns first,
# then the one-hot columns.
feature_names = list(numeric_features)
if len(categorical_features) > 0:
    # With no categorical columns the "cat" encoder is never fitted, and asking
    # it for feature names would raise NotFittedError -- hence the guard.
    feature_names += list(
        preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_features)
    )

# Fail with a readable message if names and importances are out of step (for
# instance when the imputer discards a column that is entirely missing).
if len(feature_names) != len(feature_importances):
    raise ValueError(
        f"Feature name count ({len(feature_names)}) does not match the number "
        f"of importances ({len(feature_importances)})."
    )

plt.figure(figsize=(10, 6))
plt.barh(feature_names, feature_importances, color="skyblue")
plt.xlabel("Importance")
plt.ylabel("Features")
plt.title("Feature Importance - Random Forest")
plt.show()

# ROC Curve (only for binary classification)
if len(np.unique(y)) == 2:
    # Score through the full pipeline: X_test holds raw, un-encoded columns, so
    # calling the "classifier" step directly would skip the preprocessor and
    # hand it data it was not trained on.
    y_prob = ensemble_model.predict_proba(X_test)[:, 1]
    # Column 1 of predict_proba corresponds to classes_[1]; naming it as the
    # positive label lets roc_curve work with non-numeric labels too.
    positive_class = ensemble_model.classes_[1]
    fpr, tpr, thresholds = roc_curve(y_test, y_prob, pos_label=positive_class)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (area = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")  # chance line
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")
    plt.show()
else:
    print("ROC Curve is not applicable for multiclass classification.")


In [None]:
import pandas as pd

# Where the bank CSV lives on disk
data_path = "C:/Users/LENOVO/Desktop/bank.csv"

# Read it into a DataFrame
data = pd.read_csv(data_path)

# Quick look: dimensions, column index, then the leading rows
print(f"Dataset Shape: {data.shape!s}")
print(f"Dataset Columns: {data.columns!s}")
print("First few rows of the dataset:", data.head(), sep="\n")


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

# Step 1: Read the bank CSV into a DataFrame
file_path = r"C:\Users\LENOVO\Desktop\bank.csv"  # Update this path with your actual file location
data = pd.read_csv(file_path)

# Step 2: Quick look -- dimensions, column index, then the leading rows
print(f"Dataset Shape: {data.shape!s}")
print(f"Dataset Columns: {data.columns!s}")
print("First few rows of the dataset:", data.head(), sep="\n")

# Step 3: Data Preprocessing
# The target is the term-deposit subscription flag (yes/no). Its column is
# looked up under 'y' first and then under 'deposit', the name another cell of
# this notebook uses for the same file.
TARGET_CANDIDATES = ('y', 'deposit')
target_column = next((name for name in TARGET_CANDIDATES if name in data.columns), None)
if target_column is None:
    raise KeyError(
        f"None of the expected target columns {TARGET_CANDIDATES} were found. "
        f"Available columns: {list(data.columns)}"
    )
X = data.drop(columns=[target_column])  # Features
y = data[target_column]  # Target variable

# Handling missing data (if any)
# No imputation is done up front. Missing values are filled by the imputers
# inside the preprocessing pipelines below, for two reasons:
#   * SimpleImputer.fit_transform returns a plain NumPy array, which loses the
#     column names the ColumnTransformer uses to select columns;
#   * fitting an imputer on all rows before the split would let test-set
#     statistics leak into training.

# Step 4: Encode categorical features and scale numerical features
# Define the numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Preprocessing for numerical data (imputation + scaling)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Imputation
    ('scaler', StandardScaler())  # Standardization (scaling)
])

# Preprocessing for categorical data (imputation + one-hot encoding)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Imputation
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
])

# Combine preprocessing for both numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Step 5: Split the data into training and testing sets
# The DataFrame itself is split so that column names survive into the pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 6: Assemble the hard-voting ensemble
# Two tree-based learners, each seeded for repeatability
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
gb_classifier = GradientBoostingClassifier(random_state=42)

# Majority vote over the two learners
hard_voter = VotingClassifier(
    estimators=[('rf', rf_classifier), ('gb', gb_classifier)],
    voting='hard',
)

# Preprocessing and voting chained into a single estimator
ensemble_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', hard_voter),
])

# Step 7: Fit on the training split
ensemble_model.fit(X_train, y_train)

# Step 8: Predict the held-out rows
y_pred = ensemble_model.predict(X_test)

# Step 9: Report accuracy and per-class metrics
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Accuracy Score: ", test_accuracy)
print("\nClassification Report:")
print(report_text)

# Optional: Hyperparameter tuning or feature selection can be added to improve performance


In [None]:
# Print column names to inspect
# NOTE: this cell does not load anything itself; it relies on `data` already
# being defined by a previously executed cell in the same kernel session.
print(data.columns)


In [None]:
# Step 1: Read the bank dataset from disk
import pandas as pd

file_path = "C:/Users/LENOVO/Desktop/bank.csv"  # Path to your dataset
data = pd.read_csv(file_path)

# Step 2: Show the column index
print(data.columns)

# Step 3: Separate the label from the features
target_column = 'deposit'  # name of the label column
y = data[target_column]                 # labels
X = data.drop(columns=[target_column])  # every other column is a feature

# Handling missing data (if any)
from sklearn.impute import SimpleImputer

# Step 4: Feature Scaling and Encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split

# Columns are routed by dtype: int64/float64 are numeric, object is categorical
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Split data into training and testing sets
# The split happens before any statistics are learned, so imputation values,
# scaling parameters and the category vocabulary come from training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numerical columns: fill gaps with the column mean, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' encodes a category seen only at prediction
# time as all zeros instead of raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessor for imputing, scaling numerical features and encoding categorical features
# Imputation lives inside the preprocessor (rather than being applied to X up
# front) so it is fitted together with the model and X is left unmodified.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Step 5: Build and Train Ensemble Model
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# LogisticRegression is provided by sklearn.linear_model; importing it from
# sklearn.ensemble raises ImportError.
from sklearn.linear_model import LogisticRegression

# Preprocessing followed by a majority ("hard") vote over three classifiers
ensemble_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('lr', LogisticRegression())
    ], voting='hard'))
])

# Step 6: Train the model
ensemble_model.fit(X_train, y_train)

# Step 7: Score the held-out rows and report the metrics
from sklearn.metrics import accuracy_score, classification_report

y_pred = ensemble_model.predict(X_test)

# Overall accuracy first, then the per-class breakdown
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)


In [None]:























































































































































































































































































# Print column names to inspect
# NOTE: this cell does not load anything itself; it relies on `data` already
# being defined by a previously executed cell in the same kernel session.
print(data.columns)
