In [None]:
!pip install ucimlrepo
!pip install nltk

In [None]:
# Import necessary libraries
from ucimlrepo import fetch_ucirepo

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import skew
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset (id=222) from the UCI Machine Learning Repository.
bank_marketing = fetch_ucirepo(id=222)

# Features and targets are exposed as pandas DataFrames.
# `data` is filled in place by the cleaning cells further down, so work on a
# copy: the frame held by `bank_marketing` stays untouched and the column
# assignments never write into a slice of another DataFrame.
data = bank_marketing.data.features.copy()
data_targets = bank_marketing.data.targets
data.head()

# Exploring the data

In [None]:
# Column names, non-null counts and dtypes of the feature frame.
data.info()

In [None]:
# Dataset-level metadata attached to the fetched repository object.
print(bank_marketing.metadata)

In [None]:
# Per-variable information attached to the fetched repository object.
print(bank_marketing.variables)

# Fields and Data Types

# Understanding Data

In [None]:
# Comprehensive dataset overview
def dataset_overview(df):
    """Print a plain-text profile of `df`.

    Emits, in order: a banner line, the frame's shape, the per-column dtypes,
    the per-column null counts, `describe(include='all')` and the number of
    distinct values in every column. Nothing is returned.
    """
    print(f"\n{' Dataset Overview ':=^80}")
    print(f"Shape: {df.shape}")

    def _unique_counts():
        # One row per column of `df`, holding that column's distinct-value count.
        counts = {}
        for name in df.columns:
            counts[name] = df[name].nunique()
        return pd.DataFrame(counts, index=['Unique Count']).T

    # Each section is computed right before it is printed, so the output
    # appears heading by heading in the listed order.
    sections = (
        ("\nData Types:", lambda: df.dtypes),
        ("\nMissing Values:", lambda: df.isnull().sum()),
        ("\nDescriptive Statistics:", lambda: df.describe(include='all')),
        ("\nUnique Counts:", _unique_counts),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())
# Print the overview for the feature frame.
dataset_overview(data)

In [None]:
# Display the column names of the feature frame (as an array of labels)
data.columns.values

# day_of_week and month **

In [None]:
# Count the occurrences of each month in the dataset
data.month.value_counts()

# ** married = 3 divorced and 2 single

In [None]:
# Preview the first rows of the feature frame.
data.head()

In [None]:
# Bar chart of how many rows fall on each `day_of_week` value.
# The title previously read "Number of Countries in the Dataset", which does
# not describe what is plotted; it now names the counted column.
plt.figure(figsize=(10,6))
sns.countplot(x='day_of_week',data=data, palette='viridis')
plt.xticks(rotation=90)
plt.title("Number of Records per day_of_week")
plt.xlabel("day_of_week")
plt.ylabel("Count")
plt.show()

# Missing values and negative quantities

In [None]:
# Number of missing (NaN) values in each column
print(data.isnull().sum())

In [None]:
# (rows, columns) of the feature frame.
data.shape

In [None]:
# Distinct values of the `contact` column (a NaN, if present, is listed too).
data["contact"].unique()

In [None]:
# replace NaN values
# data["contact"] = data["contact"].fillna("Others")
# print(data["contact"])

In [None]:
# Distinct values of the four categorical columns inspected for gaps,
# printed in the order: job, education, poutcome, contact.
for column in ("job", "education", "poutcome", "contact"):
    print(data[column].unique())

In [None]:
def replaceNaNvaluesOther(data: pd.DataFrame, column_name: str) -> pd.Series:
    """Fill the missing values of one column with the string "other".

    The column is overwritten inside `data` itself (the caller's frame is
    modified) and the updated column is returned.

    Parameters:
        data: DataFrame to update in place.
        column_name: Name of the column whose NaN values are replaced.

    Returns:
        The updated column, or an empty Series when `column_name` is not a
        column of `data` (a message is printed in that case).
    """
    # Guard clause: unknown column -> report it and hand back an empty Series.
    if column_name not in data.columns:
        print(f"La columna '{column_name}' no existe en el DataFrame.")
        return pd.Series()

    filled = data[column_name].fillna("other")
    data[column_name] = filled
    return data[column_name]

In [None]:
# Fill the gaps of each categorical column with "other" and print the
# resulting column, preceded by a 50-character "=" separator line.
# Order: job, education, poutcome, contact.
for column in ("job", "education", "poutcome", "contact"):
    print("=".center(50, "="))
    replaceNaNvaluesOther(data, column)
    print(data[column])

In [None]:
# Re-check the missing values per column after the "other" fill above
print(data.isnull().sum())

In [None]:
# Preview the first rows again, now that the categorical gaps are filled.
data.head()

# EDA (Exploratory Data Analysis)

In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
import numpy as np
from wordcloud import WordCloud
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def _label_axis(ax, title, xlabel, ylabel, grid_axis='both'):
    """Apply the shared title, axis labels and dashed grid to one Matplotlib panel."""
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True, axis=grid_axis, linestyle='--', alpha=0.6)


def _show_plotly_histogram(values, nbins, color, title, xaxis_title, yaxis_title):
    """Build and display one interactive Plotly histogram."""
    fig = go.Figure(data=[go.Histogram(x=values, nbinsx=nbins, marker_color=color)])
    fig.update_layout(
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        template='plotly_white'
    )
    fig.show()


def _show_plotly_bar(x, y, color, title, xaxis_title, yaxis_title):
    """Build and display one interactive single-series Plotly bar chart."""
    fig = go.Figure(data=[go.Bar(x=x, y=y, marker_color=color)])
    fig.update_layout(
        title=title,
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
        template='plotly_white'
    )
    fig.show()


def analyze_month_adapted(data, month, color, month_column='month'):
    """
    Analyzes customer data for a specific month with visualizations (excluding NLP).

    A static Matplotlib/Seaborn grid of panels is drawn first, followed by one
    interactive Plotly figure per panel. Every figure is displayed; nothing is
    returned.

    Parameters:
        data: DataFrame containing the customer dataset. Besides
            `month_column`, the columns 'age', 'marital', 'education',
            'balance', 'housing', 'loan', 'contact', 'day_of_week',
            'duration', 'campaign' and 'poutcome' are read.
        month: Name of the month to analyze (value matched in `month_column`).
        color: Primary color for the single-series visualizations.
        month_column: Name of the column containing the month information.

    Returns:
        None. When no row matches `month`, a message is printed and no figure
        is produced.

    Raises:
        KeyError: if `month_column` or one of the columns listed above is
            missing from `data`.
    """
    # Filter data for the specified month
    monthly_data = data[data[month_column] == month].copy()
    if monthly_data.empty:
        print(f"No data available for the month: {month}")
        return

    # Category counts are computed once and reused by both the static and the
    # interactive charts.
    marital_counts = monthly_data['marital'].value_counts()
    education_counts = monthly_data['education'].value_counts()
    contact_counts = monthly_data['contact'].value_counts()
    dow_counts = monthly_data['day_of_week'].value_counts().sort_index()
    poutcome_counts = monthly_data['poutcome'].value_counts()
    # Rows: housing loan value, columns: personal loan value. fill_value=0
    # keeps combinations that do not occur in this month as 0 instead of NaN.
    loan_housing = monthly_data.groupby(['housing', 'loan']).size().unstack(fill_value=0)

    # ------------------------------------------------------------------
    # Static dashboard (Matplotlib / Seaborn)
    # ------------------------------------------------------------------
    background_color = "#f8fafc"
    plt.style.use('seaborn-v0_8-whitegrid')

    # One title cell plus ten charts need eleven axes, so the grid is 6x2
    # (a 5x2 grid only provides indices 0-9); the twelfth cell is hidden.
    n_rows, n_cols = 6, 2
    fig = plt.figure(figsize=(24, 30))
    gs = fig.add_gridspec(n_rows, n_cols)
    gs.update(wspace=0.5, hspace=0.8)

    axes = [fig.add_subplot(gs[i, j]) for i in range(n_rows) for j in range(n_cols)]
    for ax in axes:
        ax.set_facecolor(background_color)

    # Main title
    axes[0].axis('off')
    axes[0].text(0.5, 0.5,
                    f'{month} Customer Analysis\n_________________________',
                    horizontalalignment='center',
                    verticalalignment='center',
                    fontsize=18, fontweight='bold',
                    fontfamily='serif',
                    color="#000000")

    # 1. Distribution of Age
    sns.histplot(ax=axes[1], data=monthly_data, x='age', bins=30, kde=True, color=color)
    _label_axis(axes[1], f'{month} Age Distribution', 'Age', 'Number of Customers')

    # 2. Marital Status Distribution
    axes[2].pie(marital_counts, labels=marital_counts.index, autopct='%1.1f%%', startangle=140,
                colors=sns.color_palette(n_colors=len(marital_counts)))
    axes[2].set_title(f'{month} Marital Status Distribution', fontsize=14, fontweight='bold')

    # 3. Education Level Distribution
    sns.barplot(ax=axes[3], x=education_counts.index, y=education_counts.values, color=color)
    _label_axis(axes[3], f'{month} Education Level Distribution', 'Education Level',
                'Number of Customers', grid_axis='y')
    axes[3].tick_params(axis='x', labelrotation=45)

    # 4. Balance Distribution
    sns.histplot(ax=axes[4], data=monthly_data, x='balance', bins=50, kde=True, color=color)
    _label_axis(axes[4], f'{month} Balance Distribution', 'Account Balance', 'Number of Customers')

    # 5. Housing Loan vs Personal Loan
    loan_housing.plot(kind='bar', stacked=True, ax=axes[5], color=sns.color_palette()[:2])
    _label_axis(axes[5], f'{month} Housing Loan vs Personal Loan', 'Housing Loan',
                'Number of Customers', grid_axis='y')
    axes[5].tick_params(axis='x', labelrotation=0)
    axes[5].legend(title='Personal Loan')

    # 6. Contact Method Distribution
    sns.barplot(ax=axes[6], x=contact_counts.index, y=contact_counts.values, color=color)
    _label_axis(axes[6], f'{month} Contact Method Distribution', 'Contact Method',
                'Number of Customers', grid_axis='y')

    # 7. Day of the Week Distribution
    sns.barplot(ax=axes[7], x=dow_counts.index, y=dow_counts.values, color=color)
    _label_axis(axes[7], f'{month} Day of the Week of Contact', 'Day of the Week',
                'Number of Contacts', grid_axis='y')

    # 8. Duration of Calls Distribution
    sns.histplot(ax=axes[8], data=monthly_data, x='duration', bins=50, kde=True, color=color)
    _label_axis(axes[8], f'{month} Call Duration Distribution', 'Call Duration (seconds)',
                'Number of Calls')

    # 9. Campaign Interactions Distribution
    sns.histplot(ax=axes[9], data=monthly_data, x='campaign', bins=20, kde=True, color=color)
    _label_axis(axes[9], f'{month} Campaign Interactions', 'Number of Campaign Contacts',
                'Number of Customers')

    # 10. Previous Contacts Outcome
    sns.barplot(ax=axes[10], x=poutcome_counts.index, y=poutcome_counts.values, color=color)
    _label_axis(axes[10], f'{month} Previous Campaign Outcome', 'Previous Outcome',
                'Number of Customers', grid_axis='y')

    # The last grid cell has no chart; hide it instead of showing empty axes.
    axes[11].axis('off')

    plt.tight_layout()
    plt.show()

    # ------------------------------------------------------------------
    # Interactive Visualizations with Plotly
    # ------------------------------------------------------------------
    # Plotly expects color strings; seaborn palettes hold RGB float tuples,
    # so they are converted to hex codes before being handed over.
    palette_hex = list(sns.color_palette().as_hex())
    marital_hex = list(sns.color_palette(n_colors=len(marital_counts)).as_hex())

    # 1. Age Distribution (Interactive Histogram)
    _show_plotly_histogram(monthly_data['age'], 30, color,
                           f'{month} Age Distribution', 'Age', 'Number of Customers')

    # 2. Marital Status Distribution (Interactive Pie Chart)
    fig_pie_marital = go.Figure(data=[go.Pie(labels=marital_counts.index,
                                             values=marital_counts.values,
                                             marker_colors=marital_hex)])
    fig_pie_marital.update_layout(
        title=f'{month} Marital Status Distribution',
        template='plotly_white'
    )
    fig_pie_marital.show()

    # 3. Education Level Distribution (Interactive Bar Chart)
    _show_plotly_bar(education_counts.index, education_counts.values, color,
                     f'{month} Education Level Distribution', 'Education Level',
                     'Number of Customers')

    # 4. Balance Distribution (Interactive Histogram)
    _show_plotly_histogram(monthly_data['balance'], 50, color,
                           f'{month} Balance Distribution', 'Account Balance',
                           'Number of Customers')

    # 5. Housing Loan vs Personal Loan (Interactive Grouped Bar Chart)
    # One bar series per personal-loan value ('no' / 'yes'), one group per
    # housing-loan value. reindex guarantees both columns exist even when one
    # of the values never occurs in this month.
    loan_table = loan_housing.reindex(columns=['no', 'yes'], fill_value=0)
    fig_bar_loan = go.Figure(data=[
        go.Bar(name='No Loan', x=loan_table.index, y=loan_table['no'], marker_color=palette_hex[0]),
        go.Bar(name='Personal Loan', x=loan_table.index, y=loan_table['yes'], marker_color=palette_hex[1])
    ])
    fig_bar_loan.update_layout(
        title=f'{month} Housing Loan vs Personal Loan',
        xaxis_title='Has Housing Loan?',
        yaxis_title='Number of Customers',
        barmode='group',
        template='plotly_white'
    )
    fig_bar_loan.show()

    # 6. Contact Method Distribution (Interactive Bar Chart)
    _show_plotly_bar(contact_counts.index, contact_counts.values, color,
                     f'{month} Contact Method Distribution', 'Contact Method',
                     'Number of Customers')

    # 7. Day of the Week of Contact (Interactive Bar Chart)
    # Weekday names are used only when every value is one of the codes 1-5;
    # otherwise the raw values are shown so that no bar ends up with a NaN label.
    # NOTE(review): confirm whether 'day_of_week' really holds weekday codes.
    dow_names = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri'}
    if set(dow_counts.index) <= set(dow_names):
        dow_labels = [dow_names[day] for day in dow_counts.index]
    else:
        dow_labels = list(dow_counts.index)
    _show_plotly_bar(dow_labels, dow_counts.values, color,
                     f'{month} Day of the Week of Contact', 'Day of the Week',
                     'Number of Contacts')

    # 8. Call Duration Distribution (Interactive Histogram)
    _show_plotly_histogram(monthly_data['duration'], 50, color,
                           f'{month} Call Duration Distribution', 'Call Duration (seconds)',
                           'Number of Calls')

    # 9. Campaign Interactions (Interactive Histogram)
    _show_plotly_histogram(monthly_data['campaign'], 20, color,
                           f'{month} Campaign Interactions', 'Number of Campaign Contacts',
                           'Number of Customers')

    # 10. Previous Campaign Outcome (Interactive Bar Chart)
    _show_plotly_bar(poutcome_counts.index, poutcome_counts.values, color,
                     f'{month} Previous Campaign Outcome', 'Previous Outcome',
                     'Number of Customers')

# List the available columns so the month column can be identified
print(data.columns)

# Run the analysis for the rows whose 'month' value is 'may', using 'skyblue' as the primary color
analyze_month_adapted(data, 'may', 'skyblue', month_column='month')