# Explore here

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import patsy
from patsy import dmatrices
import statsmodels.api as sm


'''Problem Statement: 
Long-term deposits allow banks to hold money for a specific period of time, 
allowing the bank to use that money to enhance its investments. 
Marketing campaigns for this product are based on phone calls.
If a user is not available at a given time, then they will be called back at another time.

**Description of the problem**

The Portuguese bank is experiencing a decline in revenue,
so they want to be able to identify existing customers who are more likely to take out a long-term deposit. 
This will allow the bank to focus their marketing efforts on those customers 
and avoid wasting money and time on customers who are unlikely to sign up.
To address this problem we will create a ranking algorithm to help predict whether or not a customer will sign up for a long-term deposit.

Using EDA, Logistic Regression, Optimization\n'''


# Read data
# The file is parsed by hand rather than with pd.read_csv because its fields
# are wrapped in extraneous double quotes; every quote character is dropped
# and the remaining text is split on semicolons.
with open('/workspaces/machine-learning-python-template-ds-2023/Ryan/raw/bank_data.csv', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Process each line to remove extraneous quotes
processed_lines = []
for line in lines:
    line = line.strip()
    if not line:
        # A blank (e.g. trailing) line would otherwise become a one-field row
        # that pandas pads with None and that later breaks the float casts.
        continue
    # Removing every '"' also covers doubled quotes (""), so no separate
    # un-doubling pass is needed before splitting on semicolons.
    processed_lines.append(line.replace('"', '').split(';'))

if not processed_lines:
    raise ValueError('bank_data.csv contains no header row')

# Create dataframe from the processed lines: first row is the header,
# every remaining row is a record (all values are still strings here).
total_data = pd.DataFrame(processed_lines[1:], columns=processed_lines[0])

# Obtain information about data types and non-null values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
total_data.info()

# See data (first rows)
print(total_data.head())

# Obtain dimensions as (rows, columns), followed by a blank line
print(f'''{total_data.shape}\n''')

# Convert numeric
# Every value was read as a string; cast the numeric columns to float in a
# single multi-column assignment.
column_names = [
    "age",
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
    "duration",
    "campaign",
    "pdays",
    "previous",
]

total_data[column_names] = total_data[column_names].astype(float)


# Factorize objects and print
# Each categorical column gets an integer-coded "<name>_encoded" companion,
# and the category -> code mapping is printed for reference.
factorize_column_names = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
    "y",
]

for col in factorize_column_names:
    labels, unique_categories = total_data[col].factorize()
    total_data[f"{col}_encoded"] = labels  # Adding an encoded column

    print(f"Factorization for {col}:")
    # The position of a category in unique_categories is its integer code.
    for index_r, category_r in enumerate(unique_categories):
        print(f"{category_r}: {index_r}")
    print("\n")

# Check for duplicates
# Rows flagged here repeat an earlier row exactly; the first occurrence is
# not flagged and is the one kept when duplicates are dropped.
duplicate_mask = total_data.duplicated()
print(f"Duplicated: {duplicate_mask.sum()}")

duplicates = total_data[duplicate_mask]
print(duplicates)

total_data.drop_duplicates(inplace=True)

'''# Check heatmap for float categories
fig, axis = plt.subplots(figsize = (10, 6))

sns.heatmap(total_data[["y_encoded", "age", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed", "duration", "campaign", "pdays", "previous"]].corr(), annot = True, fmt = ".2f")
plt.tight_layout()
plt.show()

# Check heatmap for factorized categories
fig, axis = plt.subplots(figsize = (10, 6))

sns.heatmap(total_data[["y_encoded", "job_encoded", "marital_encoded", "education_encoded", "default_encoded", "housing_encoded", "loan_encoded", "contact_encoded", "month_encoded", "day_of_week_encoded", "poutcome_encoded"]].corr(), annot = True, fmt = ".2f")
plt.tight_layout()
plt.show()

# Check pairplot
sns.pairplot(data=total_data, vars=["y_encoded", "age", "job_encoded", "marital_encoded", "education_encoded", "default_encoded", "housing_encoded", "loan_encoded", "contact_encoded"])
plt.show()
   

# Compare filtered data
# Separate data based on y_encoded
y_filtered_data = total_data[total_data['y_encoded'] == 1]
n_filtered_data = total_data[total_data['y_encoded'] == 0]

# Plot histograms

hist_columns = ["age", "job_encoded", "marital_encoded", "education_encoded", "default_encoded", "housing_encoded", "loan_encoded", "contact_encoded"]

for column in hist_columns:
    plt.figure(figsize=(10, 6))
    
    ax = plt.gca()  # Get current axes to be used for plotting and retrieving patches

    # Plot data
    sns.histplot(y_filtered_data[column], kde=False, stat='probability', label='y_encoded = 1', bins=30, color='blue', ax=ax)
    sns.histplot(n_filtered_data[column], kde=False, stat='probability', label='y_encoded = 0', bins=30, color='red', alpha=0.5, ax=ax)

    # Getting all the rectangles (bar patches) in the current axes
    patches = ax.patches

    # Iterate over bar heights and annotate with their percentage
    for patch in patches:
        bar = patch.get_height()
        # Annotate the percentage on top of the bars
        plt.annotate(f"{round(bar*100)}%", (patch.get_x() + patch.get_width() / 2., bar), 
                     ha='center', va='center', xytext=(0,10), textcoords='offset points')
    
    # Set label and legends
    plt.xlabel(column)
    plt.legend()
    plt.tight_layout()
    plt.show()

# Create histogram for default_encoded
sns.histplot(data = filtered_data, x = "default_encoded")
plt.show()

age. Age of customer (numeric)
job. Type of job (categorical)
marital. Marital status (categorical)
education. Level of education (categorical)
contact. Type of contact communication (categorical)

default. Does the customer currently have credit? (categorical)
housing. Does the customer have a housing loan? (categorical)
loan. Does the customer have a personal loan? (categorical)

month. Last month in which you have been contacted (categorical)
day_of_week. Last day on which you have been contacted (categorical)
duration. Duration of previous contact in seconds (numeric)
campaign. Number of contacts made during this campaign to the customer (numeric)
pdays. Number of days that elapsed since the last campaign until the customer was contacted (numeric)
previous. Number of contacts made during the previous campaign to the customer (numeric)
poutcome. Result of the previous marketing campaign (categorical).

emp.var.rate. Employment variation rate. Quarterly indicator (numeric)
cons.price.idx. Consumer price index. Monthly indicator (numeric)
cons.conf.idx. Consumer confidence index. Monthly indicator (numeric)
euribor3m. EURIBOR 3-month rate. Daily indicator (numeric)
nr.employed. Number of employees. Quarterly indicator (numeric)
'''

# Remove null values
#
# Rows are dropped when any of the listed categorical columns holds the
# placeholder label 'unknown'. The filter compares against the label itself
# instead of hard-coded integer codes: pd.factorize numbers categories in
# order of first appearance, so the codes shift whenever the row order or the
# input file changes, while the label does not.
# NOTE(review): assumes the previously hard-coded codes (job 9, marital 3,
# education 5, housing 2, loan 2, contact 2) all stood for 'unknown' --
# confirm against the factorization printout.
# 'default' is not part of this filter.
columns_without_unknown = ['job', 'marital', 'education', 'housing', 'loan', 'contact']

filtered_data = total_data[
    (total_data[columns_without_unknown] != 'unknown').all(axis=1)
]

# Keep only the target and the candidate predictors used by the formulas below.
columns_to_select = [
    'y_encoded',
    'age',
    'job_encoded',
    'marital_encoded',
    'education_encoded',
    'contact_encoded',
]
selected_data = filtered_data.loc[:, columns_to_select]

# Wider specifications kept for reference (currently disabled), widest first:
#   'y_encoded ~ 1+age+job_encoded+marital_encoded+education_encoded+contact_encoded'
#   'y_encoded ~ 1+job_encoded+marital_encoded+education_encoded+contact_encoded'
#   'y_encoded ~ 1+marital_encoded+education_encoded+contact_encoded'
#   'y_encoded ~ 1+education_encoded+contact_encoded'
# Active specification: intercept plus contact_encoded only.
formula1 = 'y_encoded ~ 1+contact_encoded'

# Build the response (y) and design matrix (X) as DataFrames from the formula.
y, X = patsy.dmatrices(formula1, data=selected_data, return_type='dataframe')

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Fit the logistic regression on the training portion only.
logit_spec = sm.Logit(y_train, X_train)
model = logit_spec.fit()

print(model.summary())

# Predicted probabilities for the held-out rows.
y_pred_prob = model.predict(X_test)

# Label a row 1 when its predicted probability reaches 0.5, otherwise 0.
y_pred = (np.asarray(y_pred_prob) >= 0.5).astype(int)

# Share of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             41188 non-null  object
 1   job             41188 non-null  object
 2   marital         41188 non-null  object
 3   education       41188 non-null  object
 4   default         41188 non-null  object
 5   housing         41188 non-null  object
 6   loan            41188 non-null  object
 7   contact         41188 non-null  object
 8   month           41188 non-null  object
 9   day_of_week     41188 non-null  object
 10  duration        41188 non-null  object
 11  campaign        41188 non-null  object
 12  pdays           41188 non-null  object
 13  previous        41188 non-null  object
 14  poutcome        41188 non-null  object
 15  emp.var.rate    41188 non-null  object
 16  cons.price.idx  41188 non-null  object
 17  cons.conf.idx   41188 non-null  object
 18  euribo