# Explore here

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
from patsy import dmatrices


'''Problem Statement: 
The important insurance company 4Geeks Insurance S.L. wants to calculate, 
based on physiological data of its customers, what the premium (cost) to be borne by each of them will be. 
To do this, it has assembled a whole team of doctors and, based on data from other companies 
and a particular study, has managed to gather a set of data to train a predictive model.\n'''


# Load the raw CSV into a DataFrame
total_data = pd.read_csv('/workspaces/machine-learning-python-template-ds-2023/Ryan/raw/medical_insurance.csv')

# Show data types and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appended a stray "None" line.
total_data.info()

# Preview the first rows
print(total_data.head())

# Show the (rows, columns) dimensions
print(f'''{total_data.shape}\n''')





age. Age of primary beneficiary (numeric)
sex. Gender of the primary beneficiary (categorical)
bmi. Body mass index (numeric)
children. Number of children/dependents covered by health insurance (numeric)
smoker. Whether the primary beneficiary is a smoker (categorical)
region. Beneficiary's residential area in the U.S.: northeast, southeast, southwest, northwest (categorical)
charges. Health insurance premium (numeric)





# Cast the numeric columns to float in a single astype call.
# NOTE(review): these column names are not among the columns described for the
# medical-insurance CSV loaded above - confirm which dataset is intended.
column_names = [
    "age",
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
    "duration",
    "campaign",
    "pdays",
    "previous",
]

total_data = total_data.astype({numeric_col: float for numeric_col in column_names})


# Integer-encode each categorical column into "<name>_encoded" and print the
# category -> code legend so the codes can be interpreted later.
factorize_column_names = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
    "y",
]

for col in factorize_column_names:
    labels, unique_categories = pd.factorize(total_data[col])
    total_data[f"{col}_encoded"] = labels  # new column; the original is kept

    # One "category: code" line per distinct value, preceded by a header line
    legend = [f"{category}: {code}" for code, category in enumerate(unique_categories)]
    print("\n".join([f"Factorization for {col}:", *legend]))
    print("\n")

# Report fully duplicated rows, show them, then drop them in place.
# The boolean mask is computed once and reused for both the count and the rows.
duplicate_mask = total_data.duplicated()
print(f'''Duplicated: {duplicate_mask.sum()}''')

duplicates = total_data.loc[duplicate_mask]
print(duplicates)

total_data.drop_duplicates(inplace=True)

'''# Check heatmap for float categories
fig, axis = plt.subplots(figsize = (10, 6))

sns.heatmap(total_data[["y_encoded", "age", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed", "duration", "campaign", "pdays", "previous"]].corr(), annot = True, fmt = ".2f")
plt.tight_layout()
plt.show()

# Check heatmap for factorized categories
fig, axis = plt.subplots(figsize = (10, 6))

sns.heatmap(total_data[["y_encoded", "job_encoded", "marital_encoded", "education_encoded", "default_encoded", "housing_encoded", "loan_encoded", "contact_encoded", "month_encoded", "day_of_week_encoded", "poutcome_encoded"]].corr(), annot = True, fmt = ".2f")
plt.tight_layout()
plt.show()

# Check pairplot
sns.pairplot(data=total_data, vars=["y_encoded", "age", "job_encoded", "marital_encoded", "education_encoded", "default_encoded", "housing_encoded", "loan_encoded", "contact_encoded"])
plt.show()
   

# Compare filtered data
# Separate data based on y_encoded
y_filtered_data = total_data[total_data['y_encoded'] == 1]
n_filtered_data = total_data[total_data['y_encoded'] == 0]

# Plot histograms

hist_columns = ["age", "job_encoded", "marital_encoded", "education_encoded", "default_encoded", "housing_encoded", "loan_encoded", "contact_encoded"]

for column in hist_columns:
    plt.figure(figsize=(10, 6))
    
    ax = plt.gca()  # Get current axes to be used for plotting and retrieving patches

    # Plot data
    sns.histplot(y_filtered_data[column], kde=False, stat='probability', label='y_encoded = 1', bins=30, color='blue', ax=ax)
    sns.histplot(n_filtered_data[column], kde=False, stat='probability', label='y_encoded = 0', bins=30, color='red', alpha=0.5, ax=ax)

    # Getting all the rectangles (bar patches) in the current axes
    patches = ax.patches

    # Iterate over bar heights and annotate with their percentage
    for patch in patches:
        bar = patch.get_height()
        # Annotate the percentage on top of the bars
        plt.annotate(f"{round(bar*100)}%", (patch.get_x() + patch.get_width() / 2., bar), 
                     ha='center', va='center', xytext=(0,10), textcoords='offset points')
    
    # Set label and legends
    plt.xlabel(column)
    plt.legend()
    plt.tight_layout()
    plt.show()



age. Age of customer (numeric)
job. Type of job (categorical)
marital. Marital status (categorical)
education. Level of education (categorical)
default. Do you currently have credit? (categorical)
contact. Type of contact communication (categorical)

housing. Do you have a housing loan? (categorical)
loan. Do you have a personal loan? (categorical)

month. Last month in which you have been contacted (categorical)
day_of_week. Last day on which you have been contacted (categorical)
duration. Duration of previous contact in seconds (numeric)
campaign. Number of contacts made during this campaign to the customer (numeric)
pdays. Number of days that elapsed since the last campaign until the customer was contacted (numeric)
previous. Number of contacts made during the previous campaign to the customer (numeric)
poutcome. Result of the previous marketing campaign (categorical).

emp.var.rate. Employment variation rate. Quarterly indicator (numeric)
cons.price.idx. Consumer price index. Monthly indicator (numeric)
cons.conf.idx. Consumer confidence index. Monthly indicator (numeric)
euribor3m. EURIBOR 3-month rate. Daily indicator (numeric)
nr.employed. Number of employees. Quarterly indicator (numeric)
'''

# Remove null values
# Rows are dropped when an encoded column equals the code listed for it here.
# NOTE(review): presumably each code is the factorized placeholder for a
# missing value - verify against the "Factorization for ..." printout, since
# the codes depend on the order in which categories first appear.
excluded_codes = {
    'job_encoded': 9,
    'marital_encoded': 3,
    'education_encoded': 5,
    'default_encoded': 1,
    'housing_encoded': 2,
    'loan_encoded': 2,
    'contact_encoded': 2,
}

# Start by keeping every row, then AND in one "!= code" condition per column
keep_rows = np.ones(len(total_data), dtype=bool)
for encoded_col, excluded_code in excluded_codes.items():
    keep_rows &= (total_data[encoded_col] != excluded_code).to_numpy()

filtered_data = total_data[keep_rows]


# Target label and the predictor columns used by the model
target = 'y_encoded'
features = ['age', 'job_encoded', 'marital_encoded', 'education_encoded', 'default_encoded', 'contact_encoded']

# Columns to standardize: the predictors only, the target is excluded
num_features = list(features)

# Keep only the target and the predictors
columns_to_select = [target] + features
selected_data = filtered_data[columns_to_select]

# Decide the train/test partition BEFORE scaling. Splitting row positions with
# the same random_state and train_size yields the same shuffle as splitting
# the data itself, so the partition is unchanged.
train_rows, test_rows = train_test_split(
    np.arange(len(selected_data)), random_state = 42, train_size = 0.80
)

# Fit the scaler on the training rows only. Fitting it on every row (as was
# done before) lets the test rows' mean and variance leak into training.
scaler = StandardScaler()
scaler.fit(selected_data[num_features].iloc[train_rows])

# Apply the train-fitted scaling to all rows
norm_features = scaler.transform(selected_data[num_features])

# Create the normalized DataFrame, preserving the original row index
selected_data_norm = pd.DataFrame(norm_features, index=selected_data.index, columns=num_features)

# Assign the original, unscaled 'y_encoded' to the normalized DataFrame
selected_data_norm["y_encoded"] = selected_data["y_encoded"]

# Separate the predictors from the label
X = selected_data_norm[features]
y = selected_data_norm[target]

# Materialize the train and test sets from the partition chosen above
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Fit a logistic regression with default settings and predict the test rows
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(y_pred)

print(f'''Accuracy: {accuracy_score(y_test, y_pred)}''')



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             41188 non-null  object
 1   job             41188 non-null  object
 2   marital         41188 non-null  object
 3   education       41188 non-null  object
 4   default         41188 non-null  object
 5   housing         41188 non-null  object
 6   loan            41188 non-null  object
 7   contact         41188 non-null  object
 8   month           41188 non-null  object
 9   day_of_week     41188 non-null  object
 10  duration        41188 non-null  object
 11  campaign        41188 non-null  object
 12  pdays           41188 non-null  object
 13  previous        41188 non-null  object
 14  poutcome        41188 non-null  object
 15  emp.var.rate    41188 non-null  object
 16  cons.price.idx  41188 non-null  object
 17  cons.conf.idx   41188 non-null  object
 18  euribo