In [None]:
#* IsActiveMember has a small negative correlation with Exited

# Customer count for every (IsActiveMember, Exited) pair; after unstack() the
# rows are indexed by IsActiveMember and the columns by Exited.
counts = df.groupby(['IsActiveMember', 'Exited']).size().unstack()

# Human-readable tick labels.
# NOTE(review): assumes IsActiveMember sorts as inactive-then-active and Exited as
# remained-then-exited (e.g. 0/1 codes) - confirm against the data.
labels = ['Inactive Member', 'Active Member']

# Grouped (side-by-side) bars, one group per IsActiveMember value
ax = counts.plot.bar(stacked=False)

# Axis text, tick labels and legend
ax.set_xlabel('')
ax.set_ylabel('Count')
ax.set_title('Comparison of IsActiveMember and Exited')
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=0)
ax.legend(labels=['Remained', 'Exited'])

plt.show()

#* Looks like a rather significant pattern of active members being less likely to exit.
# Still, it would help a lot with insight if we knew what constitutes active and
# non-active membership.

In [None]:
import pandas as pd

#* Balance bins run up to 300_000
# NOTE(review): assumes no balance reaches 300_000 - confirm against df['Balance'].max().

# Left-closed bin edges ([a, b) because right=False below). [0, 1) isolates zero
# balances. The last edge has to reach the 300_000 ceiling: with the previous top
# edge of 10_000 every balance >= 10_000 fell outside all bins and became NaN.
bins = [0, 1, 1000, 10000, 300000]

# Bin the 'Balance' column into the left-closed intervals defined above
df['binned'] = pd.cut(df['Balance'], bins=bins, right=False)

# Preview the result rather than dumping the whole DataFrame
print(df.head())

# Second, finer binning with the default right-closed intervals; the -1 lower edge
# places a balance of exactly 0 in the (-1, 0] bin.
df['bin'] = pd.cut(df['Balance'], [-1, 0, 1, 50, 100, 200, 300000])


df.describe()

In [None]:
# Quantile-based discretisation of 'Balance' into ordinal bin codes.
# NOTE(review): if many balances share a single value, several quantile edges may
# coincide - confirm how many distinct bins actually come out.

# 5 bins, ordinal (integer-coded) output, edges placed at quantiles
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')

# The discretizer works on 2-D input, so the single column is reshaped to (n_rows, 1)
another_column_subset = df['Balance'].values.reshape(-1, 1)

# Learn the bin edges, then encode the same data with them
another_column_binned = discretizer.fit(another_column_subset).transform(another_column_subset)

# Back to 1-D so the codes can be stored as a regular DataFrame column
df['BalanceQuant'] = another_column_binned.flatten()

In [None]:
import pandas as pd

# Income-tax bracket bounds per country as (lower, upper) pairs, lowest bracket first.
# Bounds are whole currency units; get_tax_bracket() treats each upper bound as
# inclusive and assigns anything above it to the next bracket.
tax_brackets = {
    'France': [(0, 10225), (10226, 26070), (26071, 74545), (74546, 160336), (160337, float('inf'))], # For 2022
    # First bracket ends at 10908 so that it meets the second one (10909); the previous
    # upper bound of 10347 left salaries 10348-10908 without any bracket.
    'Germany': [(0, 10908), (10909, 15999), (16000, 62809), (62810, 277825), (277826, float('inf'))], # For 2023
    'Spain': [(0, 12450), (12451, 20200), (20201, 35200), (35201, 100000), (100001, float('inf'))], # For 2021. Spain has 6 brackets, so we are combining the highest 2
}

def get_tax_bracket(country, salary):
    """Return the zero-based index of the tax bracket that `salary` falls into.

    Brackets are looked up in `tax_brackets[country]`. A salary belongs to the
    first bracket whose upper bound it does not exceed, so fractional salaries
    that lie between two whole-number bounds (e.g. 10225.5 for France) are
    placed in the higher bracket instead of matching nothing.

    Args:
        country: Key into `tax_brackets` ('France', 'Germany' or 'Spain').
        salary: Annual salary as a number.

    Returns:
        The bracket index (0 = lowest), or None when the salary is below the
        lowest bracket's lower bound or is NaN.

    Raises:
        KeyError: If `country` has no entry in `tax_brackets`.
    """
    tax_ranges = tax_brackets[country]

    # Below the first bracket (e.g. negative salary): no bracket applies
    if salary < tax_ranges[0][0]:
        return None

    # Comparing against upper bounds only closes the gaps between consecutive
    # integer bounds; NaN fails every comparison and falls through to None.
    for index, (_lower, upper) in enumerate(tax_ranges):
        if salary <= upper:
            return index
    return None

def _row_tax_bracket(row):
    """Look up the tax bracket index for one DataFrame row (uses Country and EstimatedSalary)."""
    return get_tax_bracket(row['Country'], row['EstimatedSalary'])

# Row-wise lookup: one bracket index per customer
df['taxBracket'] = df.apply(_row_tax_bracket, axis=1)


In [None]:
#
#! Frequency-rank encoding: every surname is replaced by its position in the
#! frequency table (0 = most common surname), not by the raw count itself.

# Occurrence count per distinct surname; value_counts() lists the most frequent first
value_frequencies = df['Surname'].value_counts()

# The distinct surnames, in that descending-frequency order
sorted_values = value_frequencies.index

# One consecutive integer per distinct surname: 0, 1, 2, ...
ordinal_values = range(len(sorted_values))

# Lookup table surname -> rank
encoding_dict = {surname: rank for surname, rank in zip(sorted_values, ordinal_values)}

# New column holding the rank of each row's surname
df['SurnameOrdinal'] = df['Surname'].map(encoding_dict)

In [None]:
# Spot-check: all rows for one surname.
# NOTE(review): the variable is named `osborne_rows` but the filter value is
# 'Smith' - confirm which surname was meant.
is_smith = df['Surname'] == 'Smith'
osborne_rows = df.loc[is_smith]

# Number of matching rows, then the rows themselves as the cell output
print(osborne_rows.shape[0])
osborne_rows

In [None]:
# TODO: create a pipeline (if time permits) so unprocessed prediction data goes
# through exactly the same steps as the training data.
#
# NOTE(review): `training_data`, `target_variable`, `model`, `preprocess` and the
# initial `new_data` are not defined in this cell - they must exist before it runs.

# --- Training phase ---
# Step 1: Train the clustering algorithm on the training data.
# random_state is pinned so the cluster ids (and therefore the 'Cluster' feature)
# are the same on every run; unseeded KMeans can label clusters differently each time.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(training_data)

# Step 2: Add cluster labels as a new feature to the training data.
# Caution: if this cell is re-run without rebuilding training_data, the fit above
# will also see the 'Cluster' column added here.
training_data['Cluster'] = kmeans.labels_

# Step 3: Train the model on the training data, including the cluster feature
model.fit(training_data, target_variable)

# --- Prediction phase ---
# Step 4: Prepare new data for prediction
new_data = preprocess(new_data)  # Apply the same preprocessing steps as used for training
new_clusters = kmeans.predict(new_data)  # Cluster labels from the already-fitted KMeans
new_data['Cluster'] = new_clusters  # Add cluster labels as a new feature

# Make predictions using the trained model
predictions = model.predict(new_data)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Fully connected binary classifier: two 1024-unit ReLU layers, each followed by
# 20% dropout, then a single sigmoid output unit. The input width is taken from
# the number of columns in X_train.
model_krs = Sequential([
    Dense(1024, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.2),
    Dense(1024, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer, accuracy reported as a metric
model_krs.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Layer-by-layer overview with parameter counts
model_krs.summary()

In [None]:
# Evaluate on the training set. With the model compiled with metrics=['accuracy'],
# the result holds the loss at index 0 and the accuracy at index 1.
accuracy = model_krs.evaluate(X_train, y_train, verbose=False)
print(f"Training Score: {accuracy[0]:.4f}")
print(f"Training Accuracy: {accuracy[1]:.4f}")


In [None]:
def plot_history(fit_keras):
    """Plot accuracy and loss per epoch in two side-by-side panels.

    The training curve is always drawn (blue). The validation curve (red,
    labelled "Testing") is drawn only when the history contains the matching
    ``val_*`` entry, so a model fitted without validation data can still be
    plotted instead of failing with a KeyError.

    Args:
        fit_keras: Object with a ``history`` mapping of metric name to a list of
            per-epoch values, containing at least 'accuracy' and 'loss' and
            optionally 'val_accuracy' and 'val_loss'.

    Raises:
        KeyError: If 'accuracy' or 'loss' is missing from the history.
    """
    history = fit_keras.history

    # (history key, short label, panel title) for the left and right panel.
    # Required series are read up front so a missing key fails before any figure is created.
    panels = [
        ('accuracy', 'acc', 'Training and Testing accuracy'),
        ('loss', 'loss', 'Training and Testing loss'),
    ]
    curves = [(history[key], history.get('val_' + key)) for key, _, _ in panels]

    # Epochs are numbered from 1 on the x-axis
    x = range(1, len(curves[0][0]) + 1)

    plt.figure(figsize=(12, 5))
    for position, ((_, short, title), (train, val)) in enumerate(zip(panels, curves), start=1):
        plt.subplot(1, 2, position)
        plt.plot(x, train, 'b', label='Training ' + short)
        if val is not None:
            plt.plot(x, val, 'r', label='Testing ' + short)
        plt.title(title)
        plt.legend()

In [None]:
# Column-wise preprocessing; each entry is (step name, transformer, columns).
# 'passthrough' forwards the listed columns unchanged.
# NOTE(review): 'target' and 'scaler' are both given continuous_features, so those
# columns appear twice in the output (once per transformer), while
# categorical_features are passed through unencoded - confirm TargetEncoder was
# not meant to receive categorical_features.
prep_steps = [
    ('target', TargetEncoder(), continuous_features),
    ('scaler', StandardScaler(), continuous_features),
    ('ordinal', 'passthrough', ordinal_features),
    ('cluster', 'passthrough', cluster_features),
    ('categorical', 'passthrough', categorical_features),
]
prep = ColumnTransformer(transformers=prep_steps)