In [None]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
import umap

In [None]:

# Load the raw dataset; the relative path is resolved against the current working directory.
df = pd.read_parquet('becode_file.parquet')


#### Drop features that are irrelevant or unnecessary.

In [None]:
# Columns removed from the raw frame before encoding, imputation and clustering.
# The order is irrelevant for the drop itself; it is kept as originally written.
columns_to_drop = [
    'last_redeem_date', 'forelast_switch_dte', 'survey_interaction_type', 'nps_score',
    'contr_nr', 'dim_cust_nr', 'wifi_days_used', 'wifi_weighted_drops', 'wifi_n_devices',
    'wifi_total_traffic', 'wifi_n_sessions_t5', 'wifi_total_pings_t5', 'wifi_total_active_pings_t5',
    'wifi_mean_active_connduration_t5', 'wifi_total_traffic_t5', 'wifi_total_retransmitted_traffic_t5',
    'wifi_mean_active_deff_t5', 'wifi_std_active_deff_t5', 'wifi_active_deff_score_t5',
    'todt_rono_roof_avg3m_vol', 'tovc_rono_roof_avg3m_dur', 'tosm_rono_roof_avg3m_evt',
    'todt_rono_roof_vol', 'tovc_rono_roof_dur', 'tosm_rono_roof_evt', 'pre_days_since_last_reload',
    'pre_most_frequent_amt', 'pre_count_tot_rel', 'net_total_paging_attempts', 'net_total_call_attempts',
    'post_remaining_loyalty_days', 'mob_mobility', 'mob_variance_in_mobility', 'post_device_chg_days',
    'post_contr_oob_m1_rev', 'post_contr_oob_m2_rev', 'post_contr_oob_m3_rev', 'post_tdt_rev',
    'post_tvc_rev', 'installation_days', 'move_days', 'mesh_wifi', 'internet_boost', 'extra_decoder',
    'chromecast', 'foot_option', 'mobile_insurance', 'cable_easy_switch_origin', 'tv_decoder_type',
    'tot_nbr_subsidy', 'tot_nbr_instalment', 'post_dunning_last_hope_1m', 'post_dunning_reminder1_1m',
    'post_dunning_reminder2_1m', 'post_dunning_reminder3_1m', 'post_dunning_reminder4_1m', 'highest_reup',
    'lowest_reup', 'latest_reup', 'highest_upfront', 'lowest_upfront', 'latest_upfront',
    'months_since_latest_subsidy_start',
    'months_since_latest_instal_start', 'nbr_sub_inst_iphones', 'nbr_sub_inst_samsung',
    'nbr_sub_inst_other_brands',
    'sub_inst_latest_device_brand', 'convergence_tenure', 'handset_model_ten_yr_at_chng',
    'days_since_post_latest',
    'days_since_pre_latest', 'operator_switch_cnt_y', 'home_int_from_start',
]
# Displayed as the cell output: how many columns are about to be removed.
len(columns_to_drop)

In [None]:
# Remove the unwanted columns. A 'clusters' column only exists on `df` when the
# labelling cell further down has already run in this kernel; dropping it here as
# well keeps labels from a previous run from being fed back in as a feature when
# the notebook is re-executed from this cell. On a fresh kernel this is a no-op.
df_less = df.drop(columns=columns_to_drop).drop(columns='clusters', errors='ignore')

#### Encoding of categorical features.

In [None]:
# Replace every object-dtype column by the ordinal codes produced by OrdinalEncoder.
encoder = OrdinalEncoder()
categorical_columns = df_less.select_dtypes(include='object').columns
# Learn the categories first, then write the encoded values back into the same
# columns of `df_less` (the frame is modified in place).
encoder.fit(df_less[categorical_columns])
df_less[categorical_columns] = encoder.transform(df_less[categorical_columns])

#### Imputation of missing values.

In [None]:
# Fill missing values with the per-column median.
imputer = SimpleImputer(strategy='median')
imputer.fit(df_less)
# The imputer returns a bare array, so the column labels are re-attached here.
# NOTE(review): this assumes the imputer returns one column per input column — confirm
# that no column is entirely missing.
df_imputed = pd.DataFrame(data=imputer.transform(df_less), columns=df_less.columns)

#### Standardization of data (zero mean, unit variance).

In [None]:
# Standardize every imputed column with StandardScaler.
sc = StandardScaler()
sc.fit(df_imputed)
scaled_data_array = sc.transform(df_imputed)
# Wrap the scaled array in a frame again so the feature names stay available.
scaled_data = pd.DataFrame(data=scaled_data_array, columns=df_imputed.columns)

#### Feature reduction.

In [None]:
# Reduce the standardized features to 15 UMAP components.
# random_state pins the embedding so that a re-run yields the same clusters; the
# per-cluster thresholds further down are tuned by cluster number and only make
# sense if the numbering is reproducible. (Seeding UMAP trades some speed for
# determinism.)
reducer = umap.UMAP(n_neighbors=500, n_components=15, random_state=42, verbose=True)
# `scaled_data` receives a 'clusters' label column in a later cell; exclude it so
# that re-running this cell in a live kernel does not embed old labels as a feature.
embedding = reducer.fit_transform(scaled_data.drop(columns='clusters', errors='ignore'))

#### K-means clustering.

In [None]:
# Partition the embedding into 8 clusters.
# random_state makes the centroid initialisation (and therefore the cluster
# numbering used by the cells below) reproducible; n_init is spelled out because
# its default value differs between scikit-learn releases.
kmeans = KMeans(n_clusters=8, n_init=10, random_state=42)
X = embedding
kmeans.fit(X)

In [None]:
# Label every row of the UMAP embedding with its k-means cluster.
clusters = kmeans.predict(X)
# Store the labels next to the scaled features; the per-cluster cells below filter on this column.
# NOTE(review): from here on `scaled_data` holds a non-feature column, so anything that
# aggregates over all columns of `scaled_data` will include the label as well.
scaled_data['clusters'] = clusters

#### Means comparison between clusters to extract the most important features for each cluster. 

In [None]:
# Contrast cluster 0 with all remaining rows: per-feature mean inside the cluster
# minus the per-feature mean outside it, computed on the scaled frame.
cluster_zero = scaled_data[scaled_data['clusters'] == 0]
all_minus_zero = scaled_data[scaled_data['clusters'] != 0]
# The 'clusters' label column is not a feature; its "mean difference" only reflects
# the label numbering, so it is removed before thresholding.
difference_zero = cluster_zero.mean().subtract(all_minus_zero.mean()).drop('clusters')
# Keep the features whose mean differs by more than 1.2 in either direction.
filter_zero = difference_zero.loc[difference_zero.abs() > 1.2]
# Names of the retained features for this cluster.
scaled_data_zero = filter_zero.to_frame().transpose()
columns_zero = scaled_data_zero.columns.values.tolist()

# Repeat this for all the other clusters.

In [None]:
# Cluster 1 versus all remaining rows: difference of per-feature means on the scaled frame.
cluster_one = scaled_data[scaled_data['clusters'] == 1]
all_minus_one = scaled_data[scaled_data['clusters'] != 1]
# Drop the 'clusters' label itself: it is not a feature and would otherwise be
# thresholded like one.
difference_one = cluster_one.mean().subtract(all_minus_one.mean()).drop('clusters')
# Keep the features whose mean differs by more than 1.0 in either direction.
filter_one = difference_one.loc[difference_one.abs() > 1.0]
# Names of the retained features for this cluster.
scaled_data_one = filter_one.to_frame().transpose()
columns_one = scaled_data_one.columns.values.tolist()

In [None]:
# Cluster 2 versus all remaining rows: difference of per-feature means on the scaled frame.
cluster_two = scaled_data[scaled_data['clusters'] == 2]
all_minus_two = scaled_data[scaled_data['clusters'] != 2]
# Drop the 'clusters' label itself: it is not a feature and would otherwise be
# thresholded like one.
difference_two = cluster_two.mean().subtract(all_minus_two.mean()).drop('clusters')
# Keep the features whose mean differs by more than 1.0 in either direction.
filter_two = difference_two.loc[difference_two.abs() > 1.0]
# Names of the retained features for this cluster.
scaled_data_two = filter_two.to_frame().transpose()
columns_two = scaled_data_two.columns.values.tolist()

In [None]:
# Cluster 3 versus all remaining rows: difference of per-feature means on the scaled frame.
cluster_three = scaled_data[scaled_data['clusters'] == 3]
all_minus_three = scaled_data[scaled_data['clusters'] != 3]
# Drop the 'clusters' label itself: it is not a feature and would otherwise be
# thresholded like one.
difference_three = cluster_three.mean().subtract(all_minus_three.mean()).drop('clusters')
# Keep the features above 1.1 or below -1.0.
# NOTE(review): the two bounds are asymmetric — confirm this is intended.
filter_three = difference_three.loc[(difference_three > 1.1) | (difference_three < -1.0)]
# Names of the retained features for this cluster.
scaled_data_three = filter_three.to_frame().transpose()
columns_three = scaled_data_three.columns.values.tolist()

In [None]:
# Cluster 4 versus all remaining rows: difference of per-feature means on the scaled frame.
cluster_four = scaled_data[scaled_data['clusters'] == 4]
all_minus_four = scaled_data[scaled_data['clusters'] != 4]
# Drop the 'clusters' label itself: it is not a feature and would otherwise be
# thresholded like one.
difference_four = cluster_four.mean().subtract(all_minus_four.mean()).drop('clusters')
# Keep the features above 1.1 or below -1.0.
# NOTE(review): the two bounds are asymmetric — confirm this is intended.
filter_four = difference_four.loc[(difference_four > 1.1) | (difference_four < -1.0)]
# Names of the retained features for this cluster.
scaled_data_four = filter_four.to_frame().transpose()
columns_four = scaled_data_four.columns.values.tolist()

In [None]:
# Cluster 5 versus all remaining rows: difference of per-feature means on the scaled frame.
cluster_five = scaled_data[scaled_data['clusters'] == 5]
all_minus_five = scaled_data[scaled_data['clusters'] != 5]
# Drop the 'clusters' label itself: it is not a feature and would otherwise be
# thresholded like one.
difference_five = cluster_five.mean().subtract(all_minus_five.mean()).drop('clusters')
# Keep the features whose mean differs by more than 1.0 in either direction.
filter_five = difference_five.loc[difference_five.abs() > 1.0]
# Names of the retained features for this cluster.
scaled_data_five = filter_five.to_frame().transpose()
columns_five = scaled_data_five.columns.values.tolist()

In [None]:
# Cluster 6 versus all remaining rows: difference of per-feature means on the scaled frame.
cluster_six = scaled_data[scaled_data['clusters'] == 6]
all_minus_six = scaled_data[scaled_data['clusters'] != 6]
# Drop the 'clusters' label itself: it is not a feature and would otherwise be
# thresholded like one.
difference_six = cluster_six.mean().subtract(all_minus_six.mean()).drop('clusters')
# Keep the features whose mean differs by more than 1.0 in either direction.
filter_six = difference_six.loc[difference_six.abs() > 1.0]
# Names of the retained features for this cluster.
scaled_data_six = filter_six.to_frame().transpose()
columns_six = scaled_data_six.columns.values.tolist()

In [None]:
# Cluster 7 versus all remaining rows: difference of per-feature means on the scaled frame.
cluster_seven = scaled_data[scaled_data['clusters'] == 7]
all_minus_seven = scaled_data[scaled_data['clusters'] != 7]
# Drop the 'clusters' label itself: it is not a feature and would otherwise be
# thresholded like one.
difference_seven = cluster_seven.mean().subtract(all_minus_seven.mean()).drop('clusters')
# Keep the features above 1.1 or below -1.0.
# NOTE(review): the two bounds are asymmetric — confirm this is intended.
filter_seven = difference_seven.loc[(difference_seven > 1.1) | (difference_seven < -1.0)]
# Names of the retained features for this cluster.
scaled_data_seven = filter_seven.to_frame().transpose()
columns_seven = scaled_data_seven.columns.values.tolist()

#### Interpretation of clusters in original dataframe.

In [None]:
# Attach the cluster labels to the original dataframe so each cluster can be inspected in raw values.
# NOTE(review): this adds a column to `df` in place; cells that derive frames from `df`
# would pick it up if they are re-run afterwards — confirm this is acceptable.
df['clusters'] = clusters

In [None]:
# Rows assigned to cluster 0, restricted to the features flagged for that cluster.
zero_cluster = df.loc[df['clusters'] == 0, columns_zero]
# For each feature: every distinct value as a percentage of the rows in the selection.
for c in zero_cluster.columns:
    print(c, zero_cluster[c].value_counts().div(len(zero_cluster)).mul(100), "-----------------", sep="\n")
print("################################################################")
# Same report for the rows of all other clusters, as a baseline for comparison.
print("other clusters")
# The name is reused, so after this cell `zero_cluster` refers to the rows outside cluster 0.
zero_cluster = df.loc[df['clusters'] != 0, columns_zero]
for c in zero_cluster.columns:
    print(c, zero_cluster[c].value_counts().div(len(zero_cluster)).mul(100), "-----------------", sep="\n")

# Repeat this for all clusters.
# Use these outputs to interpret the clusters.    

In [None]:
# Rows assigned to cluster 1, restricted to the features flagged for that cluster.
first_cluster = df.loc[df['clusters'] == 1, columns_one]
# For each feature: every distinct value as a percentage of the rows in the selection.
for c in first_cluster.columns:
    print(c, first_cluster[c].value_counts().div(len(first_cluster)).mul(100), "-----------------", sep="\n")
print("################################################################")
print("other clusters")
# The name is reused, so after this cell `first_cluster` refers to the rows outside cluster 1.
first_cluster = df.loc[df['clusters'] != 1, columns_one]
for c in first_cluster.columns:
    print(c, first_cluster[c].value_counts().div(len(first_cluster)).mul(100), "-----------------", sep="\n")

In [None]:
# Rows assigned to cluster 2, restricted to the features flagged for that cluster.
second_cluster = df.loc[df['clusters'] == 2, columns_two]
# For each feature: every distinct value as a percentage of the rows in the selection.
for c in second_cluster.columns:
    print(c, second_cluster[c].value_counts().div(len(second_cluster)).mul(100), "-----------------", sep="\n")
print("################################################################")
print("other clusters")
# The name is reused, so after this cell `second_cluster` refers to the rows outside cluster 2.
second_cluster = df.loc[df['clusters'] != 2, columns_two]
for c in second_cluster.columns:
    print(c, second_cluster[c].value_counts().div(len(second_cluster)).mul(100), "-----------------", sep="\n")

In [None]:
# Rows assigned to cluster 3, restricted to the features flagged for that cluster.
third_cluster = df.loc[df['clusters'] == 3, columns_three]
# For each feature: every distinct value as a percentage of the rows in the selection.
for c in third_cluster.columns:
    print(c, third_cluster[c].value_counts().div(len(third_cluster)).mul(100), "-----------------", sep="\n")
print("################################################################")
print("other clusters")
# The name is reused, so after this cell `third_cluster` refers to the rows outside cluster 3.
third_cluster = df.loc[df['clusters'] != 3, columns_three]
for c in third_cluster.columns:
    print(c, third_cluster[c].value_counts().div(len(third_cluster)).mul(100), "-----------------", sep="\n")

In [None]:
# Rows assigned to cluster 4, restricted to the features flagged for that cluster.
fourth_cluster = df.loc[df['clusters'] == 4, columns_four]
# For each feature: every distinct value as a percentage of the rows in the selection.
for c in fourth_cluster.columns:
    print(c, fourth_cluster[c].value_counts().div(len(fourth_cluster)).mul(100), "-----------------", sep="\n")
print("################################################################")
print("other clusters")
# The name is reused, so after this cell `fourth_cluster` refers to the rows outside cluster 4.
fourth_cluster = df.loc[df['clusters'] != 4, columns_four]
for c in fourth_cluster.columns:
    print(c, fourth_cluster[c].value_counts().div(len(fourth_cluster)).mul(100), "-----------------", sep="\n")

In [None]:
# Rows assigned to cluster 5, restricted to the features flagged for that cluster.
fifth_cluster = df.loc[df['clusters'] == 5, columns_five]
# For each feature: every distinct value as a percentage of the rows in the selection.
for c in fifth_cluster.columns:
    print(c, fifth_cluster[c].value_counts().div(len(fifth_cluster)).mul(100), "-----------------", sep="\n")
print("################################################################")
print("other clusters")
# The name is reused, so after this cell `fifth_cluster` refers to the rows outside cluster 5.
fifth_cluster = df.loc[df['clusters'] != 5, columns_five]
for c in fifth_cluster.columns:
    print(c, fifth_cluster[c].value_counts().div(len(fifth_cluster)).mul(100), "-----------------", sep="\n")


In [None]:
# Rows assigned to cluster 6, restricted to the features flagged for that cluster.
# (The former mid-cell `nunique()` calls were removed: their results were neither
# stored nor displayed.)
sixth_cluster = df.loc[df['clusters'] == 6, columns_six]
# For each feature: every distinct value as a percentage of the rows in the selection.
for c in sixth_cluster.columns:
    print(c)
    print(sixth_cluster[c].value_counts() / len(sixth_cluster) * 100)
    print("-----------------")
print("################################################################")
print("other clusters")
# The name is reused, so after this cell `sixth_cluster` refers to the rows outside cluster 6.
sixth_cluster = df.loc[df['clusters'] != 6, columns_six]
for c in sixth_cluster.columns:
    print(c)
    print(sixth_cluster[c].value_counts() / len(sixth_cluster) * 100)
    print("-----------------")

In [None]:
# Rows assigned to cluster 7, restricted to the features flagged for that cluster.
seventh_cluster = df.loc[df['clusters'] == 7, columns_seven]
# For each feature: every distinct value as a percentage of the rows in the selection.
for c in seventh_cluster.columns:
    print(c, seventh_cluster[c].value_counts().div(len(seventh_cluster)).mul(100), "-----------------", sep="\n")
print("################################################################")
print("other clusters")
# The name is reused, so after this cell `seventh_cluster` refers to the rows outside cluster 7.
seventh_cluster = df.loc[df['clusters'] != 7, columns_seven]
for c in seventh_cluster.columns:
    print(c, seventh_cluster[c].value_counts().div(len(seventh_cluster)).mul(100), "-----------------", sep="\n")