## Environment Setup

In [None]:
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount("/content/drive")

pd.set_option('mode.chained_assignment', None)

Mounted at /content/drive


In [None]:
subsDf = pd.read_csv("/content/drive/MyDrive/FODS__FP/Dataset/subscriptions_60000.csv", low_memory=False)
transDf = pd.read_csv("/content/drive/MyDrive/FODS__FP/Dataset/transactions_600000.csv", low_memory=False)

len(subsDf.axes[0]), len(transDf.axes[0])

(56703, 510867)

## Data Cleaning

In [None]:
subsDf.isna().sum()

id                       0
created_at               0
country                  0
operator                 0
service                  0
source               10644
msisdn                   0
status                   0
cycle                    1
adnet                26938
revenue                  0
subs_date                0
renewal_date            33
freemium_end_date    35527
unsubs_from          31174
unsubs_date          23198
service_price            0
currency             12744
profile_status       17602
publisher            34339
trxid                23735
pixel                44687
handset              43752
browser              37664
attempt_charging         0
success_billing          0
dtype: int64

In [None]:
subsDf = subsDf.drop(columns=['trxid', 'pixel', 'handset', 'browser', 'id'])

In [None]:
subsDf['source'] = subsDf['source'].replace({'WAP': 'wap', 'SMS': 'sms'})

In [None]:
subsDf['unsubs_from'] = subsDf['unsubs_from'].replace({'WAP': 'wap', 'SMS': 'sms'})

In [None]:
subsDf['status'] = subsDf['status'].replace({2: 1})

In [None]:
subsDf['service'] = subsDf['service'].replace({'SLYPEE FSC': 'slypee', 'SLYPEE DFS': 'slypee', 'slypee fsc': 'slypee', 'slypee dfs': 'slypee', 'CLOUDPLAY 1': 'CLOUDPLAY'})

In [None]:
subsDf['publisher'] = subsDf['publisher'].replace({'LIG': 'lig',
                                                   'BTM': 'btm',
                                                   'STAR': 'star',
                                                   'MOV': 'mov',
                                                   'LIG': 'lig',
                                                   'VAD': 'vad',
                                                   'CAD': 'cad',
                                                   'ATN': 'atn',
                                                   'PRT': 'prt'})

subsDf['publisher'] = subsDf['publisher'].apply(lambda x: 'star' if pd.notna(x) and 'star' in x.lower() else x)
subsDf['publisher'] = subsDf['publisher'].apply(lambda x: 'lig' if pd.notna(x) and 'lig' in x.lower() else x)

In [None]:
subsDf['operator'] = subsDf['operator'].apply(lambda x: 'telkomsel' if pd.notna(x) and 'telkomsel' in x.lower() else x)

In [None]:
subsDf['adnet'] = subsDf['adnet'].apply(lambda x: 'star' if pd.notna(x) and 'star' in x.lower() else x)
subsDf['adnet'] = subsDf['adnet'].apply(lambda x: 'lig' if pd.notna(x) and 'lig' in x.lower() else x)
subsDf['adnet'] = subsDf['adnet'].apply(lambda x: 'mvls' if pd.notna(x) and 'mvls' in x.lower() else x)
subsDf['adnet'] = subsDf['adnet'].apply(lambda x: 'mobip' if pd.notna(x) and 'mobip' in x.lower() else x)
subsDf['adnet'] = subsDf['adnet'].apply(lambda x: 'adn' if pd.notna(x) and 'adn' in x.lower() else x)
subsDf['adnet'] = subsDf['adnet'].apply(lambda x: str(x))
subsDf['adnet'] = subsDf['adnet'].apply(lambda x: x.lower())

In [None]:
subsDf['source'] = subsDf['source'].replace({'USSD_W': 'USSD', 'USSD_D': 'USSD', 'mo_ussd': 'USSD', 'mo_sms': 'sms'})
subsDf['unsubs_from'] = subsDf['unsubs_from'].replace({'mo_ussd': 'USSD', 'mo_sms': 'sms'})

In [None]:
subsDf[['source', 'adnet', 'publisher']] = subsDf[['source', 'adnet', 'publisher']].fillna('Unknown')

In [None]:
subsDf.loc[(subsDf['country'] == 'ID') & (subsDf['currency'].isna()), 'currency'] = 'IDR'
subsDf.loc[(subsDf['country'] == 'SN') & (subsDf['currency'].isna()), 'currency'] = 'XOF'
subsDf['currency'] = subsDf['currency'].replace({'Kip': 'LAK', 'BATH': 'THB', 'BAHT': 'THB'})

In [None]:
# Indonesian Rupiah (IDR) exchange rate retrieved from Forbes Advisor (27/12/2023 13:56 UTC)

def convert_revenue(row):
    if row['currency'] == 'IDR':
        return float(row['revenue']) * 1
    elif row['currency'] == 'THB':
        return float(row['revenue']) * 447.49
    elif row['currency'] == 'Rial':
        return float(row['revenue']) * 39984.067
    elif row['currency'] == 'Baisa':
        return float(row['revenue']) * 39.984067
    elif row['currency'] == 'LAK':
        return float(row['revenue']) * 0.75
    elif row['currency'] == 'XOF':
        return float(row['revenue']) * 25.986
    elif row['currency'] == 'PHP':
        return float(row['revenue']) * 276.896

subsDf['revenue_rupiah'] = subsDf.apply(convert_revenue, axis=1)
subsDf['revenue_rupiah']

0        1000.0
1        8000.0
2           0.0
3        2000.0
4           0.0
          ...  
56698       0.0
56699       0.0
56700       0.0
56701       0.0
56702       0.0
Name: revenue_rupiah, Length: 56703, dtype: float64

In [None]:
subsDf = subsDf.drop(columns='revenue')

In [None]:
subsDf.dropna(subset=['cycle'], inplace=True)

In [None]:
subsDf['subs_date'] = subsDf['subs_date'].str.replace(r'\s+\+\d{4}', '', regex=True)
subsDf['unsubs_date'] = subsDf['unsubs_date'].str.replace(r'\s+\+\d{4}', '', regex=True)
subsDf['freemium_end_date'] = subsDf['freemium_end_date'].str.replace(r'\s+\+\d{4}', '', regex=True)

In [None]:
def convert_date(date_str):
    try:
        return pd.to_datetime(date_str, format='%d/%m/%Y %H:%M').strftime('%Y-%m-%d %H:%M:%S')
    except ValueError:
        try:
            return pd.to_datetime(date_str).strftime('%Y-%m-%d %H:%M:%S')
        except:
            return date_str

In [None]:
subsDf['subs_date'] = subsDf['subs_date'].apply(convert_date)
subsDf['unsubs_date'] = subsDf['unsubs_date'].apply(convert_date)

In [None]:
columns_to_exclude = ['created_at', 'trxid', 'id', 'msisdn', 'freemium_end_date', 'unsubs_date', 'subs_date', 'renewal_date', 'attempt_charging', 'success_billing', 'revenue', 'revenue_rupiah']
columns_to_include = [col for col in subsDf.columns if col not in columns_to_exclude and 'date' not in col.lower()]

unique_values_dict = {col: subsDf[col].unique() for col in columns_to_include}

unique_values_dict

{'country': array(['ID', 'LA', 'OM', 'PH', 'SN', 'TH'], dtype=object),
 'operator': array(['telkomsel', 'telesatpass', 'smartfren', 'ltc', 'tplus', 'etl',
        'omantel', 'ooredo', 'linguisto', 'smart', 'sen-orange-mtarget',
        'aisgemezz', 'ais'], dtype=object),
 'service': array(['slypee', 'PLAYZONE', 'CLOUDPLAY', 'GAMESPOT', 'FUN1', 'QUIZPRO',
        'GAMESC LKT', 'GAZY', 'GMSPAZE', 'GALAYS', 'gemezz', 'quizy',
        'halogame', 'goaly', 'linguisto', 'omantel', '10291', '10292',
        '9109', 'GE', '459509902', '459505105', '459505104', '459505106',
        '459509903', '459509901', 'GEE', 'GED', 'GEA', 'GE7', 'GEF',
        'GE7A', 'GEC', 'GEB', 'GE1'], dtype=object),
 'source': array(['Unknown', 'sms', 'wap', 'cp1', 'lp3', 'USSD', 'portal', 'lp6',
        'lp7', 'default', 'web'], dtype=object),
 'status': array([ 0,  1, -1]),
 'cycle': array(['daily', '2d', '3d', 'weekly', 'monthly'], dtype=object),
 'adnet': array(['nan', 'adg', 'adn', 'soy', 'prt', 'star', 'mbp', '

In [None]:
subsDf['subs_date'] = pd.to_datetime(subsDf['subs_date'], errors='coerce')
subsDf['subs_date'].min(), subsDf['subs_date'].max()

(Timestamp('2015-02-26 20:18:40'), Timestamp('2023-12-12 15:30:00'))

## Calculate CLV

In [None]:
forClv = subsDf.copy()

In [None]:
forClv['unsubs_date'] = forClv['unsubs_date'].apply(lambda x: '2023-12-12' if pd.notna(x) and '9999' in x.lower() else x)
forClv['unsubs_date'] = forClv['unsubs_date'].apply(lambda x: '2023-12-12' if pd.notna(x) and '0001' in x.lower() else x)

In [None]:
forClv['unsubs_date'] = pd.to_datetime(forClv['unsubs_date'], errors='coerce')
ongoing_subs_date = pd.Timestamp('2023-12-12')
forClv['unsubs_date'].fillna(ongoing_subs_date, inplace=True)

forClv.loc[forClv['unsubs_date'] < forClv['subs_date'], 'unsubs_date'] = ongoing_subs_date

forClv['subscription_period_days'] = (forClv['unsubs_date'] - forClv['subs_date']).dt.days

forClv['subscription_period_days'] = forClv['subscription_period_days'].replace(0, 1)

forClv = forClv.groupby('msisdn').agg({
    'subscription_period_days': 'max',
    'revenue_rupiah': 'sum',
    'country': 'first',
    'operator': lambda x: ', '.join(x.unique()),
    'service': lambda x: ', '.join(x.unique()),
    'source': 'first',
    'status': 'first',
    'cycle': 'first',
    'adnet': lambda x: ', '.join(x.dropna().unique()),
    'subs_date': 'first',
    'unsubs_from': 'first',
    'unsubs_date': 'first',
    'service_price': 'first',
    'currency': 'first',
    'publisher': lambda x: ', '.join(x.dropna().unique()),
    'attempt_charging': 'sum',
    'success_billing': 'sum'
}).reset_index()

# Recalculate CLV
forClv['CLV'] = forClv['revenue_rupiah'] / forClv['subscription_period_days']

forClv['revenue_rupiah'].value_counts()

0.000000e+00    34861
1.000000e+03     5856
3.897900e+03     1061
2.000000e+03      787
1.199522e+07      669
                ...  
5.517801e+05        1
1.259498e+06        1
3.130752e+06        1
1.115555e+06        1
2.800000e+04        1
Name: revenue_rupiah, Length: 660, dtype: int64

In [None]:
forClv['CLV'] = forClv['CLV'].replace({-2000: 2000})

In [None]:
conversion_rate = 0.000064
forClv['revenue_usd'] = forClv['revenue_rupiah'] * conversion_rate

forClv[['revenue_rupiah', 'revenue_usd']].head()

Unnamed: 0,revenue_rupiah,revenue_usd
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0


In [None]:
clv_min = forClv['CLV'].min()
clv_max = forClv['CLV'].max()

def segment_clv(clv):
    if clv_min <= clv <= 1000:
        return 'Very Low'
    elif 1001 <= clv <= 10000:
        return 'Low'
    elif 10001 <= clv <= 100000:
        return 'Medium'
    elif 100001 <= clv <= clv_max:
        return 'High'

forClv['Segment'] = forClv['CLV'].apply(segment_clv)

forClv[['CLV', 'Segment']].head()

Unnamed: 0,CLV,Segment
0,0.0,Very Low
1,0.0,Very Low
2,0.0,Very Low
3,0.0,Very Low
4,0.0,Very Low


In [None]:
forClv.operator.value_counts()

smart                     10000
omantel                    8752
telkomsel                  7935
sen-orange-mtarget         6703
ais                        6215
tplus                      3719
aisgemezz                  3697
etl                        3233
ltc                        3031
telesatpass                1175
smartfren                   881
ooredo                      754
linguisto                   328
linguisto, omantel           28
omantel, linguisto           28
aisgemezz, ais               14
ais, aisgemezz                6
telkomsel, telesatpass        1
Name: operator, dtype: int64

In [None]:
forClv.isna().sum()

msisdn                          0
subscription_period_days        0
revenue_rupiah                  0
country                         0
operator                        0
service                         0
source                          0
status                          0
cycle                           0
adnet                           0
subs_date                       0
unsubs_from                 30984
unsubs_date                     0
service_price                   0
currency                        0
publisher                       0
attempt_charging                0
success_billing                 0
CLV                             0
revenue_usd                     0
Segment                         0
dtype: int64

In [None]:
very_low_df = forClv[forClv['Segment'] == 'Very Low']
low_df = forClv[forClv['Segment'] == 'Low']
medium_df = forClv[forClv['Segment'] == 'Medium']
high_df = forClv[forClv['Segment'] == 'High']
very_high_df = forClv[forClv['Segment'] == 'Very High']

## Very Low

In [None]:
very_low_df

Unnamed: 0,msisdn,subscription_period_days,revenue_rupiah,country,operator,service,source,status,cycle,adnet,...,unsubs_from,unsubs_date,service_price,currency,publisher,attempt_charging,success_billing,CLV,revenue_usd,Segment
0,2020008781,16,0.0,LA,etl,slypee,lp3,-1,daily,lig,...,cstool,2023-11-13 14:19:01,2000,LAK,lumos,16,0,0.00,0.000,Very Low
1,2020010903,17,0.0,LA,etl,slypee,lp3,-1,daily,lig,...,cstool,2023-11-13 14:19:01,2000,LAK,lumos,18,0,0.00,0.000,Very Low
2,2020023548,16,0.0,LA,etl,slypee,lp3,-1,daily,lig,...,cstool,2023-11-13 14:19:01,2000,LAK,lumos,16,0,0.00,0.000,Very Low
3,2020025070,16,0.0,LA,etl,slypee,lp3,-1,daily,lig,...,cstool,2023-11-13 14:19:01,2000,LAK,lumos,16,0,0.00,0.000,Very Low
4,2020031236,17,0.0,LA,etl,slypee,lp3,-1,daily,lig,...,cstool,2023-11-13 14:19:01,2000,LAK,lumos,18,0,0.00,0.000,Very Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56495,[MCV]zazcmbTB61O1uSXu6qR3JA==,32,1000.0,ID,smartfren,FUN1,Unknown,0,daily,,...,sms,2023-12-12 00:00:00,1000,IDR,Unknown,1,1,31.25,0.064,Very Low
56496,[MCV]zd9woj036A8m2AbWYkIa7g==,32,1000.0,ID,smartfren,FUN1,Unknown,0,daily,,...,sms,2023-12-12 00:00:00,1000,IDR,Unknown,1,1,31.25,0.064,Very Low
56497,[MCV]zhEIzueBLiPjg/QAgzfvGw==,32,1000.0,ID,smartfren,FUN1,Unknown,0,daily,,...,sms,2023-12-12 00:00:00,1000,IDR,Unknown,2,1,31.25,0.064,Very Low
56498,[MCV]zj7dxevMx0XVBHNPkl1jtA==,32,1000.0,ID,smartfren,FUN1,Unknown,0,daily,,...,sms,2023-12-12 00:00:00,1000,IDR,Unknown,2,1,31.25,0.064,Very Low


In [None]:
very_low_expanded = very_low_df.assign(service=very_low_df['service'].str.split(', ')).explode('service')

user_service_matrix = pd.pivot_table(very_low_expanded, index='msisdn', columns='service', aggfunc='size', fill_value=0)

user_service_matrix.shape, user_service_matrix.head()

((51601, 35),
 service     10291  10292  459505104  459505105  459505106  459509901  \
 msisdn                                                                 
 2020008781      0      0          0          0          0          0   
 2020010903      0      0          0          0          0          0   
 2020023548      0      0          0          0          0          0   
 2020025070      0      0          0          0          0          0   
 2020031236      0      0          0          0          0          0   
 
 service     459509902  459509903  9109  CLOUDPLAY  ...  GMSPAZE  PLAYZONE  \
 msisdn                                             ...                      
 2020008781          0          0     0          0  ...        0         0   
 2020010903          0          0     0          0  ...        0         0   
 2020023548          0          0     0          0  ...        0         0   
 2020025070          0          0     0          0  ...        0         0   
 2020

### COSINE SIMILARITY

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(user_service_matrix.T)

cosine_sim_df = pd.DataFrame(cosine_sim, index=user_service_matrix.columns, columns=user_service_matrix.columns)

cosine_sim_df.head()

service,10291,10292,459505104,459505105,459505106,459509901,459509902,459509903,9109,CLOUDPLAY,...,GMSPAZE,PLAYZONE,QUIZPRO,gemezz,goaly,halogame,linguisto,omantel,quizy,slypee
service,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10291,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10292,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
459505104,0.0,0.0,1.0,0.003453,0.002455,0.00494,0.005959,0.003673,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
459505105,0.0,0.0,0.003453,1.0,0.000994,0.0012,0.000905,0.00223,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
459505106,0.0,0.0,0.002455,0.000994,1.0,0.004551,0.004289,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def recommend_services_cosine(user_id, user_service_matrix, similarity_matrix, top_n=2):
    subscribed_services = user_service_matrix.loc[user_id]
    subscribed_services = subscribed_services[subscribed_services > 0].index.tolist()

    service_similarities = similarity_matrix[subscribed_services].mean(axis=1)

    for service in subscribed_services:
        service_similarities.drop(service, inplace=True, errors='ignore')

    recommended_services = service_similarities.nlargest(top_n).index.tolist()

    return recommended_services

sample_users = very_low_df['msisdn'].sample(n=10, random_state=1)

recommendations = {}
for user in sample_users:
    recommended_services = recommend_services_cosine(user, user_service_matrix, cosine_sim_df)
    recommendations[user] = recommended_services

recommendations

{'2057442966': ['gemezz', 'halogame'],
 '66987584335': ['459509901', '459505104'],
 '2077009471': ['halogame', 'quizy'],
 '639818130586': ['10291', '459505104'],
 'PDKSUB-200-0+rqhyv7vOJwz71gaMRwWeTvt1yMsSoEAhRnktUOQ8E=': ['halogame',
  'quizy'],
 '639486902000': ['10291', '459505104'],
 '639517372642': ['10292', '459505104'],
 '[MCV]C2VUe+nazqujIxqltV8qmg==': ['10291', '10292'],
 '96894332483': ['halogame', 'quizy'],
 '639186884041': ['10292', '459505104']}

In [None]:
very_low_cosine = very_low_df.copy()

very_low_cosine['Recommended_Services'] = very_low_cosine['msisdn'].apply(lambda x: recommend_services_cosine(x, user_service_matrix, cosine_sim_df))

very_low_cosine[['msisdn', 'service', 'Recommended_Services']].head()

Unnamed: 0,msisdn,service,Recommended_Services
0,2020008781,slypee,"[10291, 10292]"
1,2020010903,slypee,"[10291, 10292]"
2,2020023548,slypee,"[10291, 10292]"
3,2020025070,slypee,"[10291, 10292]"
4,2020031236,slypee,"[10291, 10292]"


In [None]:
very_low_cosine['Recommended_Services'].value_counts()

[halogame, quizy]         11603
[10291, 10292]             9952
[10292, 459505104]         8021
[linguisto, omantel]       4986
[459509902, 459505106]     3182
[QUIZPRO, 10291]           2632
[10291, 459505104]         1949
[459509902, 459509901]     1398
[459509901, 459505104]     1257
[gemezz, halogame]         1076
[459509901, 459509902]     1044
[459505104, 459509903]      942
[459505104, 459509902]      837
[459509902, 459505104]      718
[GAMESPOT, CLOUDPLAY]       671
[omantel, goaly]            658
[gemezz, quizy]             319
[linguisto, goaly]          260
[omantel, 10291]             37
[459505104, 459509901]       11
[goaly, 10291]               11
[459509901, 459505106]        8
[linguisto, 10291]            8
[halogame, 10291]             7
[459505104, 459505106]        6
[quizy, 10291]                5
[gemezz, 10291]               1
[CLOUDPLAY, 10291]            1
[GAMESPOT, 10291]             1
Name: Recommended_Services, dtype: int64

### Very Low Cosine Metrics Test

In [None]:
very_low_cosine_test = very_low_cosine[very_low_cosine['service'].str.contains(',')]

In [None]:
import random

very_low_cosine_test['actual_services'] = very_low_cosine_test['service']

very_low_cosine_test['service'] = very_low_cosine_test['service'].apply(lambda x: random.choice(x.split(', ')))

very_low_cosine_test[['msisdn', 'service', 'actual_services']]

Unnamed: 0,msisdn,service,actual_services
3406,2052241312,gemezz,"gemezz, quizy"
3437,2052308156,gemezz,"gemezz, quizy"
3499,2052474283,quizy,"gemezz, quizy"
3639,2052824903,gemezz,"quizy, gemezz"
3738,2054110171,quizy,"gemezz, quizy"
...,...,...,...
38837,96899369161,goaly,"linguisto, goaly"
39002,96899491483,omantel,"omantel, goaly"
39193,96899583406,goaly,"goaly, omantel, linguisto"
47283,SHDC-3QlPeGo6p0A22taJsTa4W3FoibkmgW9IgpEsS3ThPz4=,QUIZPRO,"QUIZPRO, GAMESPOT"


In [None]:
filtered_df = very_low_cosine_test[very_low_cosine_test['actual_services'].str.count(',') == 2]

result_df = filtered_df[['msisdn', 'service', 'actual_services']]

In [None]:
very_low_cosine_test[['msisdn', 'service', 'actual_services']]

Unnamed: 0,msisdn,service,actual_services
3406,2052241312,gemezz,"gemezz, quizy"
3437,2052308156,gemezz,"gemezz, quizy"
3499,2052474283,quizy,"gemezz, quizy"
3639,2052824903,gemezz,"quizy, gemezz"
3738,2054110171,quizy,"gemezz, quizy"
...,...,...,...
38837,96899369161,goaly,"linguisto, goaly"
39002,96899491483,omantel,"omantel, goaly"
39193,96899583406,goaly,"goaly, omantel, linguisto"
47283,SHDC-3QlPeGo6p0A22taJsTa4W3FoibkmgW9IgpEsS3ThPz4=,QUIZPRO,"QUIZPRO, GAMESPOT"


In [None]:
very_low_cosine_test_matrix = pd.pivot_table(very_low_cosine_test, index='msisdn', columns='service', aggfunc='size', fill_value=0)

very_low_cosine_test['Recommended_Services'] = very_low_cosine_test['msisdn'].apply(lambda x: recommend_services_cosine(x, very_low_cosine_test_matrix, cosine_sim_df))

very_low_cosine_test[['msisdn', 'service', 'actual_services', 'Recommended_Services']].head()

Unnamed: 0,msisdn,service,actual_services,Recommended_Services
3406,2052241312,gemezz,"gemezz, quizy","[halogame, quizy]"
3437,2052308156,gemezz,"gemezz, quizy","[halogame, quizy]"
3499,2052474283,quizy,"gemezz, quizy","[gemezz, halogame]"
3639,2052824903,gemezz,"quizy, gemezz","[halogame, quizy]"
3738,2054110171,quizy,"gemezz, quizy","[gemezz, halogame]"


In [None]:
very_low_cosine_test['Recommended_Services'] = very_low_cosine_test['Recommended_Services'].astype(str).str.replace("[\[\]' ]", '', regex=True).str.split(',')
very_low_cosine_test['actual_services'] = very_low_cosine_test['actual_services'].astype(str).str.replace(' ', '').str.split(',')
very_low_cosine_test['service'] = very_low_cosine_test['service'].astype(str).str.split(',')

In [None]:
def calculate_precision(recommended, actual):
    recommended_set = set(recommended)
    actual_set = set(actual)
    precision = 1 if len(recommended_set.intersection(actual_set)) > 0 else 0
    return precision

def calculate_recall(recommended, actual, service):
    relevant_actual_services = set(actual) - set(service)

    if not relevant_actual_services:
        return None

    recommended_set = set(recommended)
    correct_recommendations = recommended_set.intersection(relevant_actual_services)
    recall = len(correct_recommendations) / len(relevant_actual_services)

    return recall

In [None]:
very_low_cosine_test['precision'] = very_low_cosine_test.apply(lambda row: calculate_precision(row['Recommended_Services'], row['actual_services']), axis=1)
very_low_cosine_test['recall'] = very_low_cosine_test.apply(lambda row: calculate_recall(row['Recommended_Services'], row['actual_services'], row['service']), axis=1)

average_cosine_precision = very_low_cosine_test['precision'].mean()
average_cosine_recall = very_low_cosine_test['recall'].mean()

average_cosine_precision, average_cosine_recall

(0.7482014388489209, 0.7446043165467626)

In [None]:
very_low_cosine_test['target_services'] = very_low_cosine_test.apply(lambda row: [service for service in row['actual_services'] if service not in row['service']], axis=1)

# Calculating True Positive (TP), False Positive (FP), False Negative (FN), and True Negative (TN)
def calculate_metrics(row):
    tp = len(set(row['target_services']) & set(row['Recommended_Services']))
    fp = 3 - tp
    fn = len(row['actual_services']) - tp
    tn = 34 - (tp + fp + fn)  # Total services (34) minus (TP + FP + FN)
    accuracy = (tp + tn) / 34
    return pd.Series([tp, fp, fn, tn, accuracy])

very_low_cosine_test[['TP', 'FP', 'FN', 'TN', 'Accuracy']] = very_low_cosine_test.apply(calculate_metrics, axis=1)

average_accuracy = very_low_cosine_test['Accuracy'].mean()

average_accuracy

0.8969530258146424

### JACCARD

In [None]:
from sklearn.metrics import pairwise_distances

very_low_expanded = very_low_df.assign(service=very_low_df['service'].str.split(', ')).explode('service')
user_service_matrix = pd.pivot_table(very_low_expanded, index='msisdn', columns='service', aggfunc='size', fill_value=0)

jaccard_sim = 1 - pairwise_distances(user_service_matrix.T.values, metric="jaccard")

jaccard_sim_df = pd.DataFrame(jaccard_sim, index=user_service_matrix.columns, columns=user_service_matrix.columns)

jaccard_sim_df.head()



service,10291,10292,459505104,459505105,459505106,459509901,459509902,459509903,9109,CLOUDPLAY,...,GMSPAZE,PLAYZONE,QUIZPRO,gemezz,goaly,halogame,linguisto,omantel,quizy,slypee
service,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10291,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10292,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
459505104,0.0,0.0,1.0,0.001697,0.001217,0.002347,0.002985,0.001781,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
459505105,0.0,0.0,0.001697,1.0,0.000497,0.000595,0.000448,0.001114,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
459505106,0.0,0.0,0.001217,0.000497,1.0,0.002241,0.002139,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def recommend_services_jaccard(user_id, user_service_matrix, similarity_matrix, top_n=2):
    subscribed_services = user_service_matrix.loc[user_id]
    subscribed_services = subscribed_services[subscribed_services > 0].index.tolist()

    service_similarities = similarity_matrix[subscribed_services].mean(axis=1)

    for service in subscribed_services:
        service_similarities.drop(service, inplace=True, errors='ignore')

    recommended_services = service_similarities.nlargest(top_n).index.tolist()

    return recommended_services

sample_users = very_low_df['msisdn'].sample(n=10, random_state=1)

recommendations = {}
for user in sample_users:
    recommended_services = recommend_services_jaccard(user, user_service_matrix, jaccard_sim_df)
    recommendations[user] = recommended_services

recommendations

{'2057442966': ['halogame', 'gemezz'],
 '66987584335': ['459509901', '459505104'],
 '2077009471': ['quizy', 'halogame'],
 '639818130586': ['10291', '459505104'],
 'PDKSUB-200-0+rqhyv7vOJwz71gaMRwWeTvt1yMsSoEAhRnktUOQ8E=': ['quizy',
  'halogame'],
 '639486902000': ['10291', '459505104'],
 '639517372642': ['10292', '459505104'],
 '[MCV]C2VUe+nazqujIxqltV8qmg==': ['10291', '10292'],
 '96894332483': ['quizy', 'halogame'],
 '639186884041': ['10292', '459505104']}

In [None]:
very_low_jaccard = very_low_df.copy()

very_low_jaccard['Recommended_Services'] = very_low_jaccard['msisdn'].apply(lambda x: recommend_services_jaccard(x, user_service_matrix, jaccard_sim_df))

very_low_jaccard[['msisdn', 'service', 'Recommended_Services']]

Unnamed: 0,msisdn,service,Recommended_Services
0,2020008781,slypee,"[10291, 10292]"
1,2020010903,slypee,"[10291, 10292]"
2,2020023548,slypee,"[10291, 10292]"
3,2020025070,slypee,"[10291, 10292]"
4,2020031236,slypee,"[10291, 10292]"
...,...,...,...
56495,[MCV]zazcmbTB61O1uSXu6qR3JA==,FUN1,"[10291, 10292]"
56496,[MCV]zd9woj036A8m2AbWYkIa7g==,FUN1,"[10291, 10292]"
56497,[MCV]zhEIzueBLiPjg/QAgzfvGw==,FUN1,"[10291, 10292]"
56498,[MCV]zj7dxevMx0XVBHNPkl1jtA==,FUN1,"[10291, 10292]"


In [None]:
very_low_jaccard['Recommended_Services'].value_counts()

[quizy, halogame]         11603
[10291, 10292]             9952
[10292, 459505104]         8021
[linguisto, omantel]       4986
[459509902, 459505106]     3182
[QUIZPRO, 10291]           2632
[10291, 459505104]         1949
[459509902, 459509901]     1398
[459509901, 459505104]     1257
[halogame, gemezz]         1076
[459509901, 459509902]     1044
[459505104, 459509903]      942
[459505104, 459509902]      835
[459509902, 459505104]      718
[GAMESPOT, CLOUDPLAY]       671
[omantel, goaly]            658
[quizy, gemezz]             319
[linguisto, goaly]          260
[omantel, 10291]             37
[459505104, 459509901]       13
[goaly, 10291]               11
[459509901, 459505106]        8
[linguisto, 10291]            8
[halogame, 10291]             7
[459505104, 459505106]        6
[quizy, 10291]                5
[gemezz, 10291]               1
[CLOUDPLAY, 10291]            1
[GAMESPOT, 10291]             1
Name: Recommended_Services, dtype: int64

### Very Low Jaccard Metrics Test

In [None]:
very_low_jaccard_test = very_low_jaccard[very_low_jaccard['service'].str.contains(',')]

In [None]:
very_low_jaccard_test['actual_services'] = very_low_jaccard_test['service']

very_low_jaccard_test['service'] = very_low_jaccard_test['service'].apply(lambda x: random.choice(x.split(', ')))

very_low_jaccard_test[['msisdn', 'service', 'actual_services']]

Unnamed: 0,msisdn,service,actual_services
3406,2052241312,gemezz,"gemezz, quizy"
3437,2052308156,quizy,"gemezz, quizy"
3499,2052474283,quizy,"gemezz, quizy"
3639,2052824903,gemezz,"quizy, gemezz"
3738,2054110171,gemezz,"gemezz, quizy"
...,...,...,...
38837,96899369161,linguisto,"linguisto, goaly"
39002,96899491483,omantel,"omantel, goaly"
39193,96899583406,goaly,"goaly, omantel, linguisto"
47283,SHDC-3QlPeGo6p0A22taJsTa4W3FoibkmgW9IgpEsS3ThPz4=,GAMESPOT,"QUIZPRO, GAMESPOT"


In [None]:
very_low_jaccard_test_matrix = pd.pivot_table(very_low_jaccard_test, index='msisdn', columns='service', aggfunc='size', fill_value=0)

very_low_jaccard_test['Recommended_Services'] = very_low_jaccard_test['msisdn'].apply(lambda x: recommend_services_jaccard(x, very_low_jaccard_test_matrix, jaccard_sim_df))

very_low_jaccard_test[['msisdn', 'service', 'actual_services', 'Recommended_Services']].head()

Unnamed: 0,msisdn,service,actual_services,Recommended_Services
3406,2052241312,gemezz,"gemezz, quizy","[quizy, halogame]"
3437,2052308156,quizy,"gemezz, quizy","[halogame, gemezz]"
3499,2052474283,quizy,"gemezz, quizy","[halogame, gemezz]"
3639,2052824903,gemezz,"quizy, gemezz","[quizy, halogame]"
3738,2054110171,gemezz,"gemezz, quizy","[quizy, halogame]"


In [None]:
very_low_jaccard_test['Recommended_Services'] = very_low_jaccard_test['Recommended_Services'].astype(str).str.replace("[\[\]' ]", '', regex=True).str.split(',')
very_low_jaccard_test['actual_services'] = very_low_jaccard_test['actual_services'].astype(str).str.replace(' ', '').str.split(',')
very_low_jaccard_test['service'] = very_low_jaccard_test['service'].astype(str).str.split(',')

In [None]:
def calculate_precision(recommended, actual):
    recommended_set = set(recommended)
    actual_set = set(actual)
    precision = 1 if len(recommended_set.intersection(actual_set)) > 0 else 0
    return precision

def calculate_recall(recommended, actual, service):
    relevant_actual_services = set(actual) - set(service)

    if not relevant_actual_services:
        return None

    recommended_set = set(recommended)
    correct_recommendations = recommended_set.intersection(relevant_actual_services)
    recall = len(correct_recommendations) / len(relevant_actual_services)

    return recall

In [None]:
very_low_jaccard_test['precision'] = very_low_jaccard_test.apply(lambda row: calculate_precision(row['Recommended_Services'], row['actual_services']), axis=1)
very_low_jaccard_test['recall'] = very_low_jaccard_test.apply(lambda row: calculate_recall(row['Recommended_Services'], row['actual_services'], row['service']), axis=1)

average_jaccard_precision = very_low_jaccard_test['precision'].mean()
average_jaccard_recall = very_low_jaccard_test['recall'].mean()

average_jaccard_precision, average_jaccard_recall

(0.7913669064748201, 0.7877697841726619)

In [None]:
very_low_jaccard_test['target_services'] = very_low_jaccard_test.apply(lambda row: [service for service in row['actual_services'] if service not in row['service']], axis=1)

def calculate_metrics(row):
    tp = len(set(row['target_services']) & set(row['Recommended_Services']))
    fp = 2 - tp
    fn = len(row['actual_services']) - tp
    tn = 34 - (tp + fp + fn)
    accuracy = (tp + tn) / 34
    return pd.Series([tp, fp, fn, tn, accuracy])

very_low_jaccard_test[['TP', 'FP', 'FN', 'TN', 'Accuracy']] = very_low_jaccard_test.apply(calculate_metrics, axis=1)

average_accuracy = very_low_jaccard_test['Accuracy'].mean()

average_accuracy

0.9289039356749895

### EUCLIDEAN

In [None]:
very_low_expanded = very_low_df.assign(service=very_low_df['service'].str.split(', ')).explode('service')
user_service_matrix = pd.pivot_table(very_low_expanded, index='msisdn', columns='service', aggfunc='size', fill_value=0)

In [None]:
from scipy.spatial.distance import euclidean

euclidean_dist_matrix = pairwise_distances(user_service_matrix.T, metric='euclidean')

euclidean_similarity = 1 / (1 + euclidean_dist_matrix)

euclidean_sim_df = pd.DataFrame(euclidean_similarity, index=user_service_matrix.columns, columns=user_service_matrix.columns)

euclidean_sim_df.head()

service,10291,10292,459505104,459505105,459505106,459509901,459509902,459509903,9109,CLOUDPLAY,...,GMSPAZE,PLAYZONE,QUIZPRO,gemezz,goaly,halogame,linguisto,omantel,quizy,slypee
service,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10291,1.0,0.009916,0.010194,0.010445,0.010384,0.010578,0.010262,0.01051,0.011039,0.009896,...,0.011039,0.010715,0.010611,0.007086,0.008677,0.010828,0.010591,0.010857,0.010371,0.007929
10292,0.009916,1.0,0.016969,0.018222,0.017901,0.018961,0.017291,0.018574,0.022122,0.015683,...,0.022122,0.019795,0.019155,0.008513,0.011827,0.02054,0.019034,0.020742,0.017834,0.010113
459505104,0.010194,0.016969,1.0,0.020199,0.019756,0.021232,0.018978,0.020683,0.025922,0.016868,...,0.025922,0.02237,0.021456,0.008687,0.012309,0.023465,0.021286,0.023768,0.019643,0.010409
459505105,0.010445,0.018222,0.020199,1.0,0.021802,0.023816,0.020723,0.023071,0.031282,0.018097,...,0.031282,0.025529,0.024192,0.008841,0.01276,0.027201,0.023949,0.027679,0.021671,0.010677
459505106,0.010384,0.017901,0.019756,0.021802,1.0,0.023147,0.020288,0.022404,0.029718,0.017782,...,0.029718,0.024662,0.023451,0.008804,0.012649,0.026158,0.02323,0.026582,0.021135,0.010612


In [None]:
def recommend_services_euclidean(user_id, user_service_matrix, similarity_matrix, top_n=2):
    subscribed_services = user_service_matrix.loc[user_id]
    subscribed_services = subscribed_services[subscribed_services > 0].index.tolist()

    service_similarities = similarity_matrix[subscribed_services].mean(axis=1)

    for service in subscribed_services:
        service_similarities.drop(service, inplace=True, errors='ignore')

    recommended_services = service_similarities.nlargest(top_n).index.tolist()

    return recommended_services

sample_users = very_low_df['msisdn'].sample(n=10, random_state=1)

euclidean_recommendations = {}
for user in sample_users:
    recommended_services = recommend_services_euclidean(user, user_service_matrix, euclidean_sim_df)
    euclidean_recommendations[user] = recommended_services

euclidean_recommendations

{'2057442966': ['GALAYS', 'GE1'],
 '66987584335': ['GALAYS', 'GE1'],
 '2077009471': ['GALAYS', 'GE1'],
 '639818130586': ['GALAYS', 'GE1'],
 'PDKSUB-200-0+rqhyv7vOJwz71gaMRwWeTvt1yMsSoEAhRnktUOQ8E=': ['GALAYS', 'GE1'],
 '639486902000': ['GALAYS', 'GE1'],
 '639517372642': ['GALAYS', 'GE1'],
 '[MCV]C2VUe+nazqujIxqltV8qmg==': ['GALAYS', 'GE1'],
 '96894332483': ['GALAYS', 'GE1'],
 '639186884041': ['GALAYS', 'GE1']}

In [None]:
very_low_euclidean = very_low_df.copy()

very_low_euclidean['Recommended_Services'] = very_low_euclidean['msisdn'].apply(
    lambda x: recommend_services_euclidean(x, user_service_matrix, euclidean_sim_df))

very_low_euclidean[['msisdn', 'service', 'Recommended_Services']].head()

Unnamed: 0,msisdn,service,Recommended_Services
0,2020008781,slypee,"[GALAYS, GE1]"
1,2020010903,slypee,"[GALAYS, GE1]"
2,2020023548,slypee,"[GALAYS, GE1]"
3,2020025070,slypee,"[GALAYS, GE1]"
4,2020031236,slypee,"[GALAYS, GE1]"


In [None]:
very_low_euclidean['Recommended_Services'].value_counts()

[GALAYS, GE1]     51599
[GALAYS, 9109]        1
[GE1, 9109]           1
Name: Recommended_Services, dtype: int64

### Very Low Euclidean Metrics Test

In [None]:
very_low_euclidean_test = very_low_euclidean[very_low_euclidean['service'].str.contains(',')]

In [None]:
very_low_euclidean_test['actual_services'] = very_low_euclidean_test['service']

very_low_euclidean_test['service'] = very_low_euclidean_test['service'].apply(lambda x: random.choice(x.split(', ')))

very_low_euclidean_test[['msisdn', 'service', 'actual_services']]

Unnamed: 0,msisdn,service,actual_services
3406,2052241312,quizy,"gemezz, quizy"
3437,2052308156,gemezz,"gemezz, quizy"
3499,2052474283,gemezz,"gemezz, quizy"
3639,2052824903,quizy,"quizy, gemezz"
3738,2054110171,gemezz,"gemezz, quizy"
...,...,...,...
38837,96899369161,goaly,"linguisto, goaly"
39002,96899491483,goaly,"omantel, goaly"
39193,96899583406,linguisto,"goaly, omantel, linguisto"
47283,SHDC-3QlPeGo6p0A22taJsTa4W3FoibkmgW9IgpEsS3ThPz4=,GAMESPOT,"QUIZPRO, GAMESPOT"


In [None]:
very_low_euclidean_test_matrix = pd.pivot_table(very_low_euclidean_test, index='msisdn', columns='service', aggfunc='size', fill_value=0)

very_low_euclidean_test['Recommended_Services'] = very_low_euclidean_test['msisdn'].apply(lambda x: recommend_services_euclidean(x, very_low_euclidean_test_matrix, euclidean_sim_df))

very_low_euclidean_test[['msisdn', 'service', 'actual_services', 'Recommended_Services']].head()

Unnamed: 0,msisdn,service,actual_services,Recommended_Services
3406,2052241312,quizy,"gemezz, quizy","[GALAYS, GE1]"
3437,2052308156,gemezz,"gemezz, quizy","[GALAYS, GE1]"
3499,2052474283,gemezz,"gemezz, quizy","[GALAYS, GE1]"
3639,2052824903,quizy,"quizy, gemezz","[GALAYS, GE1]"
3738,2054110171,gemezz,"gemezz, quizy","[GALAYS, GE1]"


In [None]:
very_low_euclidean_test['Recommended_Services'] = very_low_euclidean_test['Recommended_Services'].astype(str).str.replace("[\[\]' ]", '', regex=True).str.split(',')
very_low_euclidean_test['actual_services'] = very_low_euclidean_test['actual_services'].astype(str).str.replace(' ', '').str.split(',')
very_low_euclidean_test['service'] = very_low_euclidean_test['service'].astype(str).str.split(',')

In [None]:
very_low_euclidean_test['precision'] = very_low_euclidean_test.apply(lambda row: calculate_precision(row['Recommended_Services'], row['actual_services']), axis=1)
very_low_euclidean_test['recall'] = very_low_euclidean_test.apply(lambda row: calculate_recall(row['Recommended_Services'], row['actual_services'], row['service']), axis=1)

average_euclidean_precision = very_low_euclidean_test['precision'].mean()
average_euclidean_recall = very_low_euclidean_test['recall'].mean()

average_euclidean_precision, average_euclidean_recall

(0.0, 0.0)

In [None]:
very_low_euclidean_test['target_services'] = very_low_euclidean_test.apply(lambda row: [service for service in row['actual_services'] if service not in row['service']], axis=1)

def calculate_metrics(row):
    tp = len(set(row['target_services']) & set(row['Recommended_Services']))
    fp = 2 - tp
    fn = len(row['actual_services']) - tp
    tn = 34 - (tp + fp + fn)
    accuracy = (tp + tn) / 34
    return pd.Series([tp, fp, fn, tn, accuracy])

very_low_euclidean_test[['TP', 'FP', 'FN', 'TN', 'Accuracy']] = very_low_euclidean_test.apply(calculate_metrics, axis=1)

average_accuracy = very_low_euclidean_test['Accuracy'].mean()

average_accuracy

0.8819297503173933

### MANHATTAN / CITY BLOCK

In [None]:
very_low_expanded = very_low_df.assign(service=very_low_df['service'].str.split(', ')).explode('service')
user_service_matrix = pd.pivot_table(very_low_expanded, index='msisdn', columns='service', aggfunc='size', fill_value=0)

In [None]:
manhattan_dist_matrix = pairwise_distances(user_service_matrix.T, metric='manhattan')

manhattan_similarity = 1 / (1 + manhattan_dist_matrix)

manhattan_sim_df = pd.DataFrame(manhattan_similarity, index=user_service_matrix.columns, columns=user_service_matrix.columns)

In [None]:
def recommend_services_manhattan(user_id, user_service_matrix, similarity_matrix, top_n=3):
    if user_id not in user_service_matrix.index:
        return []

    subscribed_services = user_service_matrix.loc[user_id]
    subscribed_services = subscribed_services[subscribed_services > 0].index.tolist()

    service_similarities = similarity_matrix[subscribed_services].mean(axis=1)

    for service in subscribed_services:
        service_similarities.drop(service, inplace=True, errors='ignore')

    recommended_services = service_similarities.nlargest(top_n).index.tolist()

    return recommended_services

manhattan_sim_df.head()

service,10291,10292,459505104,459505105,459505106,459509901,459509902,459509903,9109,CLOUDPLAY,...,GMSPAZE,PLAYZONE,QUIZPRO,gemezz,goaly,halogame,linguisto,omantel,quizy,slypee
service,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10291,1.0,0.0001,0.000106,0.000111,0.00011,0.000114,0.000107,0.000113,0.000125,0.0001,...,0.000125,0.000117,0.000115,5.1e-05,7.7e-05,0.00012,0.000115,0.00012,0.00011,6.4e-05
10292,0.0001,1.0,0.000298,0.000344,0.000332,0.000373,0.00031,0.000358,0.000512,0.000254,...,0.000512,0.000408,0.000381,7.4e-05,0.000143,0.00044,0.000376,0.000448,0.00033,0.000104
459505104,0.000106,0.000298,1.0,0.000425,0.000406,0.00047,0.000374,0.000446,0.000708,0.000294,...,0.000708,0.000523,0.000481,7.7e-05,0.000155,0.000577,0.000473,0.000592,0.000401,0.000111
459505105,0.000111,0.000344,0.000425,1.0,0.000497,0.000595,0.000448,0.000557,0.001042,0.00034,...,0.001042,0.000686,0.000614,8e-05,0.000167,0.000781,0.000602,0.00081,0.00049,0.000116
459505106,0.00011,0.000332,0.000406,0.000497,1.0,0.000561,0.000429,0.000525,0.000937,0.000328,...,0.000937,0.000639,0.000576,7.9e-05,0.000164,0.000721,0.000565,0.000745,0.000466,0.000115


In [None]:
sample_users_manhattan = very_low_df['msisdn'].sample(n=10, random_state=1)

manhattan_recommendations = {}
for user in sample_users_manhattan:
    recommended_services = recommend_services_manhattan(user, user_service_matrix, manhattan_sim_df)
    manhattan_recommendations[user] = recommended_services

manhattan_recommendations

{'2057442966': ['GALAYS', 'GE1', '9109'],
 '66987584335': ['GALAYS', 'GE1', '9109'],
 '2077009471': ['GALAYS', 'GE1', '9109'],
 '639818130586': ['GALAYS', 'GE1', '9109'],
 'PDKSUB-200-0+rqhyv7vOJwz71gaMRwWeTvt1yMsSoEAhRnktUOQ8E=': ['GALAYS',
  'GE1',
  '9109'],
 '639486902000': ['GALAYS', 'GE1', '9109'],
 '639517372642': ['GALAYS', 'GE1', '9109'],
 '[MCV]C2VUe+nazqujIxqltV8qmg==': ['GALAYS', 'GE1', '9109'],
 '96894332483': ['GALAYS', 'GE1', '9109'],
 '639186884041': ['GALAYS', 'GE1', '9109']}

In [None]:
manhattan_recommendations.values()

dict_values([['GALAYS', 'GE1', '9109'], ['GALAYS', 'GE1', '9109'], ['GALAYS', 'GE1', '9109'], ['GALAYS', 'GE1', '9109'], ['GALAYS', 'GE1', '9109'], ['GALAYS', 'GE1', '9109'], ['GALAYS', 'GE1', '9109'], ['GALAYS', 'GE1', '9109'], ['GALAYS', 'GE1', '9109'], ['GALAYS', 'GE1', '9109']])

In [None]:
very_low_manhattan = very_low_df.copy()

very_low_manhattan['Recommended_Services'] = very_low_manhattan['msisdn'].apply(
    lambda x: recommend_services_manhattan(x, user_service_matrix, manhattan_sim_df))

very_low_manhattan[['msisdn', 'service', 'Recommended_Services']].head()

Unnamed: 0,msisdn,service,Recommended_Services
0,2020008781,slypee,"[GALAYS, GE1, 9109]"
1,2020010903,slypee,"[GALAYS, GE1, 9109]"
2,2020023548,slypee,"[GALAYS, GE1, 9109]"
3,2020025070,slypee,"[GALAYS, GE1, 9109]"
4,2020031236,slypee,"[GALAYS, GE1, 9109]"


## Low

In [None]:
low_expanded = low_df.assign(service=low_df['service'].str.split(', ')).explode('service')

user_service_matrix = pd.pivot_table(low_expanded, index='msisdn', columns='service', aggfunc='size', fill_value=0)

user_service_matrix.shape, user_service_matrix.head()

((2206, 21),
 service     10291  10292  459505104  459505105  459505106  459509902  \
 msisdn                                                                 
 2022000108      0      0          0          0          0          0   
 2022003350      0      0          0          0          0          0   
 2022007595      0      0          0          0          0          0   
 2022007781      0      0          0          0          0          0   
 2022047204      0      0          0          0          0          0   
 
 service     459509903  CLOUDPLAY  GAMESPOT  GE  ...  GEF  PLAYZONE  QUIZPRO  \
 msisdn                                          ...                           
 2022000108          0          0         0   0  ...    0         0        0   
 2022003350          0          0         0   0  ...    0         0        0   
 2022007595          0          0         0   0  ...    0         0        0   
 2022007781          0          0         0   0  ...    0         0       

In [None]:
cosine_sim = cosine_similarity(user_service_matrix.T)

cosine_sim_df = pd.DataFrame(cosine_sim, index=user_service_matrix.columns, columns=user_service_matrix.columns)

cosine_sim_df.iloc[:10, :10]

service,10291,10292,459505104,459505105,459505106,459509902,459509903,CLOUDPLAY,GAMESPOT,GE
service,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10291,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10292,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
459505104,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
459505105,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
459505106,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
459509902,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
459509903,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
CLOUDPLAY,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
GAMESPOT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
GE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
sample_users = low_df['msisdn'].sample(n=10, random_state=1)

recommendations = {}
for user in sample_users:
    recommended_services = recommend_services_cosine(user, user_service_matrix, cosine_sim_df, 2)
    recommendations[user] = recommended_services

recommendations

{'96892422954': ['omantel', 'goaly'],
 '96893615720': ['omantel', 'goaly'],
 '96891417174': ['linguisto', '10291'],
 '2054295844': ['quizy', '10291'],
 '2028907780': ['10291', '10292'],
 '2028880980': ['10291', '10292'],
 '96891768018': ['linguisto', '10291'],
 '2056264925': ['quizy', '10291'],
 '2028039239': ['10291', '10292'],
 '2052383096': ['quizy', '10291']}

In [None]:
low_df['Recommended_Services'] = low_df['msisdn'].apply(lambda x: recommend_services_cosine(x, user_service_matrix, cosine_sim_df))

low_df[['msisdn', 'service', 'Recommended_Services']].head()

Unnamed: 0,msisdn,service,Recommended_Services
357,2022000108,slypee,"[10291, 10292]"
359,2022003350,slypee,"[10291, 10292]"
362,2022007595,slypee,"[10291, 10292]"
365,2022007781,slypee,"[10291, 10292]"
376,2022047204,slypee,"[10291, 10292]"


In [None]:
low_df['Recommended_Services'].value_counts()

[quizy, 10291]        768
[linguisto, 10291]    667
[10291, 10292]        505
[omantel, goaly]      194
[goaly, 10291]         27
[10292, 459505104]     21
[omantel, 10291]       13
[gemezz, 10291]         7
[10291, 459505104]      4
Name: Recommended_Services, dtype: int64

In [None]:
low_df[low_df['service'].str.contains(',')]

Unnamed: 0,msisdn,subscription_period_days,revenue_rupiah,country,operator,service,source,status,cycle,adnet,...,unsubs_date,service_price,currency,publisher,attempt_charging,success_billing,CLV,revenue_usd,Segment,Recommended_Services
3440,2052317347,64,137250.0,LA,ltc,"quizy, gemezz",cp1,1,daily,marvel,...,2023-12-12,1000,LAK,lumos,246,178,2144.53125,8.784,Low,"[10291, 10292]"
30280,96871900330,225,575770.6,OM,"omantel, linguisto","linguisto, omantel",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,50,48,2558.980288,36.849316,Low,"[goaly, 10291]"
30518,96872258102,142,155937.9,OM,omantel,"linguisto, goaly",cp1,1,daily,"forest, fmd",...,2023-12-12,300,Baisa,lumos,15,13,1098.153953,9.980023,Low,"[omantel, 10291]"
30677,96872513704,80,155937.9,OM,"linguisto, omantel","omantel, linguisto",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,13,13,1949.223266,9.980023,Low,"[goaly, 10291]"
30942,96872767787,45,47980.88,OM,omantel,"linguisto, goaly",cp1,1,daily,"forest, fmd",...,2023-12-12,300,Baisa,lumos,4,4,1066.241787,3.070776,Low,"[omantel, 10291]"
31273,96890158279,63,107957.0,OM,omantel,"linguisto, goaly",cp1,1,daily,"forest, adcuesta",...,2023-12-12,300,Baisa,lumos,11,9,1713.602871,6.909247,Low,"[omantel, 10291]"
31313,96890191109,277,1079570.0,OM,"omantel, linguisto","linguisto, omantel",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,92,90,3897.363931,69.092468,Low,"[goaly, 10291]"
31360,96890606730,57,383847.0,OM,omantel,"goaly, linguisto",cp1,-1,daily,"fmd, forest",...,2023-12-12,300,Baisa,lumos,32,32,6734.158653,24.566211,Low,"[omantel, 10291]"
31737,96890979408,114,119952.2,OM,"linguisto, omantel","omantel, linguisto",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,12,10,1052.212289,7.676941,Low,"[goaly, 10291]"
31855,96891125551,84,143942.6,OM,"omantel, linguisto","linguisto, omantel",cp1,-1,daily,forest,...,2023-12-12,300,Baisa,lumos,14,12,1713.602871,9.212329,Low,"[goaly, 10291]"


## Medium

In [None]:
medium_expanded = medium_df.assign(service=medium_df['service'].str.split(', ')).explode('service')

user_service_matrix = pd.pivot_table(medium_expanded, index='msisdn', columns='service', aggfunc='size', fill_value=0)

user_service_matrix.shape, user_service_matrix.head()

((842, 8),
 service     459505105  GE  GED  gemezz  goaly  linguisto  omantel  slypee
 msisdn                                                                   
 2028284237          0   0    0       0      0          0        0       1
 2051133274          0   0    0       1      0          0        0       0
 2051552293          0   0    0       1      0          0        0       0
 2052014197          0   0    0       1      0          0        0       0
 2052024987          0   0    0       1      0          0        0       0)

In [None]:
cosine_sim = cosine_similarity(user_service_matrix.T)

cosine_sim_df = pd.DataFrame(cosine_sim, index=user_service_matrix.columns, columns=user_service_matrix.columns)

cosine_sim_df.iloc[:10, :10]

service,459505105,GE,GED,gemezz,goaly,linguisto,omantel,slypee
service,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
459505105,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GE,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
GED,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
gemezz,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
goaly,0.0,0.0,0.0,0.0,1.0,0.0,0.017744,0.0
linguisto,0.0,0.0,0.0,0.0,0.0,1.0,0.467707,0.0
omantel,0.0,0.0,0.0,0.0,0.017744,0.467707,1.0,0.0
slypee,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
sample_users = medium_df['msisdn'].sample(n=10, random_state=1)

recommendations = {}
for user in sample_users:
    recommended_services = recommend_services_cosine(user, user_service_matrix, cosine_sim_df)
    recommendations[user] = recommended_services

recommendations

{'96891174502': ['omantel', '459505105'],
 '2058926539': ['459505105', 'GE'],
 '96893656347': ['omantel', '459505105'],
 '2055871818': ['459505105', 'GE'],
 '2076800511': ['459505105', 'GE'],
 '2055582621': ['459505105', 'GE'],
 '2077155303': ['459505105', 'GE'],
 '2058684684': ['459505105', 'GE'],
 '96895900477': ['459505105', 'GE'],
 '2052127828': ['459505105', 'GE']}

In [None]:
medium_df['Recommended_Services'] = medium_df['msisdn'].apply(lambda x: recommend_services_cosine(x, user_service_matrix, cosine_sim_df))
medium_df[['msisdn', 'service', 'Recommended_Services']].head()

Unnamed: 0,msisdn,service,Recommended_Services
2169,2028284237,slypee,"[459505105, GE]"
3238,2051133274,gemezz,"[459505105, GE]"
3243,2051552293,gemezz,"[459505105, GE]"
3302,2052014197,gemezz,"[459505105, GE]"
3308,2052024987,gemezz,"[459505105, GE]"


In [None]:
medium_df['Recommended_Services'].value_counts()

[omantel, 459505105]      417
[459505105, GE]           412
[goaly, 459505105]          7
[459505105, GED]            4
[GE, GED]                   1
[linguisto, 459505105]      1
Name: Recommended_Services, dtype: int64

In [None]:
medium_df[medium_df['service'].str.contains(',')]

Unnamed: 0,msisdn,subscription_period_days,revenue_rupiah,country,operator,service,source,status,cycle,adnet,...,unsubs_date,service_price,currency,publisher,attempt_charging,success_billing,CLV,revenue_usd,Segment,Recommended_Services
34431,96892724258,175,1967216.0,OM,"linguisto, omantel","omantel, linguisto",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,166,164,11241.234837,125.90183,Medium,"[goaly, 459505105]"
34478,96892750798,278,5625758.0,OM,"omantel, linguisto","linguisto, omantel",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,469,469,20236.540385,360.048527,Medium,"[goaly, 459505105]"
34495,96892755650,150,1859259.0,OM,"linguisto, omantel","omantel, linguisto",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,155,155,12395.06077,118.992583,Medium,"[goaly, 459505105]"
35290,96893354620,305,23990440.0,OM,"omantel, linguisto","goaly, omantel",cp1,1,daily,"fmd, forest",...,2023-12-12,300,Rial,lumos,4,2,78657.180984,1535.388173,Medium,"[linguisto, 459505105]"
37415,96898269669,234,4438231.0,OM,"linguisto, omantel","omantel, linguisto",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,370,370,18966.801013,284.046812,Medium,"[goaly, 459505105]"
37935,96898914466,86,1043584.0,OM,"omantel, linguisto","linguisto, omantel",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,87,87,12134.699403,66.789386,Medium,"[goaly, 459505105]"
38796,96899341023,77,827670.2,OM,"omantel, linguisto","linguisto, omantel",cp1,1,daily,forest,...,2023-12-12,300,Baisa,lumos,69,69,10748.963466,52.970892,Medium,"[goaly, 459505105]"
38819,96899359827,144,2111159.0,OM,"omantel, linguisto","linguisto, omantel",cp1,-1,daily,forest,...,2023-12-12,300,Baisa,lumos,176,176,14660.824567,135.114159,Medium,"[goaly, 459505105]"


## High

In [None]:
high_expanded = high_df.assign(service=high_df['service'].str.split(', ')).explode('service')
user_service_matrix = pd.pivot_table(high_expanded, index='msisdn', columns='service', aggfunc='size', fill_value=0)
user_service_matrix.shape, user_service_matrix.head()

((1851, 4),
 service      gemezz  goaly  linguisto  omantel
 msisdn                                        
 2054933626        1      0          0        0
 2055383083        1      0          0        0
 96871040071       0      1          0        0
 96871100702       0      1          0        0
 96871102317       0      1          0        0)

In [None]:
cosine_sim = cosine_similarity(user_service_matrix.T)
cosine_sim_df = pd.DataFrame(cosine_sim, index=user_service_matrix.columns, columns=user_service_matrix.columns)
cosine_sim_df.iloc[:10, :10]

service,gemezz,goaly,linguisto,omantel
service,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gemezz,1.0,0.0,0.0,0.0
goaly,0.0,1.0,0.04028,0.023256
linguisto,0.0,0.04028,1.0,0.0
omantel,0.0,0.023256,0.0,1.0


In [None]:
sample_users = high_df['msisdn'].sample(n=10, random_state=1)

recommendations = {}
for user in sample_users:
    recommended_services = recommend_services_cosine(user, user_service_matrix, cosine_sim_df)
    recommendations[user] = recommended_services

recommendations

{'96891743067': ['linguisto', 'omantel'],
 '96893590535': ['linguisto', 'omantel'],
 '96893941274': ['linguisto', 'omantel'],
 '96893550006': ['linguisto', 'omantel'],
 '96891312869': ['linguisto', 'omantel'],
 '96899481122': ['linguisto', 'omantel'],
 '96890685565': ['linguisto', 'omantel'],
 '96891744543': ['linguisto', 'omantel'],
 '96899506336': ['linguisto', 'omantel'],
 '96892217366': ['linguisto', 'omantel']}

In [None]:
high_df['Recommended_Services'] = high_df['msisdn'].apply(lambda x: recommend_services_cosine(x, user_service_matrix, cosine_sim_df))
high_df[['msisdn', 'service', 'Recommended_Services']].head()

Unnamed: 0,msisdn,service,Recommended_Services
4095,2054933626,gemezz,"[goaly, linguisto]"
4314,2055383083,gemezz,"[goaly, linguisto]"
29916,96871040071,goaly,"[linguisto, omantel]"
29917,96871100702,goaly,"[linguisto, omantel]"
29919,96871102317,goaly,"[linguisto, omantel]"


In [None]:
high_df['Recommended_Services'].value_counts()

[linguisto, omantel]    1845
[omantel, gemezz]          3
[goaly, linguisto]         2
[linguisto, gemezz]        1
Name: Recommended_Services, dtype: int64