<a href="https://colab.research.google.com/github/1911521010UlfatmiHanifa/Kelompok7-APM/blob/main/rule_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive to this runtime at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import pandas, the DataFrame library used to load and analyse the data.
import pandas as pd

In [24]:
# Read persona.csv from the mounted Google Drive into a pandas DataFrame.
# The absolute path matches the '/content/drive' mount point, so the read does
# not depend on the notebook's current working directory.
df = pd.read_csv('/content/drive/MyDrive/APM/persona.csv')
df.head()  # Preview the first 5 rows as a quick sanity check

Unnamed: 0,PRICE,SOURCE,SEX,COUNTRY,AGE
0,39,android,male,bra,17
1,39,android,male,bra,17
2,49,android,male,bra,17
3,29,android,male,tur,17
4,49,android,male,tur,17


In [26]:
# Check the size of the dataset before looking at descriptive statistics
df.shape  # (number of rows, number of columns)

(5000, 5)

In [27]:
# Descriptive statistics of the numeric columns, transposed so that each
# column of df becomes one row of the summary
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PRICE,5000.0,34.132,12.464897,9.0,29.0,39.0,39.0,59.0
AGE,5000.0,23.5814,8.995908,15.0,17.0,21.0,27.0,66.0


In [29]:
# True if at least one cell anywhere in the DataFrame is missing
df.isna().to_numpy().any()

False

In [30]:
# Number of missing values in each column
df.isna().sum()

PRICE      0
SOURCE     0
SEX        0
COUNTRY    0
AGE        0
dtype: int64

In [31]:
# Number of distinct values in the SOURCE column
df["SOURCE"].nunique() # Distinct SOURCE values (missing values are not counted by default)


2

In [32]:
df["SOURCE"].value_counts()# Number of rows per SOURCE value, most frequent first


android    2974
ios        2026
Name: SOURCE, dtype: int64

In [33]:
df["COUNTRY"].value_counts() # Number of rows per COUNTRY value, most frequent first

usa    2065
bra    1496
deu     455
tur     451
fra     303
can     230
Name: COUNTRY, dtype: int64

In [34]:
# Average income (PRICE) broken down by country

# The aggregation is passed as a list rather than a set literal: a list is the
# documented list-like form for agg() and keeps the output column order
# deterministic. The result is still a DataFrame with a single "mean" column.
df.groupby("COUNTRY")["PRICE"].agg(["mean"])


Unnamed: 0_level_0,mean
COUNTRY,Unnamed: 1_level_1
bra,34.32754
can,33.608696
deu,34.032967
fra,33.587459
tur,34.78714
usa,34.007264


In [35]:
# Average income (PRICE) for every COUNTRY / SOURCE pair
(
    df
    .groupby(["COUNTRY", "SOURCE"])["PRICE"]
    .mean()
)

COUNTRY  SOURCE 
bra      android    34.387029
         ios        34.222222
can      android    33.330709
         ios        33.951456
deu      android    33.869888
         ios        34.268817
fra      android    34.312500
         ios        32.776224
tur      android    36.229437
         ios        33.272727
usa      android    33.760357
         ios        34.371703
Name: PRICE, dtype: float64

In [None]:
# Mean PRICE for every COUNTRY / SOURCE / SEX / AGE combination,
# ordered from the highest to the lowest average
agg_df = (
    df
    .groupby(["COUNTRY", "SOURCE", "SEX", "AGE"])["PRICE"]
    .mean()
    .sort_values(ascending=False)
)
agg_df.head()

COUNTRY  SOURCE   SEX     AGE
bra      android  male    46     59.0
usa      android  male    36     59.0
fra      android  female  24     59.0
usa      ios      male    32     54.0
deu      android  female  36     49.0
Name: PRICE, dtype: float64

In [None]:
# Turn the group keys (COUNTRY, SOURCE, SEX, AGE) from index levels into
# ordinary columns. reset_index() returns a new object, so the result is
# assigned back to agg_df.
# The Series check keeps this cell idempotent: re-running it on the already
# flattened frame would otherwise add a spurious "index" column.
if isinstance(agg_df, pd.Series):
    agg_df = agg_df.reset_index()
agg_df.head()  # Inspect the new integer index and the former index levels

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE
0,bra,android,male,46,59.0
1,usa,android,male,36,59.0
2,fra,android,female,24,59.0
3,usa,ios,male,32,54.0
4,deu,android,female,36,49.0


In [None]:
# Bucket AGE into labelled ranges and store the bucket as a new AGE_CUT column.
# Each bin includes its right edge, e.g. age 18 falls into '0_18'.
my_labels = ['0_18', '19_23', '24_30', '31_40', '41_70']
agg_df["AGE_CUT"] = pd.cut(
    agg_df["AGE"],
    bins=[0, 18, 23, 30, 40, 70],
    labels=my_labels,
)
agg_df.tail(10)  # Spot-check the last 10 rows

Unnamed: 0,COUNTRY,SOURCE,SEX,AGE,PRICE,AGE_CUT
338,bra,android,male,23,21.5,19_23
339,tur,android,male,21,19.0,19_23
340,tur,ios,male,47,19.0,41_70
341,bra,ios,female,34,19.0,31_40
342,bra,ios,male,47,19.0,41_70
343,usa,ios,female,38,19.0,31_40
344,usa,ios,female,30,19.0,24_30
345,can,android,female,27,19.0,24_30
346,fra,android,male,18,19.0,0_18
347,deu,android,male,26,9.0,24_30


In [None]:
# Build the persona key "<COUNTRY>_<SOURCE>_<SEX>_<AGE_CUT>" for every row.
# Columns are selected by name rather than by position in agg_df.values, so the
# key stays correct if the column order changes or if the cell is re-run after
# "customers_level_based" has already been appended as the last column.
agg_df["customers_level_based"] = [
    f"{country}_{source}_{sex}_{age_cut}"
    for country, source, sex, age_cut in zip(
        agg_df["COUNTRY"], agg_df["SOURCE"], agg_df["SEX"], agg_df["AGE_CUT"]
    )
]

In [None]:
# Collapse rows that share a persona key into one row holding their mean PRICE,
# ordered from the highest to the lowest mean.
agg_df = (
    agg_df[["customers_level_based", "PRICE"]]
    .groupby("customers_level_based")
    .agg({"PRICE": "mean"})
    .sort_values(by="PRICE", ascending=False)
    .reset_index()
)
agg_df["customers_level_based"].head()  # Preview the top persona keys

0    fra_android_female_24_30
1          tur_ios_male_24_30
2          tur_ios_male_31_40
3    tur_android_female_31_40
4      can_android_male_19_23
Name: customers_level_based, dtype: object

In [21]:
# Assign each persona to one of four quantile-based PRICE segments:
# "D" is the lowest-priced quartile and "A" the highest.
agg_df["SEGMENT"] = pd.qcut(agg_df["PRICE"], q=4, labels=list("DCBA"))
agg_df.head()

Unnamed: 0,customers_level_based,PRICE,SEGMENT
0,fra_android_female_24_30,45.428571,A
1,tur_ios_male_24_30,45.0,A
2,tur_ios_male_31_40,42.333333,A
3,tur_android_female_31_40,41.833333,A
4,can_android_male_19_23,40.111111,A


In [22]:
# Summarise PRICE per segment, then look more closely at segment "C".

# display() is required here: a notebook only renders a cell's last expression,
# so without it this per-segment summary was computed and silently discarded.
display(agg_df.groupby(["SEGMENT"]).agg({"PRICE": ["mean", "max", "sum"]}))

# Descriptive statistics of the personas that fall into segment "C"
agg_df[agg_df["SEGMENT"] == "C"].describe()

Unnamed: 0,PRICE
count,27.0
mean,33.509674
std,0.492587
min,32.5
25%,33.0
50%,33.627634
75%,34.0
max,34.07734


In [23]:
# Look up the mean PRICE and segment of the persona a new customer belongs to
new_user = "fra_android_male_24_30"
print(agg_df.loc[agg_df["customers_level_based"].eq(new_user)])

     customers_level_based  PRICE SEGMENT
74  fra_android_male_24_30   33.0       C
