In [4]:
import pandas as pd

# Load your dataset
df = pd.read_csv('/content/sample-data-audio - 2-months.csv')

# Convert date to datetime objects for time-based analysis
df['date'] = pd.to_datetime(df['date'])

In [12]:
print(df.columns)

Index(['date', 'user', 'pos_id', 'pos_name', 'product_id', 'product_name',
       'generic_name', 'brand', 'gender_name', 'level1_name', 'level2_name',
       'level3_name', 'pincode', 'city', 'state', 'country', 'price',
       'male_score', 'age<25_score', 'age25-40_score', 'age40+_score'],
      dtype='object')


Defining my Weights

In [5]:
GENDER_WEIGHTS = {
    'gaming': 1.5,      # Male
    'headset': 1.5,     # Male
    'pink': -2.0,       # Female (using negative for the other gender)
    'rose gold': -2.0   # Female
}

AGE_WEIGHTS = {
    # Word -> (<25 score, 25-40 score, 40+ score)
    'gaming': (2.0, 0, 0),
    'portable': (0, 1.0, 0),
    'amplifier': (0, 0, 2.5),
    'audiophile': (0, 0, 3.0)
}

Applying Product Weights

In [6]:
def assign_product_scores(row):
    # Initializing scores for this single transaction
    male_score = 0
    age_under_25_score = 0
    age_25_40_score = 0
    age_over_40_score = 0

    product_name = row['product_name'].lower()

    # Gender scoring
    for keyword, weight in GENDER_WEIGHTS.items():
        if keyword in product_name:
            male_score += weight

    # Age scoring
    for keyword, weights in AGE_WEIGHTS.items():
        if keyword in product_name:
            age_under_25_score += weights[0]
            age_25_40_score += weights[1]
            age_over_40_score += weights[2]

    return male_score, age_under_25_score, age_25_40_score, age_over_40_score

# Applying the function to each row
df[['male_score', 'age<25_score', 'age25-40_score', 'age40+_score']] = df.apply(assign_product_scores, axis=1, result_type='expand')

In [9]:
# Group by user and sum up all their scores
user_scores = df.groupby('user')[['male_score', 'age<25_score', 'age25-40_score', 'age40+_score']].sum()

 Classifying Users

In [10]:
# Classify Gender
user_scores['predicted_gender'] = user_scores['male_score'].apply(lambda x: 'Male' if x > 0 else 'Female')

# Classify Age Group
age_cols = ['age<25_score', 'age25-40_score', 'age40+_score']
user_scores['predicted_age_group'] = user_scores[age_cols].idxmax(axis=1)

# Display the final classification
print(user_scores[['predicted_gender', 'predicted_age_group']].head())

           predicted_gender predicted_age_group
user                                           
user-1               Female        age<25_score
user-10                Male        age<25_score
user-100             Female        age<25_score
user-1000            Female        age<25_score
user-10000           Female      age25-40_score
