In [4]:
import pandas as pd
from ftfy import fix_text
from pathlib import Path
import unimib_snowit_project.utils as u
import numpy as np
import dill

In [5]:
# Resolve the directory that holds the pickled DataFrames
root_dir_path = u.get_root_dir()
df_in_dir = "data_loaded"
data_pkl_dir_path = root_dir_path.joinpath(df_in_dir)

# Pickle files to load, one per dataset
pkl_files = [
    'users.pkl',
    'profiles.pkl',
    'cards.pkl',
    'orders.pkl',
    'order_details.pkl',
    'reviews.pkl',
    'reviews_labelled.pkl',
]

# Dataset name (file name up to its first dot) -> full path of the pickle
pkl_paths = dict(
    (file_name.split('.')[0], data_pkl_dir_path.joinpath(file_name))
    for file_name in pkl_files
)

# Read every pickle into a dict keyed by dataset name and report its size.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
dfs = {}
for name, path in pkl_paths.items():
    dfs[name] = pd.read_pickle(path)
    print(f"{name} loaded: {dfs[name].shape[0]} rows, {dfs[name].shape[1]} columns")

# Convenience handles on the individual DataFrames
(
    users_df,
    profiles_df,
    cards_df,
    orders_df,
    order_details_df,
    reviews_df,
    labelled_reviews_df,
) = (
    dfs[key]
    for key in (
        'users', 'profiles', 'cards', 'orders',
        'order_details', 'reviews', 'reviews_labelled',
    )
)

users loaded: 728598 rows, 14 columns
profiles loaded: 69025 rows, 10 columns
cards loaded: 626523 rows, 5 columns
orders loaded: 385168 rows, 13 columns
order_details loaded: 993037 rows, 15 columns
reviews loaded: 93429 rows, 3 columns
reviews_labelled loaded: 327522 rows, 3 columns


# CHECK IF ALL COLUMNS OF ALL FILES ARE CORRECT

In [6]:
# ftfy check function
def check_ftfy_needed(df: pd.DataFrame, name: str) -> None:
    """Print, for every text column of ``df``, whether ftfy would change any value.

    Columns of dtype ``object`` or ``string`` are scanned. A value counts as
    needing cleaning when ``fix_text`` returns something different from its
    string form; list/tuple values are inspected element by element.

    Args:
        df: DataFrame to inspect.
        name: Label printed in the report header.

    Returns:
        None. The report is written to stdout.
    """

    def safe_check(x) -> bool:
        """Return True when ftfy would alter ``x`` (or any element of it)."""
        try:
            # Containers are handled before pd.isna: on a multi-element list
            # pd.isna returns an array whose truth value raises ValueError, which
            # the handler below would swallow and report the row as clean.
            if isinstance(x, (list, tuple)):
                return any(safe_check(item) for item in x)
            if pd.isna(x):
                return False
            return str(x) != fix_text(str(x))
        except Exception:
            # if there is an unhandled value, we consider it clean for safety
            return False

    string_cols = df.select_dtypes(include=['object', 'string'])
    print(f"\nChecking dataset: {name}")
    any_issues = False

    # Iterating a DataFrame yields its column labels
    for col in string_cols:
        diffs = df[col].apply(safe_check)

        if diffs.any():
            print(f"  - Column '{col}' contains text that needs cleaning")
            any_issues = True
        else:
            print(f"  - Column '{col}' seems already clean")

    if not any_issues:
        print("  All text columns seem already clean ✅")


# Run the ftfy check on every DataFrame loaded above
for name in dfs:
    df = dfs[name]
    check_ftfy_needed(df, name)


Checking dataset: users
  - Column 'user.uid' seems already clean
  - Column 'source' seems already clean
  - Column 'city' seems already clean
  - Column 'language' seems already clean
  - Column 'referral.medium' seems already clean
  - Column 'referral.source' seems already clean
  - Column 'favouriteZones' contains text that needs cleaning

Checking dataset: profiles
  - Column 'user.uid' seems already clean
  - Column 'profile.uid' seems already clean
  - Column 'sex' seems already clean
  - Column 'city' seems already clean
  - Column 'level' seems already clean
  - Column 'types' seems already clean
  All text columns seem already clean ✅

Checking dataset: cards
  - Column 'card.uid' seems already clean
  - Column 'status' seems already clean
  - Column 'user.uid' seems already clean
  All text columns seem already clean ✅

Checking dataset: orders
  - Column 'order.uid' seems already clean
  - Column 'user.uid' seems already clean
  - Column 'paymentGateway' seems already cle

The columns
- 'favouriteZones' of users_df,
- 'item.zoneName', 'item.variantName', 'item.slotName' of order_details_df,
- 'text' of reviews_df,
- 'text' of labelled_reviews_df

need further adjustments.

## CHECK WHY COLUMNS NEED ADJUSTMENTS

In [7]:
# Boolean mask: rows where ftfy would change the value
def needs_fix(x):
    """Return True when ftfy's ``fix_text`` would change ``x``.

    Lists and tuples are checked element by element, so a container is
    flagged as soon as one of its items would be changed. Missing values
    (None, NaN, pd.NA, NaT) never need fixing.

    Args:
        x: A cell value: scalar, string, list/tuple of values, or missing.

    Returns:
        bool: True if the value (or any contained item) would be altered.
    """
    try:
        # Containers are handled before pd.isna: on a multi-element list
        # pd.isna returns an array whose truth value raises ValueError, which
        # the handler below would swallow and report the row as clean.
        if isinstance(x, (list, tuple)):
            return any(needs_fix(item) for item in x)
        if pd.isna(x):
            return False
        return str(x) != fix_text(str(x))
    except Exception:
        # Best effort: values that cannot be inspected are treated as clean
        return False

In [8]:
def fix_value(x):
    """Return ``x`` with its text repaired by ftfy's ``fix_text``.

    Missing scalars (None, NaN, pd.NA, NaT) are returned unchanged instead of
    being turned into strings such as ``'<NA>'``. A list is repaired item by
    item, keeping missing items as they are. Any other value is converted to
    ``str`` and repaired.

    Args:
        x: A cell value: scalar, string, list of values, or missing.

    Returns:
        The repaired string, a new list of repaired items, or ``x`` itself
        when it is missing or cannot be processed.
    """

    def _is_missing(value) -> bool:
        # is_scalar guards pd.isna, which returns an array for list-likes
        return value is None or (pd.api.types.is_scalar(value) and bool(pd.isna(value)))

    try:
        # Leave missing values unchanged
        if _is_missing(x):
            return x

        # If x is a list, fix each non-missing element
        if isinstance(x, list):
            return [item if _is_missing(item) else fix_text(str(item)) for item in x]

        # Otherwise, fix the single value
        return fix_text(str(x))
    except Exception:
        # In case of unexpected values, return original
        return x

### USERS DF

In [9]:
col = 'favouriteZones'  # column under inspection

# Flag the rows whose value ftfy would modify
to_fix = users_df[col].apply(needs_fix)

# How many rows are flagged
num_to_fix = to_fix.sum()
print(f"Total rows that would need fixing in '{col}': {num_to_fix}")

if not num_to_fix > 0:
    print("All values appear to be clean ✅")
else:
    print("\nProblematic values (original vs corrected):")

    # Values of the flagged rows
    problematic_rows = users_df.loc[to_fix, col]
    # De-duplicate them; lists are unhashable, so they are stored as tuples
    unique_values = {
        tuple(val) if isinstance(val, list) else val
        for val in problematic_rows
    }

    # Show every distinct value next to its ftfy-corrected form
    for val in unique_values:
        if not isinstance(val, tuple):
            print(f"\nOriginal: {val}")
            print(f"Corrected: {fix_text(str(val))}")
        else:
            corrected = [fix_text(str(x)) for x in val]
            print(f"\nOriginal: {val}")
            print(f"Corrected: {corrected}")

    print("\n-----------------------------------")
    print(f"\nUnique rows that needed fixing: {len(unique_values)}")

Total rows that would need fixing in 'favouriteZones': 54

Problematic values (original vs corrected):

Original: ('Gressoney-La-Trinit√© - Monterosa Ski',)
Corrected: ['Gressoney-La-Trinité - Monterosa Ski']

-----------------------------------

Unique rows that needed fixing: 1


In [10]:
# Overwrite the column with its ftfy-repaired values
users_df[col] = users_df[col].apply(fix_value)

# Sanity check: show the rows holding a non-empty list of favourite zones
print(
    users_df.loc[
        users_df['favouriteZones'].apply(lambda zones: isinstance(zones, list) and len(zones) > 0),
        'favouriteZones',
    ]
)

278       [Alagna - Monterosa Ski, Gressoney-La-Trinité ...
347                                                  [Pila]
382                                                  [Pila]
431                  [Chiesa Valmalenco Bernina Ski Resort]
460                                                  [Pila]
                                ...                        
728232                           [Corno alle Scale, Cimone]
728239    [Andalo, Badia - Alta Badia, Cimone, Corno all...
728363                                           [Folgaria]
728429                                            [Livigno]
728493                                            [Foppolo]
Name: favouriteZones, Length: 4631, dtype: object


### ORDER DETAILS DF

In [11]:
# Text columns of order_details_df to inspect
cols = ['item.zoneName', 'item.variantName', 'item.slotName']

for col in cols:
    print(f"\n--- Column: {col} ---")

    # Flag the rows whose value ftfy would modify
    to_fix = order_details_df[col].apply(needs_fix)

    # How many rows are flagged
    num_to_fix = to_fix.sum()
    print(f"Total rows that would need fixing: {num_to_fix}")

    if num_to_fix == 0:
        print("All values appear to be clean ✅")
    else:
        # Values of the flagged rows
        problematic_rows = order_details_df.loc[to_fix, col]

        # De-duplicate them; lists are unhashable, so they are stored as tuples
        unique_values = {
            tuple(val) if isinstance(val, list) else val
            for val in problematic_rows
        }

        # Show every distinct value next to its ftfy-corrected form
        print("\nUnique problematic values (original vs corrected):")
        for val in unique_values:
            if not isinstance(val, tuple):
                print(f"\nOriginal: {val}")
                print(f"Corrected: {fix_text(str(val))}")
            else:
                corrected = [fix_text(str(x)) for x in val]
                print(f"\nOriginal: {val}")
                print(f"Corrected: {corrected}")

        print(f"\nUnique rows that needed fixing: {len(unique_values)}")


--- Column: item.zoneName ---
Total rows that would need fixing: 2

Unique problematic values (original vs corrected):

Original: Lago d’Iseo e Franciacorta
Corrected: Lago d'Iseo e Franciacorta

Unique rows that needed fixing: 1

--- Column: item.variantName ---
Total rows that would need fixing: 3

Unique problematic values (original vs corrected):

Original: E-Track 24”
Corrected: E-Track 24"

Original: Rossignol E-Track 24”
Corrected: Rossignol E-Track 24"

Unique rows that needed fixing: 2

--- Column: item.slotName ---
Total rows that would need fixing: 1

Unique problematic values (original vs corrected):

Original: 24”
Corrected: 24"

Unique rows that needed fixing: 1


In [12]:
# Repair each inspected column in place with the fix_value helper defined earlier
for col in cols:
    order_details_df[col] = order_details_df[col].apply(fix_value)

# Quick look at the repaired columns
order_details_df[cols].head(5)

Unnamed: 0,item.zoneName,item.variantName,item.slotName
0,Bormio,Rossignol Mandate Shift,L
18,Desenzano del Garda,,
19,Desenzano del Garda,,
20,Desenzano del Garda,,
21,Desenzano del Garda,,


### Reviews DF

In [13]:
col = 'text'  # column under inspection

# Flag the rows whose value ftfy would modify
to_fix = reviews_df[col].apply(needs_fix)

# How many rows are flagged
num_to_fix = to_fix.sum()
print(f"Total rows that would need fixing in '{col}': {num_to_fix}")

if not num_to_fix > 0:
    print("All values appear to be clean ✅")
else:
    print("\nProblematic values (original vs corrected):")

    # Values of the flagged rows
    problematic_rows = reviews_df.loc[to_fix, col]
    # De-duplicate them; lists are unhashable, so they are stored as tuples
    unique_values = {
        tuple(val) if isinstance(val, list) else val
        for val in problematic_rows
    }

    # Show at most 5 distinct values next to their ftfy-corrected form
    for val in list(unique_values)[:5]:
        if not isinstance(val, tuple):
            print(f"\nOriginal: {val}")
            print(f"Corrected: {fix_text(str(val))}")
        else:
            corrected = [fix_text(str(x)) for x in val]
            print(f"\nOriginal: {val}")
            print(f"Corrected: {corrected}")

    print("\n-----------------------------------")
    print(f"\nUnique rows that needed fixing: {len(unique_values)}")

Total rows that would need fixing in 'text': 103

Problematic values (original vs corrected):

Original: Recieved quickly. The one pound bag is huge....came in compressed air tight foil bag. Just recently read about how this was the medicinal one to use, "The amazing healing properties of Manoka honey and cinnamon".[...]&lt;This is where infor came from. I've had miraculous results with the manuka honey, hope adding this cinnamon adds to it. I got the Manuka honey from Amazon 20%... 15% works just as well from Whole Foods or Amazon. I bought capsules a while back...read the back label & they were the cheaper Cassia cinnamon. You don't want Chinese or Saigon or Cassia...only use Ceylon. One bag of these cinnamon sticks could be shared with a few friends...such a large quantity. I like to grind fresh, or you can get the powdered.
Corrected: Recieved quickly. The one pound bag is huge....came in compressed air tight foil bag. Just recently read about how this was the medicinal one to use,

In [14]:
# Repair the column in place with the fix_value helper defined earlier
reviews_df[col] = reviews_df[col].apply(fix_value)

# Quick look at the repaired values
reviews_df[col].head(5)

0    I have bought several of the Vitality canned d...
1    Product arrived labeled as Jumbo Salted Peanut...
2    This is a confection that has been around a fe...
3    If you are looking for the secret ingredient i...
4    Great taffy at a great price.  There was a wid...
Name: text, dtype: object

### Reviews Labelled DF

In [15]:
col = 'text'  # column under inspection

# Flag the rows whose value ftfy would modify
to_fix = labelled_reviews_df[col].apply(needs_fix)

# How many rows are flagged
num_to_fix = to_fix.sum()
print(f"Total rows that would need fixing in '{col}': {num_to_fix}")

if not num_to_fix > 0:
    print("All values appear to be clean ✅")
else:
    print("\nProblematic values (original vs corrected):")

    # Values of the flagged rows
    problematic_rows = labelled_reviews_df.loc[to_fix, col]
    # De-duplicate them; lists are unhashable, so they are stored as tuples
    unique_values = {
        tuple(val) if isinstance(val, list) else val
        for val in problematic_rows
    }

    # Show at most 5 distinct values next to their ftfy-corrected form
    for val in list(unique_values)[:5]:
        if not isinstance(val, tuple):
            print(f"\nOriginal: {val}")
            print(f"Corrected: {fix_text(str(val))}")
        else:
            corrected = [fix_text(str(x)) for x in val]
            print(f"\nOriginal: {val}")
            print(f"Corrected: {corrected}")

    print("\n-----------------------------------")
    print(f"\nUnique rows that needed fixing: {len(unique_values)}")

Total rows that would need fixing in 'text': 428

Problematic values (original vs corrected):

Original: This is the best strawberry jam ever.  We've tried all the gourmet brands as well as homemade jams from the farmers market but this beats them all by a mile.  It tastes less like sugar and more like ripe strawberries.  The texture is also better because the fruit is pur&eacute;ed so it spreads smoothly.  AND it's organic. :-)
Corrected: This is the best strawberry jam ever.  We've tried all the gourmet brands as well as homemade jams from the farmers market but this beats them all by a mile.  It tastes less like sugar and more like ripe strawberries.  The texture is also better because the fruit is puréed so it spreads smoothly.  AND it's organic. :-)

Original: El Pato Jalape&ntilde;o Salsa is a great tasting, versatile tomato product.  The sauce can be used straight out of the can for chip-dipping salsa, or it could be used in a variety of recipes.  For example, it could make a gr

In [16]:
# Repair the column in place with the fix_value helper defined earlier
labelled_reviews_df[col] = labelled_reviews_df[col].apply(fix_value)

# Quick look at the repaired values
labelled_reviews_df[col].head(5)

0    I'm no bitters expert but I bought it as a gif...
1    these are probably great in the right drinks, ...
2    I sent these to my dad for his bday and he sai...
3    I purchased these as a gift for family member ...
4    My wife bought me this sauce sampler for Chris...
Name: text, dtype: object

# HANDLING MISSING VALUES

In [17]:
def basic_info(df, name):
    """Print a short overview of a DataFrame.

    The report contains a header with ``name``, the shape, the output of
    ``DataFrame.info()`` and the number of missing values per column.

    Args:
        df: DataFrame to describe.
        name: Label shown in the report header.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"\n===== {name} =====")
    print(f"Shape: {df.shape}")
    # info() writes its report itself and returns None; wrapping it in print()
    # would add a stray "None" line to the output.
    df.info()
    print("Missing values per column:")
    print(df.isnull().sum())
    print("-"*50)

## USERS DF

In [18]:
# Overview of the users dataset: shape, dtypes and missing values
basic_info(df=users_df, name="Users")


===== Users =====
Shape: (728598, 14)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 728598 entries, 0 to 728597
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   user.uid         728598 non-null  string        
 1   createdAt        715839 non-null  datetime64[ns]
 2   source           728598 non-null  string        
 3   isAnonymous      659772 non-null  boolean       
 4   referralsCount   177554 non-null  Int64         
 5   city             329391 non-null  object        
 6   language         493253 non-null  string        
 7   googleId         153914 non-null  boolean       
 8   appleId          11647 non-null   boolean       
 9   facebookId       62775 non-null   boolean       
 10  referral.medium  22474 non-null   object        
 11  referral.source  23199 non-null   object        
 12  referral.type    23117 non-null   Int64         
 13  favouriteZones   728598 non-null  o

In [19]:
# List the distinct values stored in each social-login flag column
for col in ('appleId', 'googleId', 'facebookId'):
    uniques = users_df[col].unique()
    print(f"\n{col} → {len(uniques)} unique values")
    print(uniques)


appleId → 2 unique values
<BooleanArray>
[<NA>, True]
Length: 2, dtype: boolean

googleId → 2 unique values
<BooleanArray>
[True, <NA>]
Length: 2, dtype: boolean

facebookId → 2 unique values
<BooleanArray>
[<NA>, True]
Length: 2, dtype: boolean


In [20]:
# These flags only ever hold True or a missing value, so a missing value
# stands for False: fill it in and switch to a plain bool dtype.
for col in ('appleId', 'googleId', 'facebookId'):
    users_df[col] = users_df[col].fillna(False).astype(bool)
    uniques = users_df[col].unique()
    print(f"{col} → {uniques}")

appleId → [False  True]
googleId → [ True False]
facebookId → [False  True]


In [21]:
# Missing values per column in users_df
users_df.isna().sum()

user.uid                0
createdAt           12759
source                  0
isAnonymous         68826
referralsCount     551044
city               399207
language           235345
googleId                0
appleId                 0
facebookId              0
referral.medium    706124
referral.source    705399
referral.type      705481
favouriteZones          0
dtype: int64

In [22]:
# Most users with a missing referralsCount also have missing values in
# referral.medium, referral.source and referral.type
users_df.loc[users_df['referralsCount'].isna()].isna().sum()

user.uid                0
createdAt           10371
source                  0
isAnonymous         68764
referralsCount     551044
city               328790
language           194879
googleId                0
appleId                 0
facebookId              0
referral.medium    533606
referral.source    532842
referral.type      533051
favouriteZones          0
dtype: int64

In [23]:
# Numeric referral fields: a missing count becomes 0, a missing type becomes -1
users_df['referralsCount'] = users_df['referralsCount'].fillna(0).astype(int)
users_df['referral.type'] = users_df['referral.type'].fillna(-1).astype(int)

# Text referral fields: a missing value becomes the placeholder "none"
for col in ('referral.medium', 'referral.source'):
    users_df[col] = users_df[col].fillna("none")


In [24]:
# Distinct values stored in isAnonymous
users_df.loc[:, 'isAnonymous'].unique()

<BooleanArray>
[False, True, <NA>]
Length: 3, dtype: boolean

In [25]:
# Size of the subset with a missing isAnonymous: 68826 null over 728598 rows
users_df.loc[users_df['isAnonymous'].isna()].shape

(68826, 14)

In [26]:
# Users with no Apple, Google or Facebook id and a missing isAnonymous
filtered = users_df.loc[
    ~users_df[["appleId", "googleId", "facebookId"]].any(axis=1)
    & users_df["isAnonymous"].isna()
]

filtered.shape

(39046, 14)

In [27]:
# A missing isAnonymous becomes True for users who registered through none of
# Apple, Google or Facebook
users_df.loc[
    ~users_df[["appleId", "googleId", "facebookId"]].any(axis=1)
    & users_df["isAnonymous"].isna(),
    "isAnonymous",
] = True

In [28]:
# Every remaining missing isAnonymous becomes False; the column ends up as plain bool
users_df['isAnonymous'] = (
    users_df['isAnonymous']
    .fillna(False)
    .astype(bool)
)

In [29]:
# Missing values per column after handling the referral and isAnonymous fields
users_df.isna().sum()

user.uid                0
createdAt           12759
source                  0
isAnonymous             0
referralsCount          0
city               399207
language           235345
googleId                0
appleId                 0
facebookId              0
referral.medium         0
referral.source         0
referral.type           0
favouriteZones          0
dtype: int64

In [30]:
# The 'language' column is not imputed any further, since the main information can be retrieved from the 'city' feature

In [31]:
# Replace missing city and language values with the placeholder "none"
for col in ('city', 'language'):
    users_df[col] = users_df[col].fillna("none")

In [32]:
# 'createdAt' is essential for the later RFM and Churn analyses, so its null
# values are kept for now and will be handled carefully where necessary.
users_df.isna().sum()

user.uid               0
createdAt          12759
source                 0
isAnonymous            0
referralsCount         0
city                   0
language               0
googleId               0
appleId                0
facebookId             0
referral.medium        0
referral.source        0
referral.type          0
favouriteZones         0
dtype: int64

## PROFILES DF

In [33]:
# Overview of the profiles dataset: shape, dtypes and missing values
basic_info(df=profiles_df, name="Profiles")


===== Profiles =====
Shape: (69025, 10)
<class 'pandas.core.frame.DataFrame'>
Index: 69025 entries, 1 to 97157
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   user.uid      69025 non-null  string        
 1   profile.uid   69025 non-null  string        
 2   birthday      64521 non-null  datetime64[ns]
 3   sex           49069 non-null  object        
 4   city          8932 non-null   object        
 5   height        53139 non-null  Float64       
 6   weight        52958 non-null  float64       
 7   skibootsSize  52937 non-null  Float64       
 8   level         54138 non-null  string        
 9   types         69025 non-null  object        
dtypes: Float64(2), datetime64[ns](1), float64(1), object(3), string(3)
memory usage: 5.9+ MB
None
Missing values per column:
user.uid            0
profile.uid         0
birthday         4504
sex             19956
city            60093
height          158

As expected, some user.uid values are associated with more than one profile.uid.

In [34]:
# Count the distinct profile.uid values attached to each user.uid
profiles_per_user = (
    profiles_df
    .groupby("user.uid")["profile.uid"]
    .nunique()
    .reset_index()
)

# Keep the users owning more than one profile and give the count a readable label
multi_profiles = profiles_per_user[profiles_per_user["profile.uid"].gt(1)]
multi_profiles.columns = ["user.uid", "Number of profiles"]

print(f"Number of user.uid with more than one profile: {multi_profiles.shape[0]}")


Number of user.uid with more than one profile: 14494


In [35]:
# How many users own each number of profiles
multi_profiles.groupby(by="Number of profiles").count()

Unnamed: 0_level_0,user.uid
Number of profiles,Unnamed: 1_level_1
2,9732
3,2804
4,1263
5,398
6,132
7,58
8,33
9,18
10,7
11,13


We have a lot of users with 2-10 profiles, a few with 11-18 profiles and some users that have created from 19 to 81 different profiles.

In [36]:
# Missing values per column in profiles_df
profiles_df.isna().sum()

user.uid            0
profile.uid         0
birthday         4504
sex             19956
city            60093
height          15886
weight          16067
skibootsSize    16088
level           14887
types               0
dtype: int64

We have 60093 missing values in the 'city' column of the profiles dataset, so we can consider imputing them using the 'city' column of the users dataset.

In [37]:
# Put the user-level and profile-level city side by side;
# the inner join keeps only the users present in both datasets
merged_df = users_df[['user.uid', 'city']].merge(
    profiles_df[['user.uid', 'city']],
    on='user.uid',
    how='inner',
    suffixes=('_users', '_profiles'),
)

# Flag the rows where the two cities are equal
merged_df['city_match'] = merged_df['city_users'].eq(merged_df['city_profiles'])

num_match = merged_df['city_match'].sum()
num_mismatch = (~merged_df['city_match']).sum()

print(f"Matching cities: {num_match}")
print(f"Mismatched cities: {num_mismatch}")

print(f"Mismatching cities because of missing values in profiles: {merged_df['city_profiles'].isnull().sum()}")

Matching cities: 4829
Mismatched cities: 64196
Mismatching cities because of missing values in profiles: 60093


In [38]:
# Rows whose profile has no city, with the city recorded on the user
user_without_city = merged_df.loc[
    ~merged_df['city_match'] & merged_df['city_profiles'].isna(),
    ['user.uid', 'city_users'],
]
user_without_city

Unnamed: 0,user.uid,city_users
0,09gqlmpl9bn82gwyct3aetbcdp,aachen
2,lvfkrnkmzixk7nnnuns78po6ku,aarschot
3,lvfkrnkmzixk7nnnuns78po6ku,aarschot
4,wmt7ktgwiwhb8qswewseu76e0r,abano terme
5,j2bmupwsjofrlqcmlrwdfmvrqh,abano terme
...,...,...
69020,cup5j32rq5fryxrrc1qns4lsyd,none
69021,cxrgfedtvwhnshcejppspovzxe,none
69022,cxrgfedtvwhnshcejppspovzxe,none
69023,cxrgfedtvwhnshcejppspovzxe,none


In [39]:
# Lookup table: user.uid -> city recorded in the users dataset
city_map = dict(zip(user_without_city["user.uid"], user_without_city["city_users"]))

# Profiles with no city whose user appears in the lookup table
mask = profiles_df["city"].isna() & profiles_df["user.uid"].isin(list(city_map))
profiles_df.loc[mask, "city"] = profiles_df.loc[mask, "user.uid"].map(city_map)

print(profiles_df.loc[mask, ["user.uid", "city"]].head())

                        user.uid         city
4301  009jfacn0eo5nynrv5p3tt2s7h  montelupone
4302  00j7rrt6ebzgqzu7epdl2pgmfo      firenze
4303  00koqkoxhhwx4egpnlugfmz63h       milano
4304  00p4czyqrge7lsrac6npbsqfap         lodi
4305  00p4czyqrge7lsrac6npbsqfap         lodi


In [40]:
# Missing values per column after imputing the profile city
profiles_df.isnull().sum()

user.uid            0
profile.uid         0
birthday         4504
sex             19956
city                0
height          15886
weight          16067
skibootsSize    16088
level           14887
types               0
dtype: int64

In [41]:
# An idea for dealing with height, weight and skibootsSize could have been to use the median given the sex and the birthday, but the following table shows that
# the rows with missing height, weight and skibootsSize are essentially the same rows that are also missing sex

In [42]:
# Size of the subset where height, weight, skibootsSize and sex are all missing
profiles_df.loc[
    profiles_df[["height", "weight", "skibootsSize", "sex"]].isna().all(axis=1)
].shape

(14958, 10)

In [43]:
# Sentinel imputation instead: -1 for the numeric fields ...
for col in ('height', 'weight', 'skibootsSize'):
    profiles_df[col] = profiles_df[col].fillna(-1)

# ... and the string "-1" for the text fields
for col in ('sex', 'level'):
    profiles_df[col] = profiles_df[col].fillna("-1")

In [44]:
# Missing values per column after the sentinel imputation
profiles_df.isnull().sum()

user.uid           0
profile.uid        0
birthday        4504
sex                0
city               0
height             0
weight             0
skibootsSize       0
level              0
types              0
dtype: int64

## CARDS DF

In [45]:
# Overview of the cards dataset: shape, dtypes and missing values
basic_info(df=cards_df, name="Cards")


===== Cards =====
Shape: (626523, 5)
<class 'pandas.core.frame.DataFrame'>
Index: 626523 entries, 1 to 805836
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   card.uid    626523 non-null  string        
 1   assignedAt  183607 non-null  datetime64[ns]
 2   birthday    166055 non-null  datetime64[ns]
 3   status      626523 non-null  string        
 4   user.uid    184781 non-null  string        
dtypes: datetime64[ns](2), string(3)
memory usage: 28.7 MB
None
Missing values per column:
card.uid           0
assignedAt    442916
birthday      460468
status             0
user.uid      441742
dtype: int64
--------------------------------------------------


Remove the rows with a null user.uid, since they would be useless for the analysis (the card cannot be linked to any user).

In [46]:
# Keep only the cards that are linked to a user
cards_df = cards_df.loc[cards_df['user.uid'].notna()]

In [47]:
# Missing values per column among the remaining cards
cards_df.isnull().sum()

card.uid          0
assignedAt     1992
birthday      18907
status            0
user.uid          0
dtype: int64

In [48]:
# As above, the remaining missing data will be handled only when strictly necessary, given the importance of these variables

## ORDERS DF

In [49]:
# Overview of the orders dataset: shape, dtypes and missing values
basic_info(df=orders_df, name="Orders")


===== Orders =====
Shape: (385168, 13)
<class 'pandas.core.frame.DataFrame'>
Index: 385168 entries, 0 to 549899
Data columns (total 13 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   order.uid        385168 non-null  string        
 1   user.uid         385168 non-null  string        
 2   createdAt        385168 non-null  datetime64[ns]
 3   createdAtTime    385168 non-null  datetime64[ns]
 4   paymentGateway   385168 non-null  string        
 5   paymentBrand     251908 non-null  string        
 6   pickup           385168 non-null  boolean       
 7   pickupComplete   385168 non-null  boolean       
 8   source           385000 non-null  string        
 9   tenant           384732 non-null  string        
 10  paymentAttempts  385168 non-null  Int64         
 11  timeZone         114872 non-null  string        
 12  clientInfo       385168 non-null  string        
dtypes: Int64(1), boolean(2), datetime64[ns]

In [50]:
# Replace missing payment brand, source and tenant with the placeholder "none"
for col in ('paymentBrand', 'source', 'tenant'):
    orders_df[col] = orders_df[col].fillna("none")

In [51]:
# A missing time zone is labelled "other"
col = 'timeZone'
orders_df[col] = orders_df[col].fillna(value="other")

In [52]:
# Missing values per column after filling the orders text fields
orders_df.isnull().sum()

order.uid          0
user.uid           0
createdAt          0
createdAtTime      0
paymentGateway     0
paymentBrand       0
pickup             0
pickupComplete     0
source             0
tenant             0
paymentAttempts    0
timeZone           0
clientInfo         0
dtype: int64

## ORDERS DETAILS DF

In [53]:
# Overview of the order details dataset: shape, dtypes and missing values
basic_info(df=order_details_df, name="Order Details")


===== Order Details =====
Shape: (993037, 15)
<class 'pandas.core.frame.DataFrame'>
Index: 993037 entries, 0 to 1420602
Data columns (total 15 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   item.uid                993037 non-null  string        
 1   order.uid               993037 non-null  string        
 2   item.date               956773 non-null  datetime64[ns]
 3   product.uid             993037 non-null  string        
 4   product.dynamicPricing  993037 non-null  boolean       
 5   item.amount             993037 non-null  Float32       
 6   item.discount           993037 non-null  boolean       
 7   product.type            993037 non-null  string        
 8   item.zoneName           993037 non-null  object        
 9   product.durationHours   838534 non-null  Float32       
 10  item.profiles           36440 non-null   string        
 11  item.variantName        993037 non-null  object 

In [54]:
# trying to impute item.profiles...

In [55]:
# Attach the ordering user to every order detail row
details_orders = order_details_df.merge(
    orders_df[["order.uid", "user.uid"]],
    on="order.uid",
    how="inner",
)

# Attach the profiles owned by that user
details_orders_profiles = details_orders.merge(
    profiles_df[["user.uid", "profile.uid"]],
    on="user.uid",
    how="inner",
)

# For each user.uid, the distinct profile.uid values associated with it
user_profiles = details_orders_profiles.groupby("user.uid")["profile.uid"].unique()

user_profiles

user.uid
00c2zutvhrg2wsots6dqti4acu                          [2u6fgglofytfkorfqwjr]
00g2ppqciodr2kux1j44mnp9a3                          [cet18sb27ncebvu7kvyl]
00j7rrt6ebzgqzu7epdl2pgmfo                          [cuig5qrqxafdgd3g6azy]
00koqkoxhhwx4egpnlugfmz63h                          [ddjxqn8bghajd9me60nl]
00mr3b8fzvthxearlfpaamtmtr                          [v7tyzgvhlbafsfzbd8mv]
                                                  ...                     
zztkbfxq5sa5ofpun0df5jddgc                          [cgyskv7mks2xggnx9ivk]
zztsqurlqbakodjb55nn3urtlm                          [pa38lutrucpujlq96cfx]
zzvnki0kv1vlaq4tztvbthtvfc                          [hqqnb0xi1o1uaktd34cu]
zzxaa4iuo4fnkdl2q13q71iu6g                          [far2podl7bhin7u4tds2]
zzxhbqd23nyekppw9fvbu5y1sn    [7ul1yl0vozkj4odjftwx, ox9cqfkgsnn4bvo5kndk]
Name: profile.uid, Length: 32314, dtype: object

In [56]:
# A user can own several profiles and an order is linked only to the user, so
# the profiles behind a specific order cannot be determined with certainty:
# the missing values get the placeholder "none" instead.
order_details_df['item.profiles'] = order_details_df['item.profiles'].fillna(value="none")

In [57]:
# moving to item.snowitcardNumber...

In [58]:
# Attach the ordering user to every order detail row
details_orders = order_details_df.merge(
    orders_df[["order.uid", "user.uid"]],
    on="order.uid",
    how="inner",
)

# Attach the cards owned by that user
details_orders_cards = details_orders.merge(
    cards_df[["user.uid", "card.uid"]],
    on="user.uid",
    how="inner",
)

# For each user.uid, the distinct card.uid values associated with it
user_cards = details_orders_cards.groupby("user.uid")["card.uid"].unique().reset_index()

# Users associated with exactly ONE skipass card
single_card_users = user_cards[user_cards["card.uid"].str.len().eq(1)]
single_card_users


Unnamed: 0,user.uid,card.uid
0,001vktlc5zbkx2bazqkwmrpru9,[02138143]
2,005dpt4a0puxgicccu6ixvlc9f,[01161471599232720540808]
3,00bua0ypencroqp5mgcohofq2z,[07754863]
6,00dija3tzdwkjxszdmgnqjtd8t,[2417458]
8,00elzcypz9btvvtxxpu74qu6qc,[01161471599233125552422]
...,...,...
81805,zzuz1bgyk6w7mcoknhxb2gj2kl,[08822257]
81807,zzv4z2ddfmeahyytarujh8fn4v,[02022789]
81811,zzwaicicjdwjxtqrln1f4eougn,[03563380]
81813,zzwyguvjxtzadqq0fcswvxiwpk,[03241961]


In [59]:
# This time the skipass (card) number is recovered through a double merge, but only when
# exactly one card is associated with the user, so the attribution is unambiguous

In [60]:
# Keep only the columns needed to recover the card number, then attach user.uid from orders
details_orders = order_details_df[
    ["order.uid", "product.type", "item.snowitcardNumber"]
].merge(
    orders_df[["order.uid", "user.uid"]],
    how="inner",
    on="order.uid",
)

details_orders

Unnamed: 0,order.uid,product.type,item.snowitcardNumber,user.uid
0,4y9zqqvldfqr9n2xnu,rental~bike,,nu7fhz41rzwbkabapcufr6g18r
1,i9eovtgp3uxbmesebv,experience,,hkxsbgefntzwsodvj3ztvp3gm1
2,i9eovtgp3uxbmesebv,bundle~train,,hkxsbgefntzwsodvj3ztvp3gm1
3,i9eovtgp3uxbmesebv,transport,,hkxsbgefntzwsodvj3ztvp3gm1
4,i9eovtgp3uxbmesebv,service,,hkxsbgefntzwsodvj3ztvp3gm1
...,...,...,...,...
993032,f8zj28hdatvraibekk,experience,,asyeuoax8uaw3np3tlyxpcfxzm
993033,f8zj28hdatvraibekk,experience,,asyeuoax8uaw3np3tlyxpcfxzm
993034,f8zj28hdatvraibekk,experience,,asyeuoax8uaw3np3tlyxpcfxzm
993035,wy69h8jsgozdrxlu9i,skipass,,wfsihoehobxkipt3731virtjqq


In [61]:
# Restrict the order lines to users that own exactly one card and attach that card
details_with_card = details_orders.merge(single_card_users, how="inner", on="user.uid")

# Order lines whose card number is missing but can be recovered from the user's card
details_with_card.loc[
    lambda df: df["item.snowitcardNumber"].isna() & df["card.uid"].notna()
]

Unnamed: 0,order.uid,product.type,item.snowitcardNumber,user.uid,card.uid
0,berfovxwyu72mzwmei,giftcard,,angticjoeipyve4gpwgwd7nv53,[07240571]
2,sgo1b64fhatmepxi1z,skipass,,kllr731wlmbb8njvb1v1bavris,[06758428]
3,sgo1b64fhatmepxi1z,skipass,,kllr731wlmbb8njvb1v1bavris,[06758428]
4,sgo1b64fhatmepxi1z,skipass,,kllr731wlmbb8njvb1v1bavris,[06758428]
6,sgo1b64fhatmepxi1z,skipass,,kllr731wlmbb8njvb1v1bavris,[06758428]
...,...,...,...,...,...
251788,2zre7jvgxbif6cdpqt,skipass,,rmoccnmpqfwrz3jlzhxvt7udkj,[07420209]
251789,34xazujbk01kpe0bgr,skipass,,xzxxkfrcmsyfcqqrcohlba5wkn,[03563623]
251790,34xazujbk01kpe0bgr,skipass,,xzxxkfrcmsyfcqqrcohlba5wkn,[03563623]
251791,g0tkwbe686jnynbxs9,experience,,wsunenfdtesnbkfiqjhahhsofp,[01161471335350321318942]


In [62]:
# Map each order.uid to the single card owned by the user who placed the order.
# `card.uid` holds a one-element array per row (the per-user result of
# groupby().unique(), filtered to length 1), so the element itself is extracted.
# Casting the array with astype(str) would store the array's repr
# (e.g. "['06758428']") instead of the card number.
order_card_map = (
    details_with_card
    .assign(card_uid_str=lambda df: df["card.uid"].str[0])
    # drop rows whose only card value is itself missing, so no "nan" string is written
    .dropna(subset=["card_uid_str"])
    .astype({"card_uid_str": str})
    .set_index("order.uid")["card_uid_str"]
    .to_dict()
)

# Fill only the missing card numbers; values already present are left untouched
order_details_df["item.snowitcardNumber"] = order_details_df["item.snowitcardNumber"].fillna(
    order_details_df["order.uid"].map(order_card_map)
)

In [63]:
order_details_df.isna().sum()
# The null values in item.snowitcardNumber went down from 704766 to 544706; the remaining ones are set to 'none' in the next cell

item.uid                       0
order.uid                      0
item.date                  36264
product.uid                    0
product.dynamicPricing         0
item.amount                    0
item.discount                  0
product.type                   0
item.zoneName                  0
product.durationHours     154503
item.profiles                  0
item.variantName               0
item.slotName                  0
item.snowitcardNumber     544706
item.status                    0
dtype: int64

In [64]:
# Card numbers that could not be recovered are replaced with the placeholder "none"
order_details_df['item.snowitcardNumber'] = order_details_df['item.snowitcardNumber'].fillna("none")

In [65]:
# Missing durations are filled with -99.
# NOTE(review): -99 looks like a sentinel for "unknown duration" - confirm that downstream
# numeric aggregations on product.durationHours exclude it.
order_details_df['product.durationHours'] = order_details_df['product.durationHours'].fillna(-99)

In [66]:
# Re-count the missing values per column after the fills above
order_details_df.isna().sum()

item.uid                      0
order.uid                     0
item.date                 36264
product.uid                   0
product.dynamicPricing        0
item.amount                   0
item.discount                 0
product.type                  0
item.zoneName                 0
product.durationHours         0
item.profiles                 0
item.variantName              0
item.slotName                 0
item.snowitcardNumber         0
item.status                   0
dtype: int64

## REVIEWS DF

In [67]:
# Overview of the reviews dataset (shape, dtypes, missing values) via the basic_info helper
basic_info(reviews_df, "Reviews")


===== Reviews =====
Shape: (93429, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93429 entries, 0 to 93428
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   review.uid  93429 non-null  string
 1   user.uid    93429 non-null  string
 2   text        93429 non-null  object
dtypes: object(1), string(2)
memory usage: 2.1+ MB
None
Missing values per column:
review.uid    0
user.uid      0
text          0
dtype: int64
--------------------------------------------------


## LABELLED REVIEWS DF

In [68]:
# Overview of the labelled reviews dataset (shape, dtypes, missing values) via the basic_info helper
basic_info(labelled_reviews_df, "Labelled Reviews")


===== Labelled Reviews =====
Shape: (327522, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327522 entries, 0 to 327521
Data columns (total 3 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   labelled_review.uid  327522 non-null  string
 1   text                 327522 non-null  object
 2   sentiment_label      327522 non-null  string
dtypes: object(1), string(2)
memory usage: 7.5+ MB
None
Missing values per column:
labelled_review.uid    0
text                   0
sentiment_label        0
dtype: int64
--------------------------------------------------


# SAVE CORRECTED DATASETS

In [69]:
# Persist every corrected DataFrame with dill, one pickle per dataset.
# NOTE: the targets are the same paths in `pkl_paths` the data was loaded from, so the
# input pickles are overwritten in place; a later run of this notebook from the top
# will start from the already-corrected data.
datasets_to_save = [
    # (key in pkl_paths, label used in the log message, corrected DataFrame)
    ('users', 'users', users_df),
    ('profiles', 'profiles', profiles_df),
    ('cards', 'cards', cards_df),
    ('orders', 'orders', orders_df),
    ('order_details', 'order details', order_details_df),
    ('reviews', 'reviews', reviews_df),
    ('reviews_labelled', 'labelled reviews', labelled_reviews_df),
]

for pkl_key, label, corrected_df in datasets_to_save:
    pkl_path = pkl_paths[pkl_key]
    with pkl_path.open('wb') as fh:
        dill.dump(corrected_df, fh)

    print(f"Corrected {label} data saved in {pkl_path.as_posix()}")

Corrected users data saved in C:/Users/davyt/Desktop/Bicocca/Marketing Analytics/unimib_snowit_project/data_loaded/users.pkl
Corrected profiles data saved in C:/Users/davyt/Desktop/Bicocca/Marketing Analytics/unimib_snowit_project/data_loaded/profiles.pkl
Corrected cards data saved in C:/Users/davyt/Desktop/Bicocca/Marketing Analytics/unimib_snowit_project/data_loaded/cards.pkl
Corrected orders data saved in C:/Users/davyt/Desktop/Bicocca/Marketing Analytics/unimib_snowit_project/data_loaded/orders.pkl
Corrected order details data saved in C:/Users/davyt/Desktop/Bicocca/Marketing Analytics/unimib_snowit_project/data_loaded/order_details.pkl
Corrected reviews data saved in C:/Users/davyt/Desktop/Bicocca/Marketing Analytics/unimib_snowit_project/data_loaded/reviews.pkl
Corrected laballed reviews data saved in C:/Users/davyt/Desktop/Bicocca/Marketing Analytics/unimib_snowit_project/data_loaded/reviews_labelled.pkl
