# **Data Cleaning**

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# Show every column when wide frames are printed.
pd.set_option('display.max_columns', None)

# Single place to change the raw-data location; the relative path is resolved
# against the kernel's working directory.
DATA_DIR = "../data"

# Load each raw CSV into its own frame. `type_df` avoids shadowing the builtin `type`.
city = pd.read_csv(f"{DATA_DIR}/City.csv")
continent = pd.read_csv(f"{DATA_DIR}/Continent.csv")
country = pd.read_csv(f"{DATA_DIR}/Country.csv")
item = pd.read_csv(f"{DATA_DIR}/Item.csv")
mode = pd.read_csv(f"{DATA_DIR}/Mode.csv")
region = pd.read_csv(f"{DATA_DIR}/Region.csv")
transaction = pd.read_csv(f"{DATA_DIR}/Transaction.csv")
type_df = pd.read_csv(f"{DATA_DIR}/Type.csv")
user = pd.read_csv(f"{DATA_DIR}/User.csv")
updated_item = pd.read_csv(f"{DATA_DIR}/updated_item.csv")

Transaction

In [2]:
# Profile the raw transaction table: preview, schema, null counts, duplicate rows.
print("HEAD", transaction.head(), sep="\n")

print("\nINFO")
transaction.info()

print("\nMISSING VALUES", transaction.isna().sum(), sep="\n")

print("\nDUPLICATES", transaction.duplicated().sum(), sep="\n")

HEAD
   TransactionId  UserId  VisitYear  VisitMonth  VisitMode  AttractionId  \
0              3   70456       2022          10          2           640   
1              8    7567       2022          10          4           640   
2              9   79069       2022          10          3           640   
3             10   31019       2022          10          3           640   
4             15   43611       2022          10          2           640   

   Rating  
0       5  
1       5  
2       5  
3       3  
4       3  

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52930 entries, 0 to 52929
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   TransactionId  52930 non-null  int64
 1   UserId         52930 non-null  int64
 2   VisitYear      52930 non-null  int64
 3   VisitMonth     52930 non-null  int64
 4   VisitMode      52930 non-null  int64
 5   AttractionId   52930 non-null  int64
 6   Rating     

Mode

In [3]:
# Profile the visit-mode lookup table: preview, schema, null counts, duplicate rows.
print("HEAD", mode.head(), sep="\n")

print("\nINFO")
mode.info()

print("\nMISSING VALUES", mode.isna().sum(), sep="\n")

print("\nDUPLICATES", mode.duplicated().sum(), sep="\n")

HEAD
   VisitModeId VisitMode
0            0         -
1            1  Business
2            2   Couples
3            3    Family
4            4   Friends

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   VisitModeId  6 non-null      int64 
 1   VisitMode    6 non-null      object
dtypes: int64(1), object(1)
memory usage: 228.0+ bytes

MISSING VALUES
VisitModeId    0
VisitMode      0
dtype: int64

DUPLICATES
0


In [4]:
# Attach the visit-mode label to every transaction by joining the numeric code in
# `transaction.VisitMode` to `mode.VisitModeId`.
# Both frames carry a `VisitMode` column, so pandas suffixes them `_x` (code) and `_y` (label).
# `validate` makes the merge raise MergeError if `VisitModeId` is not unique in `mode`,
# instead of silently duplicating transaction rows.
transaction = transaction.merge(
    mode,
    left_on="VisitMode",
    right_on="VisitModeId",
    how="left",
    validate="many_to_one",
)

In [5]:
# Preview the first five rows after the visit-mode merge.
print(transaction.head(5))

   TransactionId  UserId  VisitYear  VisitMonth  VisitMode_x  AttractionId  \
0              3   70456       2022          10            2           640   
1              8    7567       2022          10            4           640   
2              9   79069       2022          10            3           640   
3             10   31019       2022          10            3           640   
4             15   43611       2022          10            2           640   

   Rating  VisitModeId VisitMode_y  
0       5            2     Couples  
1       5            4     Friends  
2       5            3      Family  
3       3            3      Family  
4       3            2     Couples  


In [6]:
# Keep only the text label for the visit mode: discard both numeric code
# columns, then give the label column its plain name back.
transaction = (
    transaction
    .drop(columns=["VisitMode_x", "VisitModeId"])
    .rename(columns={"VisitMode_y": "VisitMode"})
)

In [7]:
# Confirm the final column set, preview the rows, and list the distinct visit-mode labels.
print(
    transaction.columns,
    transaction.head(),
    transaction['VisitMode'].unique(),
    sep="\n",
)

Index(['TransactionId', 'UserId', 'VisitYear', 'VisitMonth', 'AttractionId',
       'Rating', 'VisitMode'],
      dtype='object')
   TransactionId  UserId  VisitYear  VisitMonth  AttractionId  Rating  \
0              3   70456       2022          10           640       5   
1              8    7567       2022          10           640       5   
2              9   79069       2022          10           640       5   
3             10   31019       2022          10           640       3   
4             15   43611       2022          10           640       3   

  VisitMode  
0   Couples  
1   Friends  
2    Family  
3    Family  
4   Couples  
['Couples' 'Friends' 'Family' 'Solo' 'Business']


User

In [8]:
# Profile the raw user table: preview, schema, null counts, duplicate rows.
print("HEAD", user.head(), sep="\n")

print("\nINFO")
user.info()

print("\nMISSING VALUES", user.isna().sum(), sep="\n")

print("\nDUPLICATES", user.duplicated().sum(), sep="\n")

HEAD
   UserId  ContinentId  RegionId  CountryId  CityId
0      14            5        20        155   220.0
1      16            3        14        101  3098.0
2      20            4        15        109  4303.0
3      23            1         4         22   154.0
4      25            3        14        101  3098.0

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33530 entries, 0 to 33529
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   UserId       33530 non-null  int64  
 1   ContinentId  33530 non-null  int64  
 2   RegionId     33530 non-null  int64  
 3   CountryId    33530 non-null  int64  
 4   CityId       33526 non-null  float64
dtypes: float64(1), int64(4)
memory usage: 1.3 MB

MISSING VALUES
UserId         0
ContinentId    0
RegionId       0
CountryId      0
CityId         4
dtype: int64

DUPLICATES
0


In [9]:
# Drop users without a city and restore `CityId` to an integer dtype (it was read
# as float because of the missing values).
# Chaining `astype` produces a new frame in one step; assigning a column on the
# `dropna` result can raise SettingWithCopyWarning.
user = user.dropna(subset=['CityId']).astype({'CityId': int})

user.info()
user.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 33526 entries, 0 to 33529
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   UserId       33526 non-null  int64
 1   ContinentId  33526 non-null  int64
 2   RegionId     33526 non-null  int64
 3   CountryId    33526 non-null  int64
 4   CityId       33526 non-null  int64
dtypes: int64(5)
memory usage: 1.5 MB


UserId         0
ContinentId    0
RegionId       0
CountryId      0
CityId         0
dtype: int64

City

In [10]:
# Profile the raw city table: preview, schema, null counts, duplicate rows.
print("HEAD", city.head(), sep="\n")

print("\nINFO")
city.info()

print("\nMISSING VALUES", city.isna().sum(), sep="\n")

print("\nDUPLICATES", city.duplicated().sum(), sep="\n")

HEAD
   CityId      CityName  CountryId
0       0             -          0
1       1        Douala          1
2       2  South Region          1
3       3     N'Djamena          2
4       4        Kigali          3

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9143 entries, 0 to 9142
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CityId     9143 non-null   int64 
 1   CityName   9142 non-null   object
 2   CountryId  9143 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 214.4+ KB

MISSING VALUES
CityId       0
CityName     1
CountryId    0
dtype: int64

DUPLICATES
0


In [11]:
# Clean the city lookup:
#   1. drop the placeholder row with CityId 0 (its name is "-"),
#   2. drop rows with no city name,
#   3. trim whitespace and title-case the names.
# Built as one chain with `assign` so no column is written onto a filtered
# frame, which can raise SettingWithCopyWarning.
city = (
    city[city['CityId'] != 0]
    .dropna(subset=['CityName'])
    .assign(CityName=lambda df: df['CityName'].str.strip().str.title())
)

In [12]:
# Verify the cleaned city table: schema, remaining nulls, and a preview.
city.info()
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so the null check was previously invisible.
print(city.isnull().sum())
city.head()

<class 'pandas.core.frame.DataFrame'>
Index: 9141 entries, 1 to 9142
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CityId     9141 non-null   int64 
 1   CityName   9141 non-null   object
 2   CountryId  9141 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 285.7+ KB


Unnamed: 0,CityId,CityName,CountryId
1,1,Douala,1
2,2,South Region,1
3,3,N'Djamena,2
4,4,Kigali,3
5,5,Kigali Province,3


Country

In [13]:
# Profile the raw country table: preview, schema, null counts, duplicate rows.
print("HEAD", country.head(), sep="\n")

print("\nINFO")
country.info()

print("\nMISSING VALUES", country.isna().sum(), sep="\n")

print("\nDUPLICATES", country.duplicated().sum(), sep="\n")

HEAD
   CountryId   Country  RegionId
0          0         -         0
1          1  Cameroon         1
2          2      Chad         1
3          3    Rwanda         1
4          4  Ethiopia         2

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CountryId  165 non-null    int64 
 1   Country    165 non-null    object
 2   RegionId   165 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 4.0+ KB

MISSING VALUES
CountryId    0
Country      0
RegionId     0
dtype: int64

DUPLICATES
0


In [14]:
# Clean the country lookup: drop the placeholder row with CountryId 0 (its name
# is "-"), then trim whitespace and title-case the names.
# `assign` returns a new frame, so no column is written onto a filtered frame
# (which can raise SettingWithCopyWarning).
country = (
    country[country['CountryId'] != 0]
    .assign(Country=lambda df: df['Country'].str.strip().str.title())
)

In [15]:
# Verify the cleaned country table: schema first, then the first five rows.
country.info()
country.head(5)

<class 'pandas.core.frame.DataFrame'>
Index: 164 entries, 1 to 164
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CountryId  164 non-null    int64 
 1   Country    164 non-null    object
 2   RegionId   164 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 5.1+ KB


Unnamed: 0,CountryId,Country,RegionId
1,1,Cameroon,1
2,2,Chad,1
3,3,Rwanda,1
4,4,Ethiopia,2
5,5,Kenya,2


Region

In [16]:
# Profile the raw region table: preview, schema, null counts, duplicate rows.
print("HEAD", region.head(), sep="\n")

print("\nINFO")
region.info()

print("\nMISSING VALUES", region.isna().sum(), sep="\n")

print("\nDUPLICATES", region.duplicated().sum(), sep="\n")

HEAD
            Region  RegionId  ContinentId
0                -       0.0          0.0
1   Central Africa       1.0          1.0
2      East Africa       2.0          1.0
3     North Africa       3.0          1.0
4  Southern Africa       4.0          1.0

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Region       22 non-null     object 
 1   RegionId     22 non-null     float64
 2   ContinentId  22 non-null     float64
dtypes: float64(2), object(1)
memory usage: 23.5+ KB

MISSING VALUES
Region         977
RegionId       977
ContinentId    977
dtype: int64

DUPLICATES
976


In [17]:
# Remove the empty padding rows and any exact duplicate rows.
# `dropna()` drops every row containing at least one NaN, which already covers
# fully-empty rows, so a separate `dropna(how='all')` pass is not needed.
region = region.dropna().drop_duplicates()

In [18]:
# Clean the region lookup:
#   1. cast both ID columns to int (they were read as float because of the empty rows),
#   2. drop the placeholder row with RegionId 0 (its name is "-"),
#   3. trim whitespace and title-case the names.
# Written as one chain so no column is assigned onto the filtered/deduplicated
# frame, which can raise SettingWithCopyWarning.
region = (
    region
    .astype({'RegionId': int, 'ContinentId': int})
    .loc[lambda df: df['RegionId'] != 0]
    .assign(Region=lambda df: df['Region'].str.strip().str.title())
)

In [19]:
# Verify the cleaned region table: schema first, then the first five rows.
region.info()
region.head(5)

<class 'pandas.core.frame.DataFrame'>
Index: 21 entries, 1 to 21
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Region       21 non-null     object
 1   RegionId     21 non-null     int64 
 2   ContinentId  21 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 672.0+ bytes


Unnamed: 0,Region,RegionId,ContinentId
1,Central Africa,1,1
2,East Africa,2,1
3,North Africa,3,1
4,Southern Africa,4,1
5,West Africa,5,1


Continent

In [20]:
# Profile the raw continent table: preview, schema, null counts, duplicate rows.
print("HEAD", continent.head(), sep="\n")

print("\nINFO")
continent.info()

print("\nMISSING VALUES", continent.isna().sum(), sep="\n")

print("\nDUPLICATES", continent.duplicated().sum(), sep="\n")

HEAD
   ContinentId            Continent
0          0.0                    -
1          1.0               Africa
2          2.0              America
3          3.0                 Asia
4          4.0  Australia & Oceania

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ContinentId  6 non-null      float64
 1   Continent    6 non-null      object 
dtypes: float64(1), object(1)
memory usage: 15.7+ KB

MISSING VALUES
ContinentId    993
Continent      993
dtype: int64

DUPLICATES
992


In [21]:
# Remove the empty padding rows and any exact duplicate rows.
# `dropna()` drops every row containing at least one NaN, which already covers
# fully-empty rows, so a separate `dropna(how='all')` pass is not needed.
continent = continent.dropna().drop_duplicates()


In [22]:
# Clean the continent lookup:
#   1. drop the placeholder row with ContinentId 0 (its name is "-"),
#   2. cast the ID to int (it was read as float because of the empty rows),
#   3. trim whitespace and title-case the names.
# Written as one chain so no column is assigned onto a filtered frame, which
# can raise SettingWithCopyWarning.
continent = (
    continent[continent['ContinentId'] != 0]
    .astype({'ContinentId': int})
    .assign(Continent=lambda df: df['Continent'].str.strip().str.title())
)

In [23]:
# Verify the cleaned continent table: schema first, then the first five rows.
continent.info()
continent.head(5)

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 5
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ContinentId  5 non-null      int64 
 1   Continent    5 non-null      object
dtypes: int64(1), object(1)
memory usage: 120.0+ bytes


Unnamed: 0,ContinentId,Continent
1,1,Africa
2,2,America
3,3,Asia
4,4,Australia & Oceania
5,5,Europe


Type

In [24]:
# Profile the raw attraction-type table: preview, schema, null counts, duplicate rows.
print("HEAD", type_df.head(), sep="\n")

print("\nINFO")
type_df.info()

print("\nMISSING VALUES", type_df.isna().sum(), sep="\n")

print("\nDUPLICATES", type_df.duplicated().sum(), sep="\n")

HEAD
   AttractionTypeId         AttractionType
0                 2          Ancient Ruins
1                10                Ballets
2                13                Beaches
3                19        Caverns & Caves
4                34  Flea & Street Markets

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   AttractionTypeId  17 non-null     int64 
 1   AttractionType    17 non-null     object
dtypes: int64(1), object(1)
memory usage: 404.0+ bytes

MISSING VALUES
AttractionTypeId    0
AttractionType      0
dtype: int64

DUPLICATES
0


In [25]:
# Normalise the attraction-type labels: trim surrounding whitespace, then title-case.
type_df['AttractionType'] = type_df['AttractionType'].str.strip().str.title()

Item

In [26]:
# Profile the raw item (attraction) table: preview, schema, null counts, duplicate rows.
print("HEAD", item.head(), sep="\n")

print("\nINFO")
item.info()

print("\nMISSING VALUES", item.isna().sum(), sep="\n")

print("\nDUPLICATES", item.duplicated().sum(), sep="\n")

HEAD
   AttractionId  AttractionCityId  AttractionTypeId  \
0         369.0               1.0              13.0   
1         481.0               1.0              13.0   
2         640.0               1.0              63.0   
3         650.0               1.0              13.0   
4         673.0               1.0              13.0   

                       Attraction  \
0               Kuta Beach - Bali   
1                  Nusa Dua Beach   
2  Sacred Monkey Forest Sanctuary   
3                     Sanur Beach   
4                  Seminyak Beach   

                                AttractionAddress  
0                                            Kuta  
1  Semenanjung Nusa Dua, Nusa Dua 80517 Indonesia  
2         Jl. Monkey Forest, Ubud 80571 Indonesia  
3                                           Sanur  
4                                        Seminyak  

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 5 columns):
 #   Column        

In [27]:
# Remove the empty padding rows and any exact duplicate rows.
# `dropna()` drops every row containing at least one NaN, which already covers
# fully-empty rows, so a separate `dropna(how='all')` pass is not needed.
item = item.dropna().drop_duplicates()

In [28]:
# Clean the item table: cast the three ID columns to int (they were read as
# float) and trim whitespace from the free-text columns.
# Written as one chain so no column is assigned onto the deduplicated frame,
# which can raise SettingWithCopyWarning.
item = (
    item
    .astype({
        'AttractionId': int,
        'AttractionCityId': int,
        'AttractionTypeId': int,
    })
    .assign(
        Attraction=lambda df: df['Attraction'].str.strip(),
        AttractionAddress=lambda df: df['AttractionAddress'].str.strip(),
    )
)

In [29]:
# Verify the cleaned item table: schema first, then the first five rows.
item.info()
item.head(5)

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   AttractionId       30 non-null     int64 
 1   AttractionCityId   30 non-null     int64 
 2   AttractionTypeId   30 non-null     int64 
 3   Attraction         30 non-null     object
 4   AttractionAddress  30 non-null     object
dtypes: int64(3), object(2)
memory usage: 1.4+ KB


Unnamed: 0,AttractionId,AttractionCityId,AttractionTypeId,Attraction,AttractionAddress
0,369,1,13,Kuta Beach - Bali,Kuta
1,481,1,13,Nusa Dua Beach,"Semenanjung Nusa Dua, Nusa Dua 80517 Indonesia"
2,640,1,63,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"
3,650,1,13,Sanur Beach,Sanur
4,673,1,13,Seminyak Beach,Seminyak


Updated Item

In [30]:
# Profile the raw updated-item table: preview, schema, null counts, duplicate rows.
print("HEAD", updated_item.head(), sep="\n")

print("\nINFO")
updated_item.info()

print("\nMISSING VALUES", updated_item.isna().sum(), sep="\n")

print("\nDUPLICATES", updated_item.duplicated().sum(), sep="\n")

HEAD
   AttractionId  AttractionCityId AttractionTypeId  \
0           369                 1               13   
1           481                 1               13   
2           640                 1               63   
3           650                 1               13   
4           673                 1               13   

                       Attraction  \
0               Kuta Beach - Bali   
1                  Nusa Dua Beach   
2  Sacred Monkey Forest Sanctuary   
3                     Sanur Beach   
4                  Seminyak Beach   

                                AttractionAddress  
0                                            Kuta  
1  Semenanjung Nusa Dua, Nusa Dua 80517 Indonesia  
2         Jl. Monkey Forest, Ubud 80571 Indonesia  
3                                           Sanur  
4                                        Seminyak  

INFO
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1698 entries, 0 to 1697
Data columns (total 5 columns):
 #   Column            

In [31]:
# Compare which attraction IDs each of the two item tables covers.
updated_ids = set(updated_item['AttractionId'])
item_ids = set(item['AttractionId'].dropna().astype(int))

print("Total IDs in Item.csv:", len(item_ids))
print("Total IDs in updated_item.csv:", len(updated_ids))

# Set difference in each direction: what one table has that the other lacks.
missing_in_updated = item_ids.difference(updated_ids)
extra_in_updated = updated_ids.difference(item_ids)

print("\nIDs present in Item but missing in updated_item:", len(missing_in_updated))
print("IDs present in updated_item but not in Item:", len(extra_in_updated))

Total IDs in Item.csv: 30
Total IDs in updated_item.csv: 1698

IDs present in Item but missing in updated_item: 0
IDs present in updated_item but not in Item: 1668


In [32]:
# Check key uniqueness in updated_item: IDs should be unique, names may repeat.
attraction_ids = updated_item['AttractionId']
attraction_names = updated_item['Attraction']

print("Total rows:", len(updated_item))
print("Unique AttractionId:", attraction_ids.nunique())
print("Unique Attraction names:", attraction_names.nunique())

duplicate_ids = attraction_ids.duplicated().sum()
duplicate_names = attraction_names.duplicated().sum()
print("Duplicate AttractionId count:", duplicate_ids)
print("Duplicate Attraction name count:", duplicate_names)

Total rows: 1698
Unique AttractionId: 1698
Unique Attraction names: 1564
Duplicate AttractionId count: 0
Duplicate Attraction name count: 134


In [33]:
# Frequency of each raw `AttractionTypeId` value; show the 15 most common.
type_id_counts = updated_item['AttractionTypeId'].value_counts()
type_id_counts.head(15)

AttractionTypeId
Museum    367
Temple    340
Market    325
Park      318
Beach     318
13          6
72          3
93          2
76          2
63          2
2           2
61          2
91          2
92          1
10          1
Name: count, dtype: int64

In [34]:
# Replace numeric attraction-type codes with their labels from `type_df`.
# `AttractionTypeId` mixes label strings (e.g. "Museum") with numeric IDs stored as strings.
raw_type = updated_item['AttractionTypeId']

# True only for string values made up entirely of numeric characters. Missing or
# non-string entries become False rather than NaN, so the mask is always a
# clean boolean that `.loc` accepts.
numeric_mask = raw_type.map(
    lambda value: isinstance(value, str) and value.isnumeric()
).astype(bool)

print("Numeric rows:", numeric_mask.sum())

type_mapping = dict(zip(type_df['AttractionTypeId'], type_df['AttractionType']))

# Resolve the labels in a separate Series and write them back once, instead of
# first storing integers inside the string column and then mapping in place.
numeric_codes = raw_type[numeric_mask].astype(int)
resolved_labels = numeric_codes.map(type_mapping)

# Codes with no entry in `type_df` become NaN; report them rather than letting
# them pass unnoticed.
unmapped_codes = sorted(set(numeric_codes[resolved_labels.isna()]))
if unmapped_codes:
    print("Type codes with no match in type_df (left as NaN):", unmapped_codes)

updated_item.loc[numeric_mask, 'AttractionTypeId'] = resolved_labels

# The column now holds labels only, so rename it accordingly.
updated_item = updated_item.rename(
    columns={'AttractionTypeId': 'AttractionType'}
)

# NOTE(review): the pre-existing text labels and the mapped labels overlap
# without matching (e.g. "Beach" vs "Beaches") — confirm whether these should
# be harmonised before modelling.
updated_item['AttractionType'].value_counts()


Numeric rows: 30


AttractionType
Museum                            367
Temple                            340
Market                            325
Park                              318
Beach                             318
Beaches                             6
Points Of Interest & Landmarks      3
Waterfalls                          2
Religious Sites                     2
Nature & Wildlife Areas             2
Ancient Ruins                       2
National Parks                      2
Volcanos                            2
Water Parks                         1
Ballets                             1
Flea & Street Markets               1
Caverns & Caves                     1
Speciality Museums                  1
Spas                                1
Neighborhoods                       1
Historic Sites                      1
History Museums                     1
Name: count, dtype: int64

In [35]:
# From here on `item` refers to an independent copy of the updated item table,
# replacing the smaller frame loaded from Item.csv.
item = updated_item.copy(deep=True)

Merge User + Geography

In [36]:
# Start the enriched user table from an independent copy so `user` stays untouched.
user_full = user.copy(deep=True)

In [37]:
# Add the city name to each user. Left join keeps users whose CityId has no match.
# `validate` raises MergeError if `CityId` is not unique in `city`, instead of
# silently duplicating user rows.
user_full = user_full.merge(
    city[['CityId', 'CityName']],
    on='CityId',
    how='left',
    validate='many_to_one',
)

In [38]:
# Add the country name to each user. Left join keeps users whose CountryId has no match.
# `validate` raises MergeError if `CountryId` is not unique in `country`,
# instead of silently duplicating user rows.
user_full = user_full.merge(
    country[['CountryId', 'Country']],
    on='CountryId',
    how='left',
    validate='many_to_one',
)

In [39]:
# Add the region name to each user. Left join keeps users whose RegionId has no match.
# `validate` raises MergeError if `RegionId` is not unique in `region`, instead
# of silently duplicating user rows.
user_full = user_full.merge(
    region[['RegionId', 'Region']],
    on='RegionId',
    how='left',
    validate='many_to_one',
)

In [40]:
# Add the continent name to each user. Left join keeps users whose ContinentId has no match.
# `validate` raises MergeError if `ContinentId` is not unique in `continent`,
# instead of silently duplicating user rows.
user_full = user_full.merge(
    continent[['ContinentId', 'Continent']],
    on='ContinentId',
    how='left',
    validate='many_to_one',
)

In [41]:
# Sanity-check the enriched user table: dimensions, then the first five rows.
print(user_full.shape)
user_full.head(5)

(33526, 9)


Unnamed: 0,UserId,ContinentId,RegionId,CountryId,CityId,CityName,Country,Region,Continent
0,14,5,20,155,220,Lagos,Portugal,Southern Europe,Europe
1,16,3,14,101,3098,Jakarta,Indonesia,South East Asia,Asia
2,20,4,15,109,4303,Gold Coast,Australia,Australia,Australia & Oceania
3,23,1,4,22,154,Meyerton,South Africa,Southern Africa,Africa
4,25,3,14,101,3098,Jakarta,Indonesia,South East Asia,Asia


Merge Transaction + User

In [42]:
# Attach user geography to every transaction. Left join keeps transactions whose
# user is missing from `user_full`.
# `validate` raises MergeError if `UserId` is not unique in `user_full`, instead
# of silently duplicating transactions.
master_df = transaction.merge(
    user_full,
    on="UserId",
    how="left",
    validate="many_to_one",
)

print("After user merge:", master_df.shape)

After user merge: (52930, 15)


In [43]:
# Attach attraction details to every transaction. Left join keeps transactions
# whose attraction is missing from `item`.
# `validate` raises MergeError if `AttractionId` is not unique in `item`,
# instead of silently duplicating transactions.
master_df = master_df.merge(
    item,
    on="AttractionId",
    how="left",
    validate="many_to_one",
)

print("Final master_df shape:", master_df.shape)
master_df.head()

Final master_df shape: (52930, 19)


Unnamed: 0,TransactionId,UserId,VisitYear,VisitMonth,AttractionId,Rating,VisitMode,ContinentId,RegionId,CountryId,CityId,CityName,Country,Region,Continent,AttractionCityId,AttractionType,Attraction,AttractionAddress
0,3,70456,2022,10,640,5,Couples,5.0,21.0,163.0,4341.0,Guildford,United Kingdom,Western Europe,Europe,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"
1,8,7567,2022,10,640,5,Friends,2.0,8.0,48.0,464.0,Ontario,Canada,Northern America,America,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"
2,9,79069,2022,10,640,5,Family,2.0,9.0,54.0,774.0,Brazil,Brazil,South America,America,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"
3,10,31019,2022,10,640,3,Family,5.0,17.0,135.0,583.0,Zurich,Switzerland,Central Europe,Europe,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"
4,15,43611,2022,10,640,3,Couples,5.0,21.0,163.0,1396.0,Manchester,United Kingdom,Western Europe,Europe,1,Nature & Wildlife Areas,Sacred Monkey Forest Sanctuary,"Jl. Monkey Forest, Ubud 80571 Indonesia"


In [44]:
# Count missing values per column, listing the columns with the most gaps first.
null_counts = (
    master_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

print(null_counts)

Region               40
CityId                8
RegionId              8
Continent             8
Country               8
CityName              8
CountryId             8
ContinentId           8
VisitYear             0
TransactionId         0
UserId                0
VisitMonth            0
AttractionId          0
Rating                0
VisitMode             0
AttractionCityId      0
AttractionType        0
Attraction            0
AttractionAddress     0
dtype: int64


In [45]:
# Drop every row that is missing any value (unmatched user geography).
master_df = master_df.dropna()

# The left merges introduced NaN into these ID columns, which upcast them to
# float64 (e.g. 5.0, 4341.0). With the nulls gone, restore them to integers so
# the exported CSV does not carry IDs like "5.0".
geo_id_columns = ['ContinentId', 'RegionId', 'CountryId', 'CityId']
master_df = master_df.astype({column: 'int64' for column in geo_id_columns})

print("Final shape after dropping nulls:", master_df.shape)

Final shape after dropping nulls: (52890, 19)


In [46]:
# Confirm that no column has missing values left.
print(master_df.isna().sum())

TransactionId        0
UserId               0
VisitYear            0
VisitMonth           0
AttractionId         0
Rating               0
VisitMode            0
ContinentId          0
RegionId             0
CountryId            0
CityId               0
CityName             0
Country              0
Region               0
Continent            0
AttractionCityId     0
AttractionType       0
Attraction           0
AttractionAddress    0
dtype: int64


In [50]:
# Write the cleaned master table to disk without the pandas index.
output_path = Path("../data/processed/master_cleaned.csv")

# `to_csv` does not create missing directories, so make sure the target folder exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

master_df.to_csv(output_path, index=False)