In [1]:
import pandas as pd

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_colwidth', 100)
pd.options.mode.chained_assignment = None  # default='warn'

## Read file

In [3]:
listing_file = pd.read_csv('listings.csv', encoding='iso-8859-1', low_memory=False)

## Delete useless columns

In [4]:
cols = [c for c in listing_file.columns if c.lower()[-3:] != 'url']
listing = listing_file[cols].drop(
    columns=[
        'name', 'host_name', 'scrape_id', 'market', 'host_location', 'smart_location', 
        'country_code', 'country', 'city', 'state', 'neighbourhood_cleansed', 'notes',
        'neighbourhood_group_cleansed', 'neighbourhood', 'transit', 'calendar_last_scraped',
        'calendar_updated', 'street', 'host_neighbourhood', 'last_scraped'
    ]
)
listing = listing.fillna(0)

listing.shape

(24194, 70)

## Replace type with numbers

In [5]:
def make_dict(table, col_name: str) -> dict:
    a_dict = {}
    for b in range(len(table)):
        a_dict.update({table[col_name][b]:table.Num_Represent[b]})
        
    return a_dict

In [6]:
def replacing(col_name, target_dict):
    for i in range(len(listing[col_name])):
        listing[col_name][i] = target_dict[listing[col_name][i]]

In [7]:
def make_table(col_name: str) -> pd.core.frame.DataFrame:
    grouping = listing.groupby([col_name])
    grouping_list = grouping.size().reset_index(name='counts')
    gp_col = grouping_list.sort_values(['counts'], ascending=False)
    
    replace_num = [i + 1 for i in range(len(grouping_list))]
    replace_num.sort(reverse=True)
    gp_col['Num_Represent'] = replace_num
    return gp_col

### Replace bed type

In [8]:
bed_gp_table = make_table('bed_type')
bed_gp_table

Unnamed: 0,bed_type,counts,Num_Represent
4,Real Bed,24051,5
2,Futon,69,4
3,Pull-out Sofa,54,3
0,Airbed,11,2
1,Couch,9,1


In [9]:
bed_dict = make_dict(bed_gp_table, 'bed_type')
bed_dict

{'Airbed': 2, 'Couch': 1, 'Futon': 4, 'Pull-out Sofa': 3, 'Real Bed': 5}

In [10]:
replacing('bed_type', bed_dict)

In [11]:
listing['bed_type']

0        5
1        4
2        5
3        5
4        5
        ..
24189    5
24190    5
24191    5
24192    5
24193    5
Name: bed_type, Length: 24194, dtype: object

### Replace room type

In [12]:
room_gp_table = make_table('room_type')
room_gp_table

Unnamed: 0,room_type,counts,Num_Represent
0,Entire home/apt,14868,4
2,Private room,8454,3
1,Hotel room,436,2
3,Shared room,436,1


In [13]:
room_dict = make_dict(room_gp_table, 'room_type')
room_dict

{'Entire home/apt': 4, 'Hotel room': 2, 'Private room': 3, 'Shared room': 1}

In [14]:
replacing('room_type', room_dict)

In [15]:
listing['room_type']

0        3
1        3
2        4
3        3
4        3
        ..
24189    1
24190    1
24191    1
24192    3
24193    4
Name: room_type, Length: 24194, dtype: object

### Replace property type

In [16]:
property_gp_table = make_table('property_type')
property_gp_table

Unnamed: 0,property_type,counts,Num_Represent
1,Apartment,13049,37
22,House,6352,36
33,Townhouse,1530,35
13,Condominium,559,34
30,Serviced apartment,516,33
36,Villa,381,32
19,Guesthouse,323,31
18,Guest suite,305,30
14,Cottage,214,29
3,Bed and breakfast,199,28


In [17]:
property_dict = make_dict(property_gp_table, 'property_type')
property_dict

{'Aparthotel': 17,
 'Apartment': 37,
 'Barn': 9,
 'Bed and breakfast': 28,
 'Boat': 13,
 'Boutique hotel': 25,
 'Bungalow': 27,
 'Cabin': 21,
 'Camper/RV': 18,
 'Campsite': 14,
 'Casa particular (Cuba)': 2,
 'Castle': 6,
 'Chalet': 8,
 'Condominium': 34,
 'Cottage': 29,
 'Dome house': 10,
 'Earth house': 16,
 'Farm stay': 24,
 'Guest suite': 30,
 'Guesthouse': 31,
 'Hostel': 23,
 'Hotel': 19,
 'House': 36,
 'Houseboat': 5,
 'Hut': 11,
 'Kezhan (China)': 4,
 'Loft': 26,
 'Minsu (Taiwan)': 3,
 'Nature lodge': 12,
 'Other': 22,
 'Serviced apartment': 33,
 'Tent': 15,
 'Tiny house': 20,
 'Townhouse': 35,
 'Train': 1,
 'Treehouse': 7,
 'Villa': 32}

In [18]:
replacing('property_type', property_dict)

In [19]:
listing['property_type']

0        36
1        37
2        37
3        36
4        36
         ..
24189    36
24190    36
24191    36
24192    35
24193    37
Name: property_type, Length: 24194, dtype: object

## Convert string

### Delete '$'

In [20]:
for c in listing:
    try: 
        listing[c] = listing[c].str.replace('$', '')
    except AttributeError:
        pass

### Convert f to 0 and t to 1, nan to 0

In [22]:
amount = len(listing)
for tf in listing:
    if 'is' or 'profile_pic' or 'identity_verified' in str(tf):
        print(f"Processing column {tf}")
        for i in range(amount):
            if listing[tf][i] == 'f':
                listing[tf][i] = 0
            elif listing[tf][i] == 't':
                listing[tf][i] = 1
            else:
                listing[tf][i] = 0


Processing id
Processing experiences_offered


KeyboardInterrupt: 

In [None]:
cancel = make_table('cancellation_policy')
cancel

In [None]:
listing