In [32]:
import os
import pandas as pd
import re

In [33]:
def read_csv_files_from_folder(folder_path):
    """
    List the CSV files that sit directly inside a folder.

    Parameters:
        folder_path (str): Path to the folder containing CSV files.

    Returns:
        list: Full paths (``folder_path`` joined with each file name) of the
            regular files whose name ends in ".csv" (case-insensitive),
            sorted by file name. Empty when ``folder_path`` is missing or is
            not a directory; a message is printed in that case.
    """
    # isdir (rather than exists) so that a path pointing at a regular file is
    # reported like a missing folder instead of making os.listdir raise.
    if not os.path.isdir(folder_path):
        print("Folder does not exist.")
        return []

    csv_files = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Skip sub-folders, even ones whose name happens to end in ".csv".
        if os.path.isfile(file_path) and filename.lower().endswith('.csv'):
            csv_files.append(file_path)
    return csv_files

# NOTE(review): absolute, machine-specific path; the notebook only runs as-is
# on a machine that has this folder. Consider a configurable data directory.
folder_path = "/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper"
# Collect the paths of the CSV files found directly inside that folder.
csv_files = read_csv_files_from_folder(folder_path)
print("CSV files in the folder:")
# One path per line, as a quick check of what will be loaded.
for csv_file in csv_files:
    print(csv_file)

CSV files in the folder:
/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper\hotel_dot_com_review_data_arizona.csv
/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper\hotel_dot_com_review_data_california.csv
/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper\hotel_dot_com_review_data_colorado.csv
/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper\hotel_dot_com_review_data_district_of_columbia.csv
/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper\hotel_dot_com_review_data_florida.csv
/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper\hotel_dot_com_review_data_illinois.csv
/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper\hotel_dot_com_review_data_indiana.csv
/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper\hotel_dot_com_review_data_maryland.csv
/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper\hotel_dot_com_review_data_massachusetts.csv
/User

In [34]:
def combine_csv_files(folder_path, filename_prefix="hotel_dot_com_review_data_"):
    """
    Combine all CSV files from a folder into a single DataFrame.

    Every file's rows are tagged with a ``Region`` column derived from the
    file name. When the name starts with ``filename_prefix``, the region is
    everything between that prefix and the first "." with underscores turned
    into spaces, so "<prefix>district_of_columbia.csv" yields
    "district of columbia". Otherwise the region falls back to the last
    underscore-separated token of the name (up to its first ".").

    Parameters:
        folder_path (str): Path to the folder containing CSV files.
        filename_prefix (str): Leading part of the file names that precedes
            the region. Pass "" or None to always use the last-token fallback.

    Returns:
        pandas.DataFrame: Rows of all CSV files (processed in file-name
            order) with a fresh 0..n-1 index and the added ``Region`` column.
            An empty DataFrame when the folder is missing, is not a
            directory, or holds no CSV files.
    """
    # isdir (rather than exists) so that a path pointing at a regular file is
    # reported like a missing folder instead of making os.listdir raise.
    if not os.path.isdir(folder_path):
        print("Folder does not exist.")
        return pd.DataFrame()

    frames = []
    # os.listdir order is arbitrary; sort so the combined row order is stable.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Only regular files with a ".csv" extension (case-insensitive).
        if not (os.path.isfile(file_path) and filename.lower().endswith('.csv')):
            continue

        region_name = ''
        if filename_prefix and filename.startswith(filename_prefix):
            # Keep the whole multi-word region, not just its last word.
            region_name = filename[len(filename_prefix):].split('.')[0].replace('_', ' ')
        if not region_name:
            # Name does not follow the prefix convention: last "_" token.
            region_name = filename.split('_')[-1].split('.')[0]

        data = pd.read_csv(file_path)
        data['Region'] = region_name
        frames.append(data)

    # pd.concat raises on an empty list, so handle "no CSV files" explicitly.
    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end; concatenating inside the loop re-copies all
    # previously loaded rows on every iteration.
    return pd.concat(frames, ignore_index=True)

# Example usage: load every CSV in the scraper output folder into one frame.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
folder_path = "/Users/victo/Desktop/IS3107/Tripadvisor Scrapper/Hotel.com Scraper"
hotel_dot_com_data = combine_csv_files(folder_path)
print("Combined DataFrame:")
# print() gives the plain-text repr of the frame rather than the notebook's rich table.
print(hotel_dot_com_data)

Combined DataFrame:
                                         Hotel             Overall Rating  \
0                        Hyatt Regency Phoenix    9.0 out of 10 Wonderful   
1                  Kayenta Monument Valley Inn         7.2 out of 10 Good   
2      Home2 Suites by Hilton Phoenix Downtown  9.4 out of 10 Exceptional   
3     Gila River Resorts & Casinos – Vee Quiva    9.2 out of 10 Wonderful   
4                Best Western Downtown Phoenix    8.4 out of 10 Very good   
...                                        ...                        ...   
4754                            Hotel Pentagon         6.4 out of 10 Good   
4755                            District Hotel         6.6 out of 10 Good   
4756                        Duo Housing Hostel    8.0 out of 10 Very good   
4757                           The Baron Hotel         6.8 out of 10 Good   
4758                        Duo Nomad - Hostel         7.8 out of 10 Good   

               Total Reviews Cleanliness Staff and Serv

In [35]:
# Expand region labels that were shortened to the last word of a multi-word
# name; rows that already carry any other label are left untouched.
hotel_dot_com_data['Region'] = hotel_dot_com_data['Region'].replace({
    'carolina': 'north carolina',
    'columbia': 'district of columbia',
})
# Bare expression so the notebook displays the updated frame.
hotel_dot_com_data

Unnamed: 0,Hotel,Overall Rating,Total Reviews,Cleanliness,Staff and Service,Amenities,Property Conditions and Facilities,Eco-Friendliness,Region
0,Hyatt Regency Phoenix,9.0 out of 10 Wonderful,"1,611 verified reviews",9.4/10,9.2/10,8.6/10,9.2/10,9.0/10,arizona
1,Kayenta Monument Valley Inn,7.2 out of 10 Good,"1,136 verified reviews",8.0/10,7.4/10,6.4/10,7.0/10,7.2/10,arizona
2,Home2 Suites by Hilton Phoenix Downtown,9.4 out of 10 Exceptional,39 verified reviews,9.8/10,9.6/10,9.6/10,9.6/10,9.6/10,arizona
3,Gila River Resorts & Casinos – Vee Quiva,9.2 out of 10 Wonderful,"1,004 verified reviews",9.6/10,9.2/10,9.0/10,9.4/10,9.2/10,arizona
4,Best Western Downtown Phoenix,8.4 out of 10 Very good,"1,002 verified reviews",8.6/10,8.6/10,8.4/10,8.4/10,8.2/10,arizona
...,...,...,...,...,...,...,...,...,...
4754,Hotel Pentagon,6.4 out of 10 Good,"1,068 verified reviews",6.6/10,7.4/10,6.4/10,6.0/10,6.6/10,washington
4755,District Hotel,6.6 out of 10 Good,"1,002 verified reviews",7.2/10,7.0/10,5.6/10,6.2/10,7.0/10,washington
4756,Duo Housing Hostel,8.0 out of 10 Very good,384 verified reviews,8.0/10,8.2/10,8.0/10,7.8/10,8.0/10,washington
4757,The Baron Hotel,6.8 out of 10 Good,"1,002 verified reviews",7.4/10,7.2/10,5.8/10,6.2/10,5.8/10,washington


In [36]:
def map_region_shortform(region):
    """
    Translate a region name into its two-letter short form.

    The name is lower-cased and stripped of surrounding whitespace before
    the lookup, so 'Arizona ' and 'arizona' both give 'AZ'.

    Parameters:
        region (str): Region name, e.g. 'arizona' or 'district of columbia'.

    Returns:
        str or None: The two-letter code, or None when the name is not one
            of the regions listed below.
    """
    shortform_by_region = {
        'district of columbia': 'DC',
        'newyork': 'NY',
        'arizona': 'AZ',
        'california': 'CA',
        'texas': 'TX',
        'illinois': 'IL',
        'pennsylvania': 'PA',
        'florida': 'FL',
        'indiana': 'IN',
        'ohio': 'OH',
        'michigan': 'MI',
        'north carolina': 'NC',
        'tennessee': 'TN',
        'washington': 'WA',
        'massachusetts': 'MA',
        'maryland': 'MD',
        'colorado': 'CO',
    }
    normalized = region.lower().strip()
    # dict.get yields None for names that are not in the table.
    return shortform_by_region.get(normalized)


In [37]:
# Add a two-letter code per row; map_region_shortform returns None for region
# names it does not list, so unmatched rows end up with a missing value here.
hotel_dot_com_data['region_shortform'] = hotel_dot_com_data['Region'].apply(map_region_shortform)

In [38]:
# Display the frame to inspect the Region / region_shortform columns.
hotel_dot_com_data

Unnamed: 0,Hotel,Overall Rating,Total Reviews,Cleanliness,Staff and Service,Amenities,Property Conditions and Facilities,Eco-Friendliness,Region,region_shortform
0,Hyatt Regency Phoenix,9.0 out of 10 Wonderful,"1,611 verified reviews",9.4/10,9.2/10,8.6/10,9.2/10,9.0/10,arizona,AZ
1,Kayenta Monument Valley Inn,7.2 out of 10 Good,"1,136 verified reviews",8.0/10,7.4/10,6.4/10,7.0/10,7.2/10,arizona,AZ
2,Home2 Suites by Hilton Phoenix Downtown,9.4 out of 10 Exceptional,39 verified reviews,9.8/10,9.6/10,9.6/10,9.6/10,9.6/10,arizona,AZ
3,Gila River Resorts & Casinos – Vee Quiva,9.2 out of 10 Wonderful,"1,004 verified reviews",9.6/10,9.2/10,9.0/10,9.4/10,9.2/10,arizona,AZ
4,Best Western Downtown Phoenix,8.4 out of 10 Very good,"1,002 verified reviews",8.6/10,8.6/10,8.4/10,8.4/10,8.2/10,arizona,AZ
...,...,...,...,...,...,...,...,...,...,...
4754,Hotel Pentagon,6.4 out of 10 Good,"1,068 verified reviews",6.6/10,7.4/10,6.4/10,6.0/10,6.6/10,washington,WA
4755,District Hotel,6.6 out of 10 Good,"1,002 verified reviews",7.2/10,7.0/10,5.6/10,6.2/10,7.0/10,washington,WA
4756,Duo Housing Hostel,8.0 out of 10 Very good,384 verified reviews,8.0/10,8.2/10,8.0/10,7.8/10,8.0/10,washington,WA
4757,The Baron Hotel,6.8 out of 10 Good,"1,002 verified reviews",7.4/10,7.2/10,5.8/10,6.2/10,5.8/10,washington,WA


In [39]:
def transform_hotel_dot_com_data(data):
    """
    Rename, clean and reorder the combined hotel_dot_com frame.

    The input frame is not modified; a new frame is returned.

    Parameters:
        data (pandas.DataFrame): Frame with the columns 'Hotel',
            'Overall Rating', 'Total Reviews', 'Cleanliness',
            'Staff and Service', 'Amenities',
            'Property Conditions and Facilities', 'Eco-Friendliness',
            'Region' and 'region_shortform'.

    Returns:
        pandas.DataFrame: Columns, in order: hotel_name, overall_rating,
            total_num_of_reviews, region, region_shortform, location, value,
            facilities, cleanliness.
            - overall_rating: first number in the text as a float
              ("9.0 out of 10 Wonderful" -> 9.0, "10 out of 10" -> 10.0).
            - total_num_of_reviews: first number in the text as an int, with
              thousands separators removed ("1,611 verified reviews" -> 1611).
            - facilities / cleanliness: the text before the first "/"
              ("9.4/10" -> "9.4"); these stay strings.
            - location / value: placeholder columns filled with None.
            Cells that are missing (not strings) or contain no number become
            missing values instead of raising.

    Raises:
        KeyError: If 'Amenities' or 'Eco-Friendliness', or any column needed
            for the cleaning / final selection, is absent.
    """
    new_column_names = {'Hotel': 'hotel_name',
                        'Overall Rating': 'overall_rating',
                        'Total Reviews': 'total_num_of_reviews',
                        'Cleanliness': 'cleanliness',
                        'Staff and Service': 'service',
                        'Region': 'region',
                        'Property Conditions and Facilities': 'facilities'
                        }
    data = data.rename(columns=new_column_names)

    columns_to_drop = ['Amenities', 'Eco-Friendliness']
    data = data.drop(columns=columns_to_drop)

    def extract_first_number(text):
        # "9.4/10" -> "9.4"; missing cells (NaN/None) pass through as None.
        if not isinstance(text, str):
            return None
        return text.split('/')[0]

    def extract_numeric_value(text):
        # Missing cells (NaN/None) are not strings and cannot be searched.
        if not isinstance(text, str):
            return None
        # Either a comma-grouped number ("1,611") or a plain run of digits
        # ("1611"); a lone \d{1,3} would cut "1611" down to "161".
        numeric_part = re.search(r'\d{1,3}(?:,\d{3})+|\d+', text)
        if numeric_part:
            # Remove commas and convert to integer
            return int(numeric_part.group(0).replace(',', ''))
        return None

    def extract_numeric_rating(text):
        if not isinstance(text, str):
            return None
        # The fractional part is optional so "10 out of 10" is read as 10.0.
        numeric_rating = re.search(r'\d+(?:\.\d+)?', text)
        if numeric_rating:
            return float(numeric_rating.group())
        return None

    data['cleanliness'] = data['cleanliness'].apply(extract_first_number)
    # NOTE(review): 'service' is cleaned here but is not part of the column
    # selection below, so it does not reach the returned frame - confirm
    # whether that omission is intended.
    data['service'] = data['service'].apply(extract_first_number)
    data['facilities'] = data['facilities'].apply(extract_first_number)
    # Placeholder columns with no values in this data set.
    data['location'] = None
    data['value'] = None
    data['total_num_of_reviews'] = data['total_num_of_reviews'].apply(extract_numeric_value)
    data['overall_rating'] = data['overall_rating'].apply(extract_numeric_rating)

    # Rearranging columns (any column not listed here is dropped)
    data = data[['hotel_name', 'overall_rating', 'total_num_of_reviews' ,'region', 'region_shortform', 'location', 'value', 'facilities', 'cleanliness']]

    return data
    

    

In [40]:
# NOTE(review): the result is bound back to the same name, so re-running this
# cell passes the already-transformed frame in again; that frame no longer has
# the columns transform_hotel_dot_com_data drops. Re-run from the load cell instead.
hotel_dot_com_data = transform_hotel_dot_com_data(hotel_dot_com_data)
print(hotel_dot_com_data)
# Save the combined data as a CSV file
# (relative path, so it lands in the kernel's working directory; the row index is not written)
hotel_dot_com_data.to_csv('hotel_dot_com_data.csv', index=False)


                                    hotel_name  overall_rating  \
0                        Hyatt Regency Phoenix             9.0   
1                  Kayenta Monument Valley Inn             7.2   
2      Home2 Suites by Hilton Phoenix Downtown             9.4   
3     Gila River Resorts & Casinos – Vee Quiva             9.2   
4                Best Western Downtown Phoenix             8.4   
...                                        ...             ...   
4754                            Hotel Pentagon             6.4   
4755                            District Hotel             6.6   
4756                        Duo Housing Hostel             8.0   
4757                           The Baron Hotel             6.8   
4758                        Duo Nomad - Hostel             7.8   

      total_num_of_reviews      region region_shortform location value  \
0                     1611     arizona               AZ     None  None   
1                     1136     arizona               AZ    