In [31]:
import pandas as pd
import numpy as np
import os

In [32]:
def merge_csv_files(folder_path, output_file):
    csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

    merged_data = pd.DataFrame()

    for file in csv_files:
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        merged_data = pd.concat([merged_data, df], ignore_index=True)

    merged_data.to_csv(output_file, index=False)
    print(f"Merged data saved to {output_file}.")

In [33]:
if __name__ == "__main__":
    data_folder = "MagnaOpus_GitHub/transform/housing2"
    output_csv_file = "MagnaOpus_GitHub/transform/merged/housing2_merged.csv"
    merge_csv_files(data_folder, output_csv_file)

Merged data saved to /Users/aryrubi/Desktop/Python_projects/MagnaOpus/cleaning/merged/housing2_merged.csv.


In [34]:
df = pd.read_csv(output_csv_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142 entries, 0 to 141
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Precio        142 non-null    object
 1   URLs          142 non-null    object
 2   Detalle1      142 non-null    object
 3   Detalle2      142 non-null    object
 4   Detalle3      142 non-null    object
 5   Detalle4      142 non-null    object
 6   Detalle5      142 non-null    object
 7   Detalle6      141 non-null    object
 8   Detalle7      131 non-null    object
 9   Detalle8      93 non-null     object
 10  Detalle9      13 non-null     object
 11  Info_basica1  142 non-null    object
 12  Info_basica2  142 non-null    object
 13  Info_basica3  142 non-null    object
 14  Info_basica4  140 non-null    object
 15  Info_basica5  138 non-null    object
 16  Info_basica6  126 non-null    object
 17  Info_basica7  116 non-null    object
 18  Info_basica8  71 non-null     object
 19  Info_bas

In [35]:
new_column_names = {'Detalle1': 'Tipo de Inmueble',
                    'Detalle2': 'Ubicación',
                    'Detalle3': 'Total construido',
                    'Detalle4': 'Dormitorios',
                    'Detalle5': "Baños",
                    'Detalle6': 'Antiguedad',
                    'Detalle7': 'Superficie cubierta',
                    'Detalle8': 'Plantas',
                    'Detalle9': '(REF.',
                    'Info_basica1': 'Espacios',
                    'Info_basica2': 'Recámaras',
                    'Info_basica3': 'Baños',
                    'Info_basica4': 'Medios baños',
                    'Info_basica5': 'Estacionamientos',
                    'Info_basica6': 'Pisos',
                    'Info_basica7': 'Antigüedad',
                    'Info_basica8': 'Mantenimiento',
                    'Info_basica9': 'Condición',
                    'Info_basica10': 'Situación',
                    'Info_basica11': 'Condición'}
df = df.rename(columns=new_column_names)


In [36]:
def organize_data(df):
    data_list = []
    max_length = 0

    for index, row in df.iterrows():
        new_data = {}
        for column in df.columns:
            new_data[column] = []

        for column, value in row.items():
            if isinstance(value, str):
                for new_column in new_data.keys():
                    if value.startswith(new_column):
                        new_data[new_column].append(value.replace(new_column, '').strip())
                        break
                else:
                    new_data[column].append(value)
            else:
                new_data[column].append(value if isinstance(value, list) else [value] * max_length)

        data_list.append(new_data)

    return pd.DataFrame(data_list)



In [37]:
def extract_single_value_or_none(df):
    """
    Extract the single value from each list or assign None if the list is empty in each column of the DataFrame.

    Parameters:
        df (pd.DataFrame): The DataFrame to extract single values from.

    Returns:
        pd.DataFrame: A new DataFrame with single values extracted or None if the list is empty.
    """
    extracted_data = {}

    for column in df.columns:
        extracted_data[column] = df[column].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

    return pd.DataFrame(extracted_data)


In [38]:
organized_df = organize_data(df)
extracted_df = extract_single_value_or_none(organized_df)

In [39]:
extracted_df.head()

Unnamed: 0,Precio,URLs,Tipo de Inmueble,Ubicación,Total construido,Dormitorios,Baños,Antiguedad,Superficie cubierta,Plantas,(REF.,Espacios,Recámaras,Medios baños,Estacionamientos,Pisos,Antigüedad,Mantenimiento,Condición
0,"USD275,000",https://www.innovabienesraiceshonduras.com/p/2...,Casa,Tres Caminos,366 m²,3,3,A Estrenar,,[],1509),: 4,: 3,: 1,: 2,,: A Estrenar,: $ 0,[]
1,"USD404,695",https://www.innovabienesraiceshonduras.com/p/4...,Casa,Lomas del Mayab,518 m²,3,2,14,,2,2055),: 3,: 3,: 1,: 6,: 2,: 14 Años,: $ 0,[]
2,"USD976,000",https://www.innovabienesraiceshonduras.com/p/1...,Casa,Tepeyac,620 m²,5,5,32,,3,1214),: 6,: 5,: 1,: 2,: 3,: 32 Años,: $ 0,[]
3,"USD260,000",https://www.innovabienesraiceshonduras.com/p/1...,Casa,Res. El Trapiche,218 m²,3,2,8,218 m²,2,301),: 3,: 3,: 1,: 2,: 2,: 8 Años,: $ 0,[]
4,"USD270,000",https://www.innovabienesraiceshonduras.com/p/1...,Casa,Palma Real,329 m²,3,4,20,,2,229),: 5,: 3,,: 2,: 2,: 20 Años,: $ 0,[]


In [40]:
df_2 = extracted_df.drop(columns=['URLs', 'Recámaras', 'Tipo de Inmueble', 'Antiguedad', 'Superficie cubierta', 'Plantas', 'Espacios', 'Pisos', 'Antigüedad', 'Mantenimiento', 'Condición'])

In [41]:
df_2.isna().sum()


Precio               0
Ubicación            0
Total construido    16
Dormitorios          3
Baños                4
(REF.                0
Medios baños        22
Estacionamientos    23
dtype: int64

In [42]:
fill_values = {
    'Medios baños': float(0),
    'Estacionamientos': float(0)
}

# Fill specific columns with their corresponding fill values
final_df = df_2.fillna(fill_values, inplace=False)
final_df.isna().sum()

Precio               0
Ubicación            0
Total construido    16
Dormitorios          3
Baños                4
(REF.                0
Medios baños         0
Estacionamientos     0
dtype: int64

In [43]:
clean_df = final_df.dropna()
clean_df = clean_df[clean_df['Precio'] != 'N.A']

In [44]:
clean_df['(REF.'] = clean_df['(REF.'].str.replace(r'\(|\)', '')
clean_df['Precio'] = clean_df['Precio'].str.replace(r'[^0-9.-]', '', regex=True).astype(float)
clean_df['Total construido'] = clean_df['Total construido'].str.replace(r'[^0-9.-]', '', regex=True).astype(float)
clean_df['Dormitorios'] = clean_df['Dormitorios'].str.replace(r'[^0-9.-]', '', regex=True).astype(float)
clean_df['Baños'] = clean_df['Baños'].str.replace(r'[^0-9.-]', '', regex=True).astype(float)
clean_df['Medios baños'] = clean_df['Medios baños'].str.replace(r'[^0-9.-]', '', regex=True).astype(float)
clean_df['Estacionamientos'] = clean_df['Estacionamientos'].str.replace(r'[^0-9.-]', '', regex=True).astype(float)

  clean_df['(REF.'] = clean_df['(REF.'].str.replace(r'\(|\)', '')


In [45]:
fill_values = {
    'Medios baños': float(0),
    'Estacionamientos': float(0)
}

# Fill specific columns with their corresponding fill values
clean_df = clean_df.fillna(fill_values, inplace=False)

In [46]:

# Transform values in 'Medios baños' by dividing them by 2
clean_df['Medios baños_Transformed'] = np.divide(clean_df['Medios baños'], 2)

# Sum two specific columns: 'Column1' and 'Column2'
clean_df['Total_Baños'] = clean_df['Baños'] + clean_df['Medios baños_Transformed']

final_df = clean_df.drop(columns=['Medios baños', 'Baños', 'Medios baños_Transformed'])


In [47]:
final_df.rename(columns={'(REF.': 'ID', 'Precio': 'usd_price', 'Ubicación': 'location', 'Dormitorios': 'bedrooms', 'Total_Baños': 'bathrooms', 'Estacionamientos': 'parking', 'Total construido': 'surface_m²'}, inplace=True)

# Change data types of columns
final_df['usd_price'] = final_df['usd_price'].astype(float)
final_df['bedrooms'] = final_df['bedrooms'].astype(int)
final_df['bathrooms'] = final_df['bathrooms'].astype(float)
final_df['surface_m²'] = final_df['surface_m²'].astype(float)
final_df['parking'] = final_df['parking'].astype(float)

In [48]:
duplicates = final_df[final_df['ID'].duplicated()]

# Dropping rows with duplicated IDs
df_unique = final_df[~final_df['ID'].duplicated()]
df_unique = df_unique.drop(columns=['ID'])

In [49]:
df_unique.to_csv("MagnaOpus_GitHub/cleaning/data/web2_transformed_data.csv", index = False)