In [40]:
import pandas as pd
import numpy as np
import os

In [41]:
def merge_csv_files(folder_path, output_file):
    """Concatenate every ``.csv`` file in a folder and write the result to one CSV.

    Parameters:
        folder_path (str): Directory whose ``.csv`` files are merged. Only the
            directory's direct entries are considered; other files are ignored.
        output_file (str): Destination path of the merged CSV.

    The merged file is written without the index column. Files are read in
    sorted name order so the row order is reproducible between runs. If
    ``output_file`` itself lives in ``folder_path`` it is skipped, so running
    the merge again does not feed the previous result back into itself.
    """
    output_abs = os.path.abspath(output_file)

    csv_files = sorted(
        file for file in os.listdir(folder_path)
        if file.endswith('.csv')
        and os.path.abspath(os.path.join(folder_path, file)) != output_abs
    )

    frames = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

    # One concat call instead of growing the frame inside the loop; pd.concat
    # raises on an empty list, so an empty frame is used when nothing was found.
    merged_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    merged_data.to_csv(output_file, index=False)
    print(f"Merged data saved to {output_file}.")

In [42]:
# Merge every CSV found in the housing1 folder into a single file. Both paths
# are relative to the current working directory. In a notebook kernel __name__
# is "__main__", so the guard passes and the names assigned here stay available
# to the cells below (output_csv_file is read back in the next cell).
if __name__ == "__main__":
    data_folder = "MagnaOpus_GitHub/transform/housing1"
    output_csv_file = "MagnaOpus_GitHub/transform/merged/housing1_merged.csv"
    merge_csv_files(data_folder, output_csv_file)

Merged data saved to /Users/aryrubi/Desktop/Python_projects/MagnaOpus/cleaning/merged/housing1_merged.csv.


In [43]:
# Read the merged CSV back from disk and print its column summary
# (column names, non-null counts and dtypes).
df = pd.read_csv(output_csv_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247 entries, 0 to 246
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Precio         247 non-null    object
 1   URLs           247 non-null    object
 2   Detalle1       247 non-null    object
 3   Detalle2       247 non-null    object
 4   Detalle3       247 non-null    object
 5   Detalle4       247 non-null    object
 6   Detalle5       245 non-null    object
 7   Detalle6       239 non-null    object
 8   Detalle7       227 non-null    object
 9   Detalle8       196 non-null    object
 10  Detalle9       84 non-null     object
 11  Info_basica1   247 non-null    object
 12  Info_basica2   247 non-null    object
 13  Info_basica3   238 non-null    object
 14  Info_basica4   234 non-null    object
 15  Info_basica5   219 non-null    object
 16  Info_basica6   172 non-null    object
 17  Info_basica7   112 non-null    object
 18  Info_basica8   96 non-null    

In [44]:
# Map the positional scraper columns (DetalleN / Info_basicaN) to descriptive labels.
# NOTE: 'Detalle5' and 'Info_basica3' both map to 'Baños', so the renamed frame
# carries that label twice. 'Antiguedad' (Detalle6) and 'Antigüedad'
# (Info_basica8) differ only by the diaeresis and remain two separate labels.
new_column_names = {'Detalle1': 'Tipo de Inmueble',
                    'Detalle2': 'Ubicación',
                    'Detalle3': 'Total construido',
                    'Detalle4': 'Dormitorios',
                    'Detalle5': "Baños",
                    'Detalle6': 'Antiguedad',
                    'Detalle7': 'Superficie cubierta',
                    'Detalle8': 'Plantas',
                    'Detalle9': '(REF.',
                    'Info_basica1': 'Espacios',
                    'Info_basica2': 'Recámaras',
                    'Info_basica3': 'Baños',
                    'Info_basica4': 'Medios baños',
                    'Info_basica5': 'Estacionamientos',
                    'Info_basica6': 'Zonificación',
                    'Info_basica7': 'Pisos',
                    'Info_basica8': 'Antigüedad',
                    'Info_basica9': 'Situación',
                    'Info_basica10': 'Mantenimiento',
                    'Info_basica11': 'Orientación',
                    'Info_basica12': 'Disposición',
                    'Info_basica13': 'Condición'}
# Columns that are not keys of the mapping keep their current name.
df = df.rename(columns=new_column_names)


In [45]:
def organize_data(df):
    """Re-file every text cell under the column whose name it starts with.

    For each row a dict mapping every column name to a list is built. A string
    cell that begins with a column name is appended to that column's list with
    the leading label removed and surrounding whitespace stripped; the first
    matching name in ``df.columns`` order wins. A string cell that matches no
    column name is appended unchanged to its own column. A non-string cell
    contributes the cell itself when it is a list, otherwise an empty list.

    Parameters:
        df (pd.DataFrame): Frame whose column names double as value prefixes.

    Returns:
        pd.DataFrame: One row per input row; every cell is a list.
    """
    column_names = list(df.columns)
    data_list = []

    for _, row in df.iterrows():
        new_data = {name: [] for name in column_names}

        for column, value in row.items():
            if not isinstance(value, str):
                # Non-text cells (e.g. missing values) leave an empty-list
                # placeholder in their own column; lists are kept as they are.
                new_data[column].append(value if isinstance(value, list) else [])
                continue

            for new_column in new_data:
                if value.startswith(new_column):
                    # Remove only the leading label. str.replace would also
                    # delete any later occurrence of the same text in the value.
                    new_data[new_column].append(value[len(new_column):].strip())
                    break
            else:
                new_data[column].append(value)

        data_list.append(new_data)

    return pd.DataFrame(data_list)



In [46]:
def extract_single_value_or_none(df):
    """
    Collapse list-valued cells to their first element.

    Parameters:
        df (pd.DataFrame): Frame whose cells are expected to hold lists.

    Returns:
        pd.DataFrame: A new frame with the same columns. Each cell is the first
        element of the original list, or None when the cell is an empty list
        or is not a list at all.
    """
    def _first_or_none(cell):
        # Anything that is not a non-empty list collapses to None.
        if not isinstance(cell, list) or len(cell) == 0:
            return None
        return cell[0]

    return pd.DataFrame({name: df[name].apply(_first_or_none) for name in df.columns})


In [47]:
# Re-file each cell under the column it is labelled with (cells become lists),
# then collapse every list to its first element (None for empty lists).
organized_df = organize_data(df)
extracted_df = extract_single_value_or_none(organized_df)

In [48]:
# Preview the first rows of the extracted frame.
extracted_df.head()

Unnamed: 0,Precio,URLs,Tipo de Inmueble,Ubicación,Total construido,Dormitorios,Baños,Antiguedad,Superficie cubierta,Plantas,...,Estacionamientos,Zonificación,Pisos,Antigüedad,Situación,Mantenimiento,Orientación,Disposición,Condición,Detalle10
0,"USD288,000",https://www.csbienesraices.com/p/4520734-Casa-...,Casa,Castaños Sur,269 m²,3,2,A Estrenar,269 m²,1,...,: 3,: D3,: 1,: A Estrenar,: Vacía,: $ 0,: Oeste,: Frente,[],[]
1,"USD395,000",https://www.csbienesraices.com/p/4671889-Casa-...,Casa,Cerro Azul,289 m²,3,3,A Estrenar,257 m²,2,...,: 4,: UR,: 2,: A Estrenar,: Vacía,: $ 75,: Sur,: Frente,[],[]
2,"USD635,000",https://www.csbienesraices.com/p/5122560-Casa-...,Casa,Lomas del Mayab,456 m²,4,4,A Estrenar,456 m²,2,...,: 2,: R1,: 2,: A Estrenar,,: $ 0,[],[],[],[]
3,"USD600,000",https://www.csbienesraices.com/p/5175000-Casa-...,Casa,Florencia Norte,526 m²,4,4,30,526 m²,2,...,: 4,: R1,: 2,: 30 Años,: Habitada,: $ 0,: Sur,: Frente,: Bueno,[]
4,"USD115,500",https://www.csbienesraices.com/p/5160319-Casa-...,Casa,Tegucigalpa,105 m²,3,2,A Estrenar,,2,...,: 1,: R2,: 2,: A Estrenar,,: $ 0,[],[],: Excelente,[]


In [49]:
# Columns removed from the extracted frame before cleaning.
unused_columns = [
    'URLs',
    'Recámaras',
    'Tipo de Inmueble',
    'Antiguedad',
    'Superficie cubierta',
    'Plantas',
    'Espacios',
    'Zonificación',
    'Pisos',
    'Antigüedad',
    'Situación',
    'Mantenimiento',
    'Orientación',
    'Disposición',
    'Condición',
    'Detalle10',
]
df_2 = extracted_df.drop(columns=unused_columns)

In [50]:
# Count the missing values left in each remaining column.
df_2.isna().sum()


Precio               0
Ubicación            0
Total construido     8
Dormitorios         34
Baños                6
(REF.                0
Medios baños        62
Estacionamientos    39
dtype: int64

In [51]:
# Missing values in these two columns are replaced with 0.0.
fill_values = {'Medios baños': 0.0, 'Estacionamientos': 0.0}

# Only the columns named in fill_values are filled; the others keep their gaps.
final_df = df_2.fillna(value=fill_values)
final_df.isna().sum()

Precio               0
Ubicación            0
Total construido     8
Dormitorios         34
Baños                6
(REF.                0
Medios baños         0
Estacionamientos     0
dtype: int64

In [52]:
# Discard rows that still contain a missing value, then rows whose price is
# the literal 'N.A' placeholder.
rows_without_gaps = final_df.dropna()
has_price = rows_without_gaps['Precio'] != 'N.A'
clean_df = rows_without_gaps[has_price]

In [53]:
# Strip the parentheses left around the listing reference. regex=True is passed
# explicitly: since pandas 2.0 str.replace treats the pattern as a literal by
# default, in which case this alternation would match nothing and the
# parentheses would stay in the value.
clean_df['(REF.'] = clean_df['(REF.'].str.replace(r'\(|\)', '', regex=True)

# In every numeric column keep only digits, dots and minus signs, then cast to float.
for numeric_column in ['Precio', 'Total construido', 'Dormitorios', 'Baños', 'Medios baños', 'Estacionamientos']:
    clean_df[numeric_column] = clean_df[numeric_column].str.replace(r'[^0-9.-]', '', regex=True).astype(float)

  clean_df['(REF.'] = clean_df['(REF.'].str.replace(r'\(|\)', '')


In [54]:
# Replace any gaps in the half-bath and parking counts with 0.0.
fill_values = dict.fromkeys(['Medios baños', 'Estacionamientos'], 0.0)
clean_df = clean_df.fillna(value=fill_values)

In [55]:
# Each half bath counts as half a bathroom.
clean_df['Medios baños_Transformed'] = clean_df['Medios baños'] / 2

# Total bathrooms = full baths + halved half-bath count.
clean_df['Total_Baños'] = clean_df['Baños'] + clean_df['Medios baños_Transformed']

# The three source columns are dropped; only Total_Baños is carried forward.
bath_source_columns = ['Medios baños', 'Baños', 'Medios baños_Transformed']
final_df = clean_df.drop(columns=bath_source_columns)


In [56]:
# Output column names for the exported dataset.
output_names = {
    '(REF.': 'ID',
    'Precio': 'usd_price',
    'Ubicación': 'location',
    'Dormitorios': 'bedrooms',
    'Total_Baños': 'bathrooms',
    'Estacionamientos': 'parking',
    'Total construido': 'surface_m²',
}

# Target dtype for each numeric column; bedrooms is the only integer column.
output_dtypes = {
    'usd_price': float,
    'bedrooms': int,
    'bathrooms': float,
    'surface_m²': float,
    'parking': float,
}

final_df = final_df.rename(columns=output_names).astype(output_dtypes)

In [57]:
# True for every row whose ID already appeared in an earlier row.
repeated_id = final_df['ID'].duplicated()

# Rows flagged as repeats; not used further in this section.
duplicates = final_df[repeated_id]

# Keep the first row seen for each ID, then drop the identifier column.
df_unique = final_df[~repeated_id].drop(columns=['ID'])

In [58]:
# Write the de-duplicated frame to CSV without the index column. The path is
# relative to the current working directory.
df_unique.to_csv("MagnaOpus_GitHub/cleaning/data/web1_transformed_data.csv", index = False)