## ANALYSIS OF THE DATASET



In [200]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import ast


In [201]:
# Locate the 'dataset' folder that sits beside the current working directory
# (<cwd>/../dataset) and normalise it to an absolute path.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, 'dataset'))

# Absolute locations of the two CSV splits inside that folder.
test_file, train_file = (
    os.path.abspath(os.path.join(parent_dir, csv_name))
    for csv_name in ('test.csv', 'train.csv')
)

# Show the resolved locations so a wrong working directory is easy to spot.
print("Test file path:", test_file)
print("Train file path:", train_file)


Test file path: c:\Users\eloip\Documents\datathon_2024\Dream_Team_2024\dataset\test.csv
Train file path: c:\Users\eloip\Documents\datathon_2024\Dream_Team_2024\dataset\train.csv


In [202]:
# Read both splits with identical parser settings (test first, then train).
# low_memory=False turns off pandas' chunked parsing of each file.
df_test, df_train = (
    pd.read_csv(csv_path, sep=',', low_memory=False)
    for csv_path in (test_file, train_file)
)


In [203]:
# Each DataFrame paired with the name used for its report file
dataframes = [(df_train, 'df_train'), (df_test, 'df_test')]

# Non-null proportion of every column, stored per frame in that frame's own
# column order
diff = {'df_test': [], 'df_train': []}

# Write one markdown table per DataFrame describing each of its columns
for df, df_name in dataframes:
    total_rows = len(df)  # Total number of rows in the DataFrame
    with open(f'{df_name}_column_info.md', 'w', encoding='utf-8') as file:
        # Write the header for the markdown table
        file.write("| Column Name | Non-null Count | Proportion of Non-null | number of unique\n")
        file.write("|--------------|----------------|------------------------| ------------------\n")

        for column in df.columns:
            non_null_count = df[column].notnull().sum()  # Count of non-null values
            non_null_proportion = non_null_count / total_rows  # Proportion of non-null values
            unique_count = df[column].nunique()  # Number of unique values

            # Remember the proportion for the train/test comparison below
            diff[df_name].append(non_null_proportion)
            file.write(f"| {column} | {non_null_count} | {non_null_proportion:.4f} | {unique_count} |\n")

    print(f"Data has been written to '{df_name}_column_info.md'")

# The two frames do not necessarily share the same column layout, so the
# proportions are matched by column NAME rather than by position; comparing
# by position would pair up unrelated columns as soon as one frame has an
# extra or reordered column.
train_proportions = dict(zip(df_train.columns, diff['df_train']))
test_proportions = dict(zip(df_test.columns, diff['df_test']))

with open('column_difference.md', 'w', encoding='utf-8') as file:
    file.write("| Column Name | Non-null Difference |\n")
    file.write("|--------------|----------------|\n")
    for col in df_test.columns:
        if col in train_proportions:
            difference = abs(test_proportions[col] - train_proportions[col])
        else:
            # Column exists only in the test frame: nothing to compare against
            difference = 'n/a'
        file.write(f"| {col} | {difference}|\n")

Data has been written to 'df_train_column_info.md'
Data has been written to 'df_test_column_info.md'


In [204]:
# Given a threshold, print for each DataFrame the number of rows in which more
# than that fraction of the columns is non-null.
threshold = 0.7  # fraction (0-1) of columns that must be populated
for df, df_name in dataframes:
    # Row-wise share of populated cells
    non_null_fraction_per_row = df.notna().mean(axis="columns")
    # Number of rows strictly above the threshold
    rows_above_threshold = non_null_fraction_per_row.gt(threshold).sum()
    print(rows_above_threshold)

91576
19139


In [205]:
def convert_columns_to_int(df):
    """Cast every column of ``df`` in place to int64 or float32 where possible.

    For each column the function first tries ``int64``. The integer result is
    kept only when it is lossless, i.e. every value compares equal to the same
    value cast to ``float64``; this prevents fractional floats such as ``1.5``
    from being silently truncated to ``1``. Otherwise ``float32`` is tried, and
    columns that support neither cast (e.g. free text) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Errors meaning "this column cannot be represented in the target dtype"
    cast_errors = (ValueError, TypeError, OverflowError)

    for col in df.columns:
        values = df[col]

        try:
            as_int = values.astype(np.int64)
            # Reject the integer cast if it dropped a fractional part
            is_lossless = bool((as_int == values.astype(np.float64)).all())
        except cast_errors:
            is_lossless = False

        if is_lossless:
            df[col] = as_int
            continue

        try:
            df[col] = values.astype(np.float32)
        except cast_errors:
            # Not numeric at all: keep the column exactly as it was
            pass
    return df

# convert_columns_to_int assigns the converted columns onto the frame it is
# given and returns that same frame, so calling it is enough to update the
# objects held in `dataframes`.
for df, df_name in dataframes:
    convert_columns_to_int(df)


In [206]:

def string_to_list(input_string):
    """Parse a string holding a Python literal (typically a list) into a value.

    Parameters
    ----------
    input_string : object
        Cell value to parse. Only ``str`` values are parsed.

    Returns
    -------
    object
        * the value produced by ``ast.literal_eval`` when ``input_string`` is
          a string containing a valid literal (whatever literal type it is),
        * ``[]`` when ``input_string`` is a string that cannot be parsed,
        * ``input_string`` itself, unchanged, when it is not a string
          (e.g. NaN or an already-parsed list).
    """
    if not isinstance(input_string, str):
        return input_string  # Leave non-string values untouched

    try:
        return ast.literal_eval(input_string)
    except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
        # literal_eval reports malformed input with any of these; e.g. a dict
        # literal with an unhashable key raises TypeError, not ValueError.
        return []

def one_hot_from_list(df, column_name):
    """Add one 0/1 indicator column per distinct element of a list column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    column_name : str
        Column whose cells are lists. Cells that are not lists (NaN, scalars,
        ...) are replaced by empty lists in ``df`` before encoding.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with a new column ``one_hot_<element>`` for
        every distinct element. New columns are appended in sorted order of
        ``str(element)`` so the resulting layout is reproducible.
    """
    # Replace NaN or invalid entries with empty lists
    df[column_name] = df[column_name].apply(lambda x: x if isinstance(x, list) else [])

    # Collect every distinct element appearing in any row
    unique_elements = {element for lst in df[column_name] for element in lst}

    # A set of strings iterates in hash order, which varies between
    # interpreter runs; sorting keeps the column order stable.
    for element in sorted(unique_elements, key=str):
        one_hot_col_name = f"one_hot_{element}"
        # Bind `element` as a default so the lambda does not depend on the
        # loop variable's later values.
        df[one_hot_col_name] = df[column_name].apply(
            lambda lst, element=element: 1 if element in lst else 0
        )

    return df

# Columns whose cells are stringified lists and get expanded into indicators
columns_to_one_hot = ["Characteristics.LotFeatures"]

# Further column names kept for reference; this cell does not read the list
rest_not_used = [
    "ImageData.features_reso.results",
    "ImageData.room_type_reso.results",
    "Structure.Cooling",
    "Structure.Heating",
    "Structure.ParkingFeatures",
]

# NOTE(review): each frame is encoded on its own, so the set of one_hot_*
# columns depends on the values present in that frame - confirm this is intended.
for df, df_name in dataframes:
    for col in columns_to_one_hot:
        # Turn the string cells into real lists, then add the indicator columns
        df[col] = df[col].apply(string_to_list)
        df = one_hot_from_list(df, col)


In [207]:
def get_column_types(df):
    """Return the dtype of each column of ``df`` as a Series indexed by column name."""
    column_dtypes = df.dtypes
    return column_dtypes

def get_unique_types(df):
    """Return the set of distinct column dtypes present in ``df``."""
    return {column_dtype for column_dtype in df.dtypes}

def save_dataset(df, filename):
    """Write ``df`` to ``filename`` as CSV without the index, then print a confirmation."""
    df.to_csv(path_or_buf=filename, index=False)
    confirmation = f"Dataset saved to {filename}"
    print(confirmation)

# For each processed frame: print its column count and the set of dtypes it
# contains, then write it to '<df_name>.csv' inside the dataset folder.
for df, df_name in dataframes:
    print(len(df.columns))
    print(get_unique_types(df))
    p = os.path.abspath(os.path.join(parent_dir, df_name + '.csv'))
    save_dataset(df, p)





110
{dtype('int64'), dtype('O'), dtype('float32')}
Dataset saved to c:\Users\eloip\Documents\datathon_2024\Dream_Team_2024\dataset\df_train.csv
109
{dtype('int64'), dtype('O'), dtype('float32')}
Dataset saved to c:\Users\eloip\Documents\datathon_2024\Dream_Team_2024\dataset\df_test.csv
