# Install and Imports

In [511]:
# install(packages) and import_packages(packages) functions
import subprocess
import importlib

def install(packages):
    """
    Install Python packages with pip and print a summary of the outcome.

    Args:
        packages (iterable of str): Package names to install. The iterable
            itself is never modified; names are processed in sorted order so
            the summary is easy to read.

    Every package gets its own ``pip install`` call, run through the current
    interpreter (``python -m pip``) so the package is installed for the Python
    that is executing this code. The results are grouped and printed as:
    - "Installed": pip succeeded and reported "Successfully installed"
    - "Already installed": pip succeeded and reported "Requirement already satisfied"
    - "Failed": pip exited with an error, or its output matched neither message
    - "Error at installing": the pip process itself could not be run
    """
    import sys  # local import: only needed to locate the running interpreter

    installed = []
    already_installed = []
    failed = []
    errors = []

    # Prefer the running interpreter's pip; fall back to whatever "pip" is on
    # PATH when the interpreter path is unknown (sys.executable can be empty).
    pip_command = [sys.executable, '-m', 'pip'] if sys.executable else ['pip']

    # sorted() works on a copy, so the caller's list keeps its order and any
    # iterable (tuple, set, ...) is accepted.
    for package in sorted(packages):
        try:
            result = subprocess.run(
                pip_command + ['install', package],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
        except Exception as e:
            errors.append(f"Error at installing {package}: {e}")
            continue

        if result.returncode != 0:
            # A non-zero exit code is a failure regardless of what was printed.
            failed.append(package)
        elif "Successfully installed" in result.stdout:
            # Checked first: a fresh install also prints "Requirement already
            # satisfied" for every dependency that was already present.
            installed.append(package)
        elif "Requirement already satisfied" in result.stdout:
            already_installed.append(package)
        else:
            failed.append(package)

    if installed:
        print(f"Installed: {', '.join(installed)}")
    if already_installed:
        print(f"Already installed: {', '.join(already_installed)}")
    if failed:
        print(f"Failed: {', '.join(failed)}")
    if errors:
        print("\n".join(errors))

    print("")

def import_packages(packages):
    """
    Import a list of Python packages, binding known ones to their usual alias.

    Args:
        packages (list of str): Names of the packages to import.

    Each package is imported with ``importlib`` and stored in the global
    namespace of the module that defines this function. Packages with a
    predefined alias (``numpy`` as ``np``, ``pandas`` as ``pd``) are stored
    under that alias; every other package is stored under its own name.
    A message is printed for each successful import. A package that cannot
    be imported is reported with a hint to check its installation, and the
    remaining packages are still processed.
    """
    aliases = {"numpy": "np", "pandas": "pd"}  # Dictionary of aliases
    for package in packages:
        # Only the import itself is guarded, so unrelated errors are not
        # mistaken for a missing package.
        try:
            module = importlib.import_module(package)
        except ImportError:
            print(f"{package} couldn't be imported, is it installed?")
            continue

        if package in aliases:  # Import with alias
            globals()[aliases[package]] = module
            print(f"{package} imported as {aliases[package]}")
        else:  # Import without alias
            globals()[package] = module
            print(f"{package} imported")

In [512]:
# Modules this notebook relies on; the same list is handed to install() and
# import_packages() in the next cell.
# NOTE(review): gzip, json and ast are standard-library modules, so pip has
# nothing to install for them - consider passing only third-party names to install().
packages = ["pandas","gzip","json","ast"]

In [None]:
# Install the listed packages with pip, then bind each module as a global
# (pandas becomes available under its alias pd).
install(packages)
import_packages(packages)

# Extraction

In [514]:
# extract function (supports CSV, JSON Lines, gzip-compressed JSON Lines, and gzip-compressed Python-literal lines)
def extract(file_path):
    """
    Extracts data from various file formats and returns it as a DataFrame.

    The format is chosen from the file extension:
    - ``.csv``: read with ``pd.read_csv``
    - ``.json.gz``: gzip file with one JSON document per line
    - ``.json``: plain file with one JSON document per line
    - ``.ast.gz``: gzip file with one Python literal per line
    - Other formats can be added in the future.

    Blank lines in the gzip formats are skipped. Any error raised while
    reading is caught and reported on stdout, not propagated.

    :param file_path: Path to the file (string or path-like object).
    :return: DataFrame with extracted data, or None if the format is
        unsupported or extraction fails.
    """
    try:
        # str() lets pathlib.Path objects through the extension checks below.
        path = str(file_path)

        # Determine file format by extension
        if path.endswith('.csv'):
            df = pd.read_csv(path)
            print("CSV data extraction successful.\n")
        elif path.endswith('.json.gz'):  # must be tested before '.json'
            with gzip.open(path, 'rb') as file:
                # A blank line is not valid JSON, so skip it and keep going.
                data = [json.loads(row) for row in file if row.strip()]
            df = pd.DataFrame(data)
            print("Gzip JSON data extraction successful.\n")
        elif path.endswith('.json'):
            df = pd.read_json(path, lines=True)
            print("JSON data extraction successful.\n")
        elif path.endswith('.ast.gz'):
            with gzip.open(path, 'rb') as file:
                # literal_eval only evaluates Python literals, never arbitrary code.
                data = [
                    ast.literal_eval(row.decode('utf-8'))
                    for row in file
                    if row.strip()
                ]
            df = pd.DataFrame(data)
            print("AST Gzip data extraction successful.\n")
        else:
            print(f"Unsupported file format: {file_path}")
            return None

        return df

    except Exception as e:
        print(f"Error during extraction: {e}")
        return None


In [515]:
# df_info function
def df_info(df):
    """
    Print a diagnostic summary of a DataFrame.

    The report contains, in order: a header with the DataFrame's name, the
    number of duplicated rows, the number of fully empty rows, the first
    three rows, the ``DataFrame.info()`` output, the ``describe()``
    statistics and the number of missing values per column.

    :param df: DataFrame to summarise.
    :return: None; everything is printed to stdout.
    """
    # Use the first global variable bound to this exact object as the display
    # name; fall back to a generic label when the frame has no global name
    # (for example a temporary passed directly to the function).
    df_name = next(
        (name for name, obj in globals().items() if obj is df),
        "DataFrame",
    )
    # determine line length based on df name
    line_length = max(64, len(df_name) + 10)
    print(f"\n---Info-{df_name.replace(' ', '-')}{'-' * (line_length - len(df_name) - 10)}\n")
    print(f"---duplicated values:{df.duplicated().sum()}")
    print(f"---Number of fully empty rows: {df.isnull().all(axis=1).sum()}")
    print(f"\n---Dataframe head:\n{df.head(3).to_string()}\n")
    print(f"\n---Dataframe info:\n")
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that would add a stray "None" line).
    df.info()
    print(f"\n----Dataframe description:\n{df.describe().to_string()}")
    print(f"\n---Missing values:\n{df.isna().sum()}")

In [None]:
# Read both raw splits; extract() yields None for a file it cannot read.
df_train, df_test = map(extract, ("Raw data/train.csv", "Raw data/test.csv"))

# Data Preprocessing

In [517]:
# column_unique and unique_count functions
def column_unique(df):
    """
    Print the unique values of every column of a DataFrame.

    For each column the number of unique values is printed. When every value
    in the column is different only that fact is reported; otherwise the
    unique values are listed in sorted order (numeric order for numeric
    columns with missing values last, string order for everything else).
    A separator line follows each column.

    :param df: DataFrame to inspect.
    :return: None; everything is printed to stdout.
    """
    for column in df.columns:
        unique_values = df[column].unique()  # Get unique values in the column
        n_unique = len(unique_values)

        print(f"Column: {column}")
        if n_unique == len(df[column]):
            # Listing the values would just repeat the whole column.
            print(f"Unique values ({n_unique}): all values are different")
        else:
            if pd.api.types.is_numeric_dtype(df[column]):
                # NaN compares False against everything, which makes a plain
                # sorted() return an arbitrary order; sort on (is-missing, value)
                # so real numbers are ordered and the missing marker goes last.
                sorted_values = sorted(
                    unique_values, key=lambda value: (pd.isna(value), value)
                )
            else:
                # Mixed or string data: compare the string form of each value.
                sorted_values = sorted(unique_values, key=str)

            print(f"Unique values ({n_unique}): {', '.join(map(str, sorted_values))}")
        print("-" * 50)

def unique_count(df, column_name):
    """
    Print how often each distinct value occurs in one column.

    The counts come from ``Series.value_counts()`` and are listed in value
    order: numeric order for numeric columns, string order otherwise.
    If the column is not present in the DataFrame a message says so.

    :param df: DataFrame to inspect.
    :param column_name: Name of the column whose values are counted.
    :return: None; everything is printed to stdout.
    """
    # Guard clause: nothing to count for an unknown column.
    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist in the dataframe.")
        return

    series = df[column_name]
    frequency_pairs = series.value_counts().items()

    # Numeric columns sort on the (value, count) pairs directly; anything
    # else is ordered by the string form of the value.
    if pd.api.types.is_numeric_dtype(series):
        sort_key = None
    else:
        sort_key = lambda pair: str(pair[0])
    ordered_pairs = sorted(frequency_pairs, key=sort_key)

    print(f"\nUnique values in column '{column_name}':")
    for value, count in ordered_pairs:
        print(f"{value}: {count} times")


In [None]:
# Print the summary report (duplicates, empty rows, head, info, describe,
# missing values) for each raw frame before any cleaning.
df_info(df_train)
df_info(df_test)

In [None]:
# List the unique values of every column in the training frame.
column_unique(df_train)

In [None]:
# Value frequencies for the training frame, one report per column.
for column in ("Survived", "Pclass", "Sex", "Age", "SibSp", "Parch",
               "Ticket", "Fare", "Cabin", "Embarked"):
    unique_count(df_train, column)

In [None]:
# List the unique values of every column in the test frame.
column_unique(df_test)

In [None]:
# Value frequencies for the test frame. The class column is spelled "Pclass"
# (as in the training frame); "PClass" only triggers the "does not exist" message.
unique_count(df_test, "Pclass")
unique_count(df_test, "Sex")
unique_count(df_test, "Age")
unique_count(df_test, "SibSp")
unique_count(df_test, "Parch")
unique_count(df_test, "Ticket")
unique_count(df_test, "Fare")
unique_count(df_test, "Cabin")
unique_count(df_test, "Embarked")

# Transform

In [523]:
# df_clean function
def df_clean(df):
    """
    Clean a DataFrame in place and report what was removed.

    Steps, in order:
    1. drop duplicated rows,
    2. drop rows in which every value is missing,
    3. reset the index to a fresh 0..n-1 range.

    Rows that are only partially empty are kept.

    :param df: DataFrame to clean; it is modified in place.
    :return: None; progress is printed to stdout.
    """
    # Use the first global variable bound to this exact object as the display
    # name; fall back to a generic label when the frame has no global name.
    df_name = next(
        (name for name, obj in globals().items() if obj is df),
        "DataFrame",
    )
    # determine line length based on df name
    line_length = max(60, len(df_name) + 10)
    print(f"\n---Cleaning-{df_name.replace(' ', '-')}{'-' * (line_length - len(df_name) - 10)}\n")

    # Count the duplicates before dropping them so the number can be reported.
    duplicates_before = df.duplicated().sum()
    df.drop_duplicates(inplace=True)
    print(f"{duplicates_before} duplicate rows removed.")

    # dropna(how="all") removes only rows that are entirely empty, so report
    # that row count rather than the total number of missing cells.
    empty_rows = df.isnull().all(axis=1).sum()
    df.dropna(how="all", inplace=True)
    print(f"{empty_rows} fully empty rows removed.")
    df.reset_index(drop=True, inplace=True)
    print("Index reset.")

In [None]:
# Clean both frames in place (duplicates and fully empty rows dropped, index reset) ...
df_clean(df_train)
df_clean(df_test)

# ... then print the summary reports again to check the result.
df_info(df_train)
df_info(df_test)

## New rows creation

## External Data enrichment

# Load

In [526]:
import os

# to_csv does not create missing folders, so make sure the output directory
# exists before writing the cleaned frames.
os.makedirs('dataframes', exist_ok=True)
df_train.to_csv('dataframes/df_train.csv', index=False, header=True)
df_test.to_csv('dataframes/df_test.csv', index=False, header=True)