 Programming for Data Science (Autumn 2023)


# Programming Task 1

### Requirement FR1 - Develop a function to read a single column from a CSV file

In [3]:
def read_a_column(file_path, column_index):
    """
    Reads a single column from a CSV file.

    Args:
        file_path (str): The path to the CSV file.
        column_index (int): The index of the column to be read.

    Returns:
        tuple: A tuple containing the header (str) and the column values (list).

    Raises:
        IndexError: If the specified column index is out of bounds.
    """
    # Initializing an empty list to store the column values
    output = []

    # Opening the CSV file in read mode
    with open(file_path, 'r') as file:
        # Reading the header line from the file
        header_line = file.readline()

        # Removing unwanted characters from the header line
        header_line = header_line.replace("ï»¿" or '\n', "")

        # Splitting the header line into a list of words
        headers = header_line.split(',')

        try:
            # Extracting the header for the specified column index
            header = headers[column_index]
        except IndexError:
            # Raise an IndexError if the specified column index is out of bounds
            raise IndexError("Column index is out of bounds.")

        # Looping through the remaining lines in the file to extract column values
        for line in file.readlines():
            # Removing newline characters from each line
            line = line.replace("\n", "")

            # Splitting each line into a list of words
            words = line.split(',')

            try:
                # Attempting to convert the value to an integer and appending to the output list
                output.append(int(words[column_index]))
            except ValueError:
                # If conversion fails, append the value as is to the output list
                output.append(words[column_index])

    return header, output

# Example usage
header, list1 = read_a_column('task1.csv', 5)
print(header, list1)


vaccine [76, 79, 77, 47, 84, 73, 76, 66, 86, 62, 71, 79, 79, 76, 70, 74, 85, 63, 70, 76, 67, 77, 63, 90, 65, 54, 73, 78, 60, 74, 78, 91, 76, 87, 60, 74, 70, 88, 76, 70, 72, 77, 72, 79, 75, 72, 69, 74, 69, 99, 74, 87, 80, 82, 80, 80, 71, 72, 78, 85, 80, 69, 70, 80, 71, 88, 68, 84, 55, 62, 80, 88, 70, 70, 83, 77, 83, 63, 91, 70, 100, 79, 90, 70, 64, 69, 73, 81, 81, 68, 73, 64, 98, 61, 80, 74, 67, 74, 67, 62, 64, 74, 92, 69, 71, 58, 65, 91, 72, 67, 85, 76, 60, 76, 71, 66, 58, 71, 64, 58, 47, 44, 74, 67, 74, 79, 73, 56, 70, 74, 78, 71, 80, 83, 73, 61, 62, 79, 78, 76, 80, 62, 64, 64, 65, 61, 69, 64, 83, 76, 74, 52, 70, 66, 64, 63, 65, 76, 89, 78, 68, 95, 59, 85, 78, 76, 56, 83, 53, 71, 73, 72, 78, 85, 87, 70, 77, 75, 68, 88, 69, 70, 91, 70, 68, 65, 68, 78, 71, 0, 61, 80, 84, 65, 56, 69, 78, 74, 76, 80, 67, 77, 80, 72, 60, 76, 75, 61, 89, 85]


### Requirement FR2 - Develop a function to read CSV data from a file into memory

In [8]:
def read_into_memory(file_path):
    """
    Reads all columns from a CSV file into a dictionary.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        dict: A dictionary where keys are column headers and values are lists of column values.
    """
    # Initializing a flag to read all columns
    flag = True

    # Initializing an output dictionary
    output = {}

    # Initializing a column counter
    i = 0

    # Looping until all columns are captured
    while flag:
        try:
            # Attempting to read a column using the read_a_column function
            key, value = read_a_column(file_path, i)

            # Incrementing the column counter
            i += 1

            # Adding the column to the output dictionary
            output[key] = value

        except IndexError:
            # If an IndexError occurs, indicating the end of columns, return the output dictionary
            return output

# Example usage
csv_data = read_into_memory('task1.csv')
print(csv_data)
            

          

{'cancer': [70, 70, 68, 53, 75, 69, 70, 63, 73, 66, 70, 63, 68, 69, 72, 65, 69, 61, 68, 67, 68, 65, 65, 68, 64, 63, 68, 69, 61, 72, 63, 75, 67, 72, 63, 69, 73, 70, 69, 73, 69, 73, 65, 70, 64, 64, 63, 68, 70, 70, 70, 62, 81, 69, 72, 69, 66, 67, 70, 84, 65, 65, 63, 81, 65, 67, 66, 67, 61, 76, 66, 70, 67, 70, 73, 63, 62, 82, 75, 65, 74, 68, 81, 76, 57, 65, 62, 64, 65, 63, 69, 65, 100, 65, 62, 66, 64, 61, 64, 60, 55, 64, 57, 63, 59, 66, 68, 70, 70, 51, 58, 57, 55, 68, 67, 72, 67, 58, 61, 60, 58, 67, 68, 66, 62, 59, 66, 64, 60, 63, 65, 55, 56, 63, 59, 60, 65, 73, 65, 65, 61, 64, 65, 63, 70, 59, 67, 68, 66, 64, 64, 65, 68, 57, 68, 65, 64, 66, 72, 68, 67, 64, 67, 57, 59, 52, 63, 58, 47, 58, 61, 64, 62, 60, 65, 58, 57, 64, 49, 59, 59, 59, 67, 59, 55, 57, 57, 65, 61, 58, 59, 61, 61, 59, 64, 56, 58, 59, 64, 57, 56, 60, 61, 57, 59, 53, 59, 56, 58, 63], 'cardiovascular': [37, 34, 24, 38, 35, 38, 37, 32, 35, 36, 35, 31, 42, 47, 32, 30, 34, 30, 39, 35, 50, 40, 50, 30, 37, 50, 33, 39, 40, 39, 43, 39,

### Requirement FR3 - Develop a function to calculate the Kendall Tau Correlation Coefficient for two lists of data

In [12]:
def kendalltau(list1, list2):
    """Calculate the Kendall Tau correlation coefficient for two lists."""

    class LengthError(Exception):
        """Error for lists of different lengths."""
        pass

    if len(list1) != len(list2):
        raise LengthError("Error: The input lists must have the same length.")

    len_list = len(list1)
    concordant_count = discordant_count = 0

    # Nested loops to compare pairs of elements
    for i in range(len_list):
        for j in range(i + 1, len_list):
            if (list1[i] < list1[j]) and (list2[i] < list2[j]):
                concordant_count += 1
            elif (list1[i] > list1[j]) and (list2[i] > list2[j]):
                discordant_count += 1

    total_pairs = concordant_count + discordant_count

    # Handle division by zero =)
    if total_pairs == 0:
        tau = 0
    else:
        tau = (concordant_count - discordant_count) / total_pairs

    return round(tau, 4)

# Example usage
list1 = [1, 2, 3, 4, 5, 6, 7, 8]
list2 = [21, 22, 23, 24, 25, 26, 27, 28]
result = kendalltau(list1, list2)
print(result)

list1=[9, 5, 6, 1, 8, 7, 3]
list2=[3, 0, 7, 2, 6, 5, 4]
print(kendalltau(list1,list2))

1.0
-0.0769


### Requirement FR4 - Develop a function to generate a set of Kendall Tau Correlation Coefficients for a data structure like the one generated in FR2

In [14]:
def calculate_kendalltau_correlations(column_data):
    """Calculate Kendall Tau Correlation Coefficients for all pairs of columns."""
    
    occupied_pairs = []  # to avoid redundant correlations
    correlations = []    # list to store correlation results
    
    for key1, value1 in column_data.items():
        for key2, value2 in column_data.items():
            if (key1 != key2) and ({key1, key2} not in occupied_pairs):
                tau = kendalltau(value1, value2)
                correlations.append((key1, key2, tau))
                occupied_pairs.append({key2, key1})
    
    return correlations

# Example usage
set_of_correlations = calculate_kendalltau_correlations(read_into_memory('task1.csv'))
print(set_of_correlations)
print(len(set_of_correlations))


[('cancer', 'cardiovascular', -0.4722), ('cancer', 'stroke', -0.3218), ('cancer', 'depression', -0.4089), ('cancer', 'rehab', -0.5048), ('cancer', 'vaccine', -0.3972), ('cancer', 'diarrhea', -0.3893), ('cancer', 'obesity', -0.3918), ('cancer', 'diabetes\n', -0.3628), ('cardiovascular', 'stroke', -0.1191), ('cardiovascular', 'depression', -0.2484), ('cardiovascular', 'rehab', -0.3161), ('cardiovascular', 'vaccine', -0.2254), ('cardiovascular', 'diarrhea', -0.2261), ('cardiovascular', 'obesity', -0.1478), ('cardiovascular', 'diabetes\n', -0.1345), ('stroke', 'depression', -0.0842), ('stroke', 'rehab', -0.1971), ('stroke', 'vaccine', -0.0573), ('stroke', 'diarrhea', -0.0607), ('stroke', 'obesity', -0.0307), ('stroke', 'diabetes\n', -0.0134), ('depression', 'rehab', -0.3494), ('depression', 'vaccine', -0.1386), ('depression', 'diarrhea', -0.1491), ('depression', 'obesity', -0.1196), ('depression', 'diabetes\n', -0.1206), ('rehab', 'vaccine', -0.3203), ('rehab', 'diarrhea', -0.2734), ('reha

### Requirement FR5 - Develop a function to print a custom table for selected data from a data structure like the one generated in FR4

In [25]:
def print_kendalltau_correlation_table(correlations, border, columns):
    """Print a table of Kendall Tau Correlation Coefficients."""
    
    max_len = max(len(col) for col in columns)

    # Print header row
    header_row = f'{" ".center(max_len + 2)}' + ''.join([f'{border}{col.center(max_len)}' for col in columns]) + border
    print(header_row)

    # Print border
    print(border * ((max_len + 1) * (len(columns) + 1) + 2))

    for col1 in columns:
        corr_collect = []
        table_row = f'{border}{col1.center(max_len + 1)}{border}'  # First column of each row

        for col2 in columns:
            if col1 == col2:
                corr = '-'  # Placeholder for the same column
            else:
                for i in range(len(correlations)):
                    if (
                        (col1 == correlations[i][0] or col1 == correlations[i][1]) and
                        (col2 == correlations[i][0] or col2 == correlations[i][1]) and
                        (correlations[i][2] not in corr_collect)
                    ):
                        corr = correlations[i][2]
                        corr_collect.append(corr)

            table_row += f'{str(corr).center(max_len)}{border}'

        print(table_row+'\n')

    # Print bottom border
    print(border * ((max_len + 1) * (len(columns) + 1)))

# Example usage
print_kendalltau_correlation_table(set_of_correlations, '.', ['cancer', 'cardiovascular', 'obesity', 'vaccine', 'rehab'])

                .    cancer    .cardiovascular.   obesity    .   vaccine    .    rehab     .
............................................................................................
.     cancer    .      -       .   -0.4722    .   -0.3918    .   -0.3972    .   -0.5048    .

. cardiovascular.   -0.4722    .      -       .   -0.1478    .   -0.2254    .   -0.3161    .

.    obesity    .   -0.3918    .   -0.1478    .      -       .   -0.0837    .   -0.2796    .

.    vaccine    .   -0.3972    .   -0.2254    .   -0.0837    .      -       .   -0.3203    .

.     rehab     .   -0.5048    .   -0.3161    .   -0.2796    .   -0.3203    .      -       .

..........................................................................................
