# Part 1

In [1]:
import pandas as pd
import numpy as np
from rich import print

# Read column 0 as text rather than as a number so leading zeros survive
data = pd.read_csv(
    'diagnostic_report.txt',
    header=None,
    converters={0: str},
)

In [2]:
# Column 0 holds every reading as a string (dtype object), so leading zeros are intact
data[0]

0      000000011010
1      011001111011
2      100101011101
3      000110000110
4      101010001010
           ...     
995    100001001110
996    111101100111
997    111011111100
998    000011011001
999    000000100001
Name: 0, Length: 1000, dtype: object

In [3]:
# Summary of the single text column: row count, number of distinct readings, most frequent one
data.describe()

Unnamed: 0,0
count,1000
unique,1000
top,11010
freq,1


In [4]:
""" Split into a 12-column dataframe to make this easier. """

# Explode each reading into its characters: one row per reading, one column per position
df = pd.DataFrame([list(reading) for reading in data[0]])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,0,0,0,0,0,0,1,1,0,1,0
1,0,1,1,0,0,1,1,1,1,0,1,1
2,1,0,0,1,0,1,0,1,1,1,0,1
3,0,0,0,1,1,0,0,0,0,1,1,0
4,1,0,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,0,0,0,0,1,0,0,1,1,1,0
996,1,1,1,1,0,1,1,0,0,1,1,1
997,1,1,1,0,1,1,1,1,1,1,0,0
998,0,0,0,0,1,1,0,1,1,0,0,1


In [5]:
""" How many 1s versus 0s in the first column? """

# Tally each distinct character in column 0; the index labels are the strings '0' and '1', not ints
df[0].value_counts()

1    519
0    481
Name: 0, dtype: int64

In [6]:
""" We can get the most common value by using .idxmax() """

# idxmax() returns the index label of the largest tally, i.e. the most frequent character (a string)
df[0].value_counts().idxmax()

'1'

In [7]:
""" So, we can apply this concept against the entire dataframe. What is the most common value for each column? """

# Column by column, keep the label that carries the largest tally
gamma_df = df.apply(lambda column: column.value_counts().idxmax(), axis=0)
print(gamma_df.values)

In [8]:
""" What is the least common value for each column? """

# Column by column, keep the label that carries the smallest tally
epsilon_df = df.apply(lambda column: column.value_counts().idxmin(), axis=0)
print(epsilon_df.values)

In [9]:
""" Convert the binary number into a decimal number. """

# Each Series holds one '0'/'1' character per column: glue them together and parse as base 2
gamma, epsilon = (int(''.join(bits), 2) for bits in (gamma_df, epsilon_df))

print(gamma, epsilon)

In [10]:
""" The answer is the product of our gamma rate and epsilon rate. """

# np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0; np.prod is the supported spelling
print(np.prod([gamma, epsilon]))

# Part 2

## First, test our logic using their example

In [11]:
# Worked example from the puzzle text, one reading per string
example_data = pd.Series([
    '00100', '11110', '10110', '10111', '10101', '01111',
    '00111', '11100', '10000', '11001', '00010', '01010',
])
# One row per reading, one column per character position
example_df = pd.DataFrame([list(reading) for reading in example_data])
example_df

Unnamed: 0,0,1,2,3,4
0,0,0,1,0,0
1,1,1,1,1,0
2,1,0,1,1,0
3,1,0,1,1,1
4,1,0,1,0,1
5,0,1,1,1,1
6,0,0,1,1,1
7,1,1,1,0,0
8,1,0,0,0,0
9,1,1,0,0,1


In [12]:
def filter_oxygen(dataframe, logging=True):
    """Narrow ``dataframe`` down using the "oxygen rating" rule.

    Columns are visited in order by their integer labels ``0 .. n-1``. At each
    column only the rows holding the most common value are kept; if both values
    are equally common, the rows holding ``'1'`` are kept. A column in which
    every remaining row already agrees filters nothing. The scan stops as soon
    as at most one row is left.

    Args:
        dataframe: Frame with one character per cell and columns labelled
            ``0 .. n-1``. It is copied, never modified.
        logging: When true, print the tallies and the chosen value per column.

    Returns:
        A new DataFrame with the surviving row(s), keeping their original
        index labels.

    Raises:
        ValueError: If a column holds more than two distinct values.
    """
    remaining = dataframe.copy()
    for i in range(len(remaining.columns)):
        # stop once a single row (or nothing) is left
        if len(remaining) <= 1:
            break

        # (value, count) pairs, most frequent first
        tallies = list(remaining[i].value_counts(sort=True, ascending=False).items())
        if len(tallies) > 2:
            raise ValueError(
                f'column {i} holds {len(tallies)} distinct values; expected at most 2'
            )

        if len(tallies) == 1:
            # every remaining row agrees on this position, so nothing is discarded
            high, low = tallies[0], None
            value = high[0]
        else:
            high, low = tallies
            # tallies are sorted descending, so the only alternative to
            # "high wins" is an exact tie, which the rule resolves to '1'
            value = '1' if high[1] == low[1] else high[0]

        if logging:
            print(f'index: {i} high: {high} low: {low} value: {value}')

        # keep matching rows via a boolean mask: no in-place mutation, and
        # correct even when index labels are not unique
        remaining = remaining[remaining[i] == value]
    return remaining


def filter_co2(dataframe, logging=True):
    """Narrow ``dataframe`` down using the "CO2 rating" rule.

    Columns are visited in order by their integer labels ``0 .. n-1``. At each
    column only the rows holding the least common value are kept; if both
    values are equally common, the rows holding ``'0'`` are kept. A column in
    which every remaining row already agrees filters nothing. The scan stops
    as soon as at most one row is left.

    Args:
        dataframe: Frame with one character per cell and columns labelled
            ``0 .. n-1``. It is copied, never modified.
        logging: When true, print the tallies and the chosen value per column.

    Returns:
        A new DataFrame with the surviving row(s), keeping their original
        index labels.

    Raises:
        ValueError: If a column holds more than two distinct values.
    """
    remaining = dataframe.copy()
    for i in range(len(remaining.columns)):
        # stop once a single row (or nothing) is left
        if len(remaining) <= 1:
            break

        # (value, count) pairs, most frequent first
        tallies = list(remaining[i].value_counts(sort=True, ascending=False).items())
        if len(tallies) > 2:
            raise ValueError(
                f'column {i} holds {len(tallies)} distinct values; expected at most 2'
            )

        if len(tallies) == 1:
            # every remaining row agrees on this position, so nothing is discarded
            high, low = tallies[0], None
            value = high[0]
        else:
            high, low = tallies
            # tallies are sorted descending, so the only alternative to
            # "low loses" is an exact tie, which the rule resolves to '0'
            value = '0' if high[1] == low[1] else low[0]

        if logging:
            print(f'index: {i} high: {high} low: {low} value: {value}')

        # keep matching rows via a boolean mask: no in-place mutation, and
        # correct even when index labels are not unique
        remaining = remaining[remaining[i] == value]
    return remaining

In [13]:
# Run the oxygen filter on the worked example; logging stays at its default (on) so each column's decision is printed
oxy_df = filter_oxygen(example_df)

In [14]:
# Run the CO2 filter on the worked example; logging stays at its default (on) so each column's decision is printed
co2_df = filter_co2(example_df)

In [15]:
# Show what each filter left behind as a raw 2-D array of characters
print(oxy_df.values)
print(co2_df.values)

In [16]:
""" Convert the binary number into a decimal number. """

# Take the first surviving row of each frame, glue its characters together and parse as base 2
OXYGEN, CO2 = (int(''.join(frame.iloc[0]), 2) for frame in (oxy_df, co2_df))

print(OXYGEN, CO2)

In [17]:
""" The answer is the product of our oxygen rate and CO2 rate. """

# np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0; np.prod is the supported spelling
print(np.prod([OXYGEN, CO2]))

## Now continue with the challenge using the real input data

In [18]:
""" Start with our expanded dataframe. """

# Re-display the per-character frame built from the real input as the starting point for Part 2
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0,0,0,0,0,0,0,1,1,0,1,0
1,0,1,1,0,0,1,1,1,1,0,1,1
2,1,0,0,1,0,1,0,1,1,1,0,1
3,0,0,0,1,1,0,0,0,0,1,1,0
4,1,0,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,0,0,0,0,1,0,0,1,1,1,0
996,1,1,1,1,0,1,1,0,0,1,1,1
997,1,1,1,0,1,1,1,1,1,1,0,0
998,0,0,0,0,1,1,0,1,1,0,0,1


In [19]:
""" Apply the function that we defined and tested. """

# Same two filters as in the example run, this time on the real input and without per-column logging
oxy_df, co2_df = filter_oxygen(df, logging=False), filter_co2(df, logging=False)

print(oxy_df.values)
print(co2_df.values)

In [20]:
""" Convert the binary number into a decimal number. """

# Take the first surviving row of each frame, glue its characters together and parse as base 2
OXYGEN, CO2 = (int(''.join(frame.iloc[0]), 2) for frame in (oxy_df, co2_df))

print(OXYGEN, CO2)

In [21]:
""" The answer is the product of our oxygen rate and CO2 rate. """

# np.product was deprecated in NumPy 1.25 and removed in NumPy 2.0; np.prod is the supported spelling
print(np.prod([OXYGEN, CO2]))