In [48]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

The first block of code identifies the test IDs (`testid`) that appear more than once among the "Control" and "Calibrator" samples.

In [49]:
import pandas as pd

# Load the raw measurements from disk.
df = pd.read_csv('1.1seconddataset.csv')

# Keep only the quality-control rows. The sample type is matched with the
# capitalised spelling ('Control' / 'Calibrator').
qc_sample_types = ['Control', 'Calibrator']
is_qc_row = df['sampletypename'].isin(qc_sample_types)
df_filtered = df.loc[is_qc_row]

# Number of QC rows recorded under each test ID.
sample_groups = df_filtered.groupby('testid').size()

# A test ID counts as a duplicate when it labels two or more QC rows.
duplicate_sample_groups = sample_groups.loc[lambda row_counts: row_counts > 1]

# Report the repeated test IDs together with how often each one occurs.
print("Test IDs that appear more than once (with 'control' or 'calibrator' sample type):")
print(duplicate_sample_groups)





Test IDs that appear more than once (with 'control' or 'calibrator' sample type):
testid
311322      2
311323      2
311324      2
311325      2
311326      2
           ..
11994793    2
11994794    2
11994795    2
11994796    2
11994800    2
Length: 426, dtype: int64


This block compares calibrator data. I ensured that the results being compared were from the same reagent lot.

In [53]:
import pandas as pd

# Smallest corrected count a group's minimum must exceed to be compared.
# Tiny baselines (0, 1, or other low counts) make the percent difference
# infinite or meaninglessly large, so those groups are skipped.
MIN_BASELINE_COUNT = 1000

# Read the data
df = pd.read_csv('1.1seconddataset.csv')

# Check column names to ensure correct naming
print("Columns in the dataset:", df.columns)

# Keep only the 'Calibrator' rows (omit 'Control')
df = df[df['sampletypename'] == 'Calibrator']

# Rows without a corrected count can never be a group's min or max. Dropping
# them up front keeps idxmin/idxmax well defined for every remaining group.
calibrators_valid = df.dropna(subset=['correctedcount'])
counts_by_test = calibrators_valid.groupby(['testid', 'assaynumber'])['correctedcount']

# Min and max corrected count for each (testid, assaynumber) combination
test_groups = counts_by_test.agg(min_value='min', max_value='max')

# Reagent lot of the exact rows that produced the min and the max.
# Aggregating the lot column with 'first' would report the lot of the group's
# first row for both columns, regardless of which row held the min or max.
# idxmin/idxmax return one row label per group in the same (sorted) group
# order as the aggregation above, so positional assignment lines up.
lot_numbers = calibrators_valid['reagentmasterlotnumber']
test_groups['min_lot'] = lot_numbers.loc[counts_by_test.idxmin()].to_numpy()
test_groups['max_lot'] = lot_numbers.loc[counts_by_test.idxmax()].to_numpy()

# Keep only groups whose minimum is above the baseline threshold. This also
# rules out minimums of 0 and 1. `.copy()` lets the new column below be added
# to an independent frame rather than to a filtered view.
test_groups = test_groups[test_groups['min_value'] > MIN_BASELINE_COUNT].copy()

# Percent difference between max and min, relative to the min
test_groups['percent_difference'] = (
    (test_groups['max_value'] - test_groups['min_value']) / test_groups['min_value']
) * 100

# Drop groups with no spread at all (max == min). This includes groups that
# contain a single measurement. No upper bound is applied.
test_groups = test_groups[test_groups['percent_difference'] != 0]

# Largest percent difference first
sorted_test_groups = test_groups.sort_values(by='percent_difference', ascending=False)

# The header states exactly the filters that were applied above
print(
    "Test groups with the largest percent difference ('Calibrator' samples only; "
    f"excluding groups with min value <= {MIN_BASELINE_COUNT} or percent difference == 0):"
)
print(sorted_test_groups[['percent_difference', 'min_lot', 'max_lot']])


Columns in the dataset: Index(['datetimestamplocal', 'moduleserialnumber', 'assaynumber',
       'sampletypename', 'testid', 'correctedcount', 'reagentmasterlotnumber',
       'controllotnumber', 'SID'],
      dtype='object')
Test groups with the largest percent difference (excluding min values of 0 or 1, percent difference == 0, percent difference > 1000, and 'calibrator' sample types):
Empty DataFrame
Columns: [percent_difference, min_lot, max_lot]
Index: []


This block compares control data. I am a little confused, though: I was under the impression that these controls were tested one after the other, but some of the repeated ones were tested on different dates, so I don't know if we should compare them. This applies ONLY to the first data set, though; it may be different for the big one.

In [54]:
import pandas as pd

# Smallest corrected count a group's minimum must exceed to be compared.
# Tiny baselines (0, 1, or other low counts) make the percent difference
# infinite or meaninglessly large, so those groups are skipped.
MIN_BASELINE_COUNT = 1000

# Read the data
df = pd.read_csv('1.1seconddataset.csv')

# Check column names to ensure correct naming
print("Columns in the dataset:", df.columns)

# Keep only the 'Control' rows (omit 'Calibrator')
df = df[df['sampletypename'] == 'Control']

# Max and min corrected count for each (testid, assaynumber) combination
test_groups = df.groupby(['testid', 'assaynumber'])['correctedcount'].agg(['max', 'min'])

# Keep only groups whose minimum is above the baseline threshold. This also
# rules out minimums of 0 and 1. `.copy()` lets the new column below be added
# to an independent frame rather than to a filtered view.
test_groups = test_groups[test_groups['min'] > MIN_BASELINE_COUNT].copy()

# Percent difference between max and min, relative to the min
test_groups['percent_difference'] = (
    (test_groups['max'] - test_groups['min']) / test_groups['min']
) * 100

# Drop groups with no spread at all (max == min). This includes groups that
# contain a single measurement. No upper bound is applied, so very large
# differences stay visible.
test_groups = test_groups[test_groups['percent_difference'] != 0]

# Largest percent difference first
sorted_test_groups = test_groups.sort_values(by='percent_difference', ascending=False)

# The header states exactly the filters that were applied above
print(
    "Test groups with the largest percent difference ('Control' samples only; "
    f"excluding groups with min value <= {MIN_BASELINE_COUNT} or percent difference == 0):"
)
print(sorted_test_groups[['percent_difference']])


Columns in the dataset: Index(['datetimestamplocal', 'moduleserialnumber', 'assaynumber',
       'sampletypename', 'testid', 'correctedcount', 'reagentmasterlotnumber',
       'controllotnumber', 'SID'],
      dtype='object')
Test groups with the largest percent difference (excluding min values of 0 or 1, percent difference == 0, percent difference > 1000, and 'control' sample types):
                      percent_difference
testid   assaynumber                    
4973794  468                62939.916260
11994781 248                 1593.553932
5646875  468                  964.596707
11994792 248                  451.462441
4973793  468                  356.213018
5646874  468                   15.893588


In [58]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV file
df = pd.read_csv('1.1seconddataset.csv')

# Test ID to graph. It must be numeric: the test IDs in this dataset are
# integers, so comparing the column against a quoted string such as
# '4973794' matches no rows and leaves nothing valid to plot.
desired_sample_id = 4973794  # Replace with the test ID you want to graph

# Compare numerically so the lookup works whether the column was parsed as
# integers, floats, or numeric-looking strings.
matches_test_id = pd.to_numeric(df['testid'], errors='coerce') == int(desired_sample_id)
if not matches_test_id.any():
    # Fail loudly here instead of letting matplotlib choke on an empty selection.
    raise ValueError(f"No rows found for testid {desired_sample_id}")

# Build a fresh frame (no writes onto a slice of `df`): parse the timestamps
# and order the rows chronologically so the labels follow the testing order.
df_filtered = (
    df.loc[matches_test_id]
    .assign(datetimestamplocal=lambda rows: pd.to_datetime(rows['datetimestamplocal']))
    .sort_values(by='datetimestamplocal')
)

# Label each run by its chronological position. Two rows give "First Test" and
# "Second Test"; the labels are generated from the row count, so a test ID
# with any other number of runs is labelled correctly too.
ordinal_names = ['First', 'Second', 'Third', 'Fourth', 'Fifth']
df_filtered['test_type'] = [
    f'{ordinal_names[position]} Test' if position < len(ordinal_names) else f'Test {position + 1}'
    for position in range(len(df_filtered))
]

# Only the run label and its corrected count are needed for the plot
df_plot = df_filtered[['test_type', 'correctedcount']]

# Plot the data points
fig, ax = plt.subplots(figsize=(8, 5))

# One marker per run, drawn above the connecting line
ax.scatter(df_plot['test_type'], df_plot['correctedcount'], color='blue', zorder=5)

# Connect the points with a line
ax.plot(df_plot['test_type'], df_plot['correctedcount'], color='red', zorder=3)

# Labels and title
ax.set_title(f'Corrected Count for {desired_sample_id}')
ax.set_xlabel('Test Type')
ax.set_ylabel('Corrected Count')

# Show the plot
fig.tight_layout()
plt.show()




ConversionError: Failed to convert value(s) to axis units: masked_array(data=[--, --],
             mask=[ True,  True],
       fill_value=1e+20,
            dtype=float64)

<Figure size 800x500 with 1 Axes>