# Ciallo STM Group!

Required libraries:
- pandas
- numpy

In [1]:
import pandas as pd

def analyze_csv_structure(file):
    """Print a structural summary of a CSV file.

    Reports, in order: the row count, each column's name and dtype, the
    number of missing values for every column that has at least one (or a
    single "no missing values" line), and the first five rows.

    Args:
        file: Anything ``pd.read_csv`` accepts (path string, file object, ...).

    Returns:
        None. Everything is written to standard output.
    """
    table = pd.read_csv(file)

    # Table size
    print(f"Total Rows in the Table: {len(table)}\n")

    # Schema: one line per column
    print("Column Names and Data Types:")
    for name, kind in zip(table.columns, table.dtypes):
        print(f"- {name}: {kind}")

    # Missing data: keep only the columns that actually contain gaps
    null_counts = table.isna().sum()
    incomplete = null_counts[null_counts > 0]
    if incomplete.empty:
        print("\nNo missing values found in the table.")
    else:
        print("\nMissing Values in Each Column:")
        for name, count in incomplete.items():
            print(f"- {name}: {count} missing values")

    # Head of the table
    print("\nTable Preview (First 5 Rows):")
    print(table.head())

def analyze_all_columns(file):
    """Print a per-column profile of a CSV file.

    For every column this prints the number of distinct non-missing values
    followed by the values themselves. Columns with a float dtype also get
    their minimum, maximum and range (max - min). Every other dtype --
    integer columns included -- is only reported as "not of float type".

    Args:
        file: Anything ``pd.read_csv`` accepts (path string, file object, ...).

    Returns:
        None. Everything is written to standard output.
    """
    table = pd.read_csv(file)

    for column_name, series in table.items():
        print(f"\nAnalyzing column: '{column_name}'")

        # Distinct values, ignoring NaN / missing entries
        distinct = set(series.dropna())
        print(f"Number of unique values: {len(distinct)}")
        print(f"Unique values: {distinct}")

        # Only float columns get a range report; bail out early otherwise
        if not pd.api.types.is_float_dtype(series):
            print(f"Column '{column_name}' is not of float type.")
            continue

        lowest = series.min()
        highest = series.max()

        print(f"\nColumn '{column_name}' is of float type.")
        print(f"Minimum value: {lowest}")
        print(f"Maximum value: {highest}")
        print(f"Value range: {highest - lowest}")


In [None]:
# Structural summary of the opamps-features CSV.
# The path is relative, so it resolves against the notebook's working directory.
file1 = "../encoded data/opamps-features.csv"
analyze_csv_structure(file1)

In [None]:
# Per-column profile of the features table; needs `file1` from the previous cell.
analyze_all_columns(file1)

In [None]:
# Structural summary of the opamps-xref CSV.
# The path is relative, so it resolves against the notebook's working directory.
file2 = "../encoded data/opamps-xref.csv"
analyze_csv_structure(file2)

In [None]:
# Per-column profile of the xref table; needs `file2` from the previous cell.
analyze_all_columns(file2)

In [None]:
def find_rows_with_same_first_column(file):
    """Print every row whose first-column value occurs more than once.

    All members of each duplicate group are shown, not only the repeats.
    Only the first column is compared: the function does not check whether
    the remaining columns (e.g. a name column) differ between those rows,
    so that has to be judged from the printed output.

    Args:
        file: Anything ``pd.read_csv`` accepts (path string, file object, ...).

    Returns:
        None. Everything is written to standard output.
    """
    table = pd.read_csv(file)

    key_column = table.columns[0]
    # keep=False flags every member of a duplicate group, the first one included
    repeated_mask = table.duplicated(subset=key_column, keep=False)
    repeated_rows = table.loc[repeated_mask]

    if not repeated_rows.empty:
        print("Rows with duplicate values in the first column:")
        print(repeated_rows)
        return

    print("No rows found with duplicate values in the first column.")

In [None]:
# Look for repeated first-column values in the features table (`file1` is defined in an earlier cell).
find_rows_with_same_first_column(file1)

In [None]:
def find_matching_rows(file):
    """Print the rows whose first and third column hold the same value.

    The comparison is positional (column 0 against column 2) and done row by
    row. Tables with fewer than three columns are rejected with a message.

    Args:
        file: Anything ``pd.read_csv`` accepts (path string, file object, ...).

    Returns:
        None. Everything is written to standard output.
    """
    table = pd.read_csv(file)

    # A third column is required for the positional comparison below
    if len(table.columns) < 3:
        print("The table must have at least three columns.")
        return

    first_column = table.iloc[:, 0]
    third_column = table.iloc[:, 2]
    equal_rows = table.loc[first_column == third_column]

    # Report
    if not equal_rows.empty:
        print("Rows where the first and third column values match:")
        print(equal_rows)
    else:
        print("No rows found where the values in the first and third columns match.")

In [None]:
# Check the xref table for rows whose first and third columns are equal (`file2` is defined in an earlier cell).
find_matching_rows(file2)

In [2]:
import numpy as np

def analyze_npy_structure(file, allow_pickle=True):
    """Print the type, shape, dtype and a short preview of a NumPy file.

    Args:
        file: Path or file object accepted by ``np.load``.
        allow_pickle: Forwarded to ``np.load``. Defaults to True to keep the
            original behaviour; pass False for files from untrusted sources.

    Returns:
        None. Everything is written to standard output.

    Notes:
        The preview depends on what ``np.load`` returns:
        * an ndarray with at least one dimension -> its first 5 entries
          along axis 0;
        * a list or dict (possible when the file is a raw pickle) -> its
          first 5 items (keys, for a dict);
        * anything else, including 0-dimensional arrays -> the object itself.
    """
    # SECURITY: allow_pickle=True lets np.load unpickle arbitrary objects,
    # which can execute code embedded in a malicious file. Only keep the
    # default for files you produced yourself or otherwise trust.
    data = np.load(file, allow_pickle=allow_pickle)

    # Structure analysis
    print("Data type:", type(data))
    print("Data shape:", data.shape if hasattr(data, 'shape') else "No shape attribute")
    print("Data dtype:", data.dtype if hasattr(data, 'dtype') else "No dtype attribute")

    # Preview of data
    # A 0-d array (a saved scalar, or an object wrapped by np.save) cannot be
    # sliced -- data[:5] raises IndexError -- so it goes to the generic branch.
    if isinstance(data, np.ndarray) and data.ndim > 0:
        print("\nPreview of data (first 5 elements):")
        print(data[:5])
    elif isinstance(data, (list, dict)):
        print("\nPreview of data (first few items):")
        for i, item in enumerate(data):
            if i >= 5:
                break
            print(item)
    else:
        print("\nData preview:")
        print(data)

In [3]:
# Inspect the example scores file (relative path, resolved against the notebook's working directory).
analyze_npy_structure('../results/opamps-scores-example.npy')

Data type: <class 'numpy.ndarray'>
Data shape: (18352, 18352)
Data dtype: float64

Preview of data (first 5 elements):
[[0.9947077  0.9947077  0.4382915  ... 0.5984667  0.5984667  0.5984667 ]
 [0.9947077  0.9947077  0.39232178 ... 0.65109137 0.65109137 0.65109137]
 [0.4382915  0.39232178 0.9947077  ... 0.56951336 0.56951336 0.56951336]
 [0.4131604  0.41831909 0.40969788 ... 0.45082133 0.45082133 0.45082133]
 [0.60970538 0.40590678 0.38641846 ... 0.47005301 0.47005301 0.47005301]]
