Database Setup (SQLAlchemy and Pandas)

In [4]:
import pandas as pd
import sqlalchemy as sa
from sqlalchemy import create_engine, Column, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class TrainingData(Base):
    """ORM mapping for the ``training_data`` table.

    Each row is keyed by ``x`` and carries four float series, ``y1`` to ``y4``.
    """
    __tablename__ = 'training_data'

    # Independent variable; also serves as the primary key.
    x = Column(Float, primary_key=True)
    # The four training series, created in y1..y4 order.
    y1, y2, y3, y4 = (Column(Float) for _ in range(4))

class IdealFunctions(Base):
    """ORM mapping for the ``ideal_functions`` table.

    Each row is keyed by ``x`` and carries fifty float columns, ``f1`` to
    ``f50``.  Presumably each column holds one candidate ideal function
    sampled at ``x`` -- confirm against the CSV that feeds this table.
    """
    __tablename__ = 'ideal_functions'
    # Independent variable; also serves as the primary key.
    x = Column(Float, primary_key=True)
    # Candidate function values f1..f50, one float column each.
    f1 = Column(Float)
    f2 = Column(Float)
    f3 = Column(Float)
    f4 = Column(Float)
    f5 = Column(Float)
    f6 = Column(Float)
    f7 = Column(Float)
    f8 = Column(Float)
    f9 = Column(Float)
    f10 = Column(Float)
    f11 = Column(Float)
    f12 = Column(Float)
    f13 = Column(Float)
    f14 = Column(Float)
    f15 = Column(Float)
    f16 = Column(Float)
    f17 = Column(Float)
    f18 = Column(Float)
    f19 = Column(Float)
    f20 = Column(Float)
    f21 = Column(Float)
    f22 = Column(Float)
    f23 = Column(Float)
    f24 = Column(Float)
    f25 = Column(Float)
    f26 = Column(Float)
    f27 = Column(Float)
    f28 = Column(Float)
    f29 = Column(Float)
    f30 = Column(Float)
    f31 = Column(Float)
    f32 = Column(Float)
    f33 = Column(Float)
    f34 = Column(Float)
    f35 = Column(Float)
    f36 = Column(Float)
    f37 = Column(Float)
    f38 = Column(Float)
    f39 = Column(Float)
    f40 = Column(Float)
    f41 = Column(Float)
    f42 = Column(Float)
    f43 = Column(Float)
    f44 = Column(Float)
    f45 = Column(Float)
    f46 = Column(Float)
    f47 = Column(Float)
    f48 = Column(Float)
    f49 = Column(Float)
    f50 = Column(Float)
    

class TestResults(Base):
    """ORM mapping for the ``test_results`` table.

    Columns:
        x: x value of the test point (primary key).
        y: y value of the test point.
        chosen_function: name of the ideal-function column the point was
            matched to (for example ``'f12'``).
        deviation: absolute difference between ``y`` and the chosen function
            at ``x``.
    """
    __tablename__ = 'test_results'
    x = Column(Float, primary_key=True)
    y = Column(Float)
    # The matcher reports the chosen function as a column name, i.e. text, so
    # a Float column cannot represent it.
    chosen_function = Column(sa.String)
    deviation = Column(Float)

class DatabaseHandler:
    """Owns the database engine and moves data between CSV files and tables.

    Attributes:
        engine: SQLAlchemy engine created from ``db_url``.
        Session: session factory bound to ``engine``.
    """

    def __init__(self, db_url):
        """Create the engine, create all mapped tables and build a session factory.

        Args:
            db_url: SQLAlchemy database URL, e.g. ``'sqlite:///data.db'``.
        """
        self.engine = create_engine(db_url)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

    def load_data_to_db(self, csv_file, table_class):
        """Read ``csv_file`` and store its rows in the table mapped by ``table_class``.

        Args:
            csv_file: path of the CSV file to read.
            table_class: mapped class whose ``__tablename__`` names the target table.
        """
        df = pd.read_csv(csv_file)
        # 'replace' drops and recreates the table on every call, so re-running
        # a load never appends duplicate rows.
        df.to_sql(table_class.__tablename__, self.engine, if_exists='replace', index=False)

    def _read_table(self, table_name):
        """Return the full contents of ``table_name`` as a DataFrame."""
        # pandas reads through the engine directly; no ORM session is needed.
        return pd.read_sql_table(table_name, self.engine)

    def fetch_training_data(self):
        """Return every row of the ``training_data`` table as a DataFrame."""
        return self._read_table('training_data')

    def fetch_ideal_functions(self):
        """Return every row of the ``ideal_functions`` table as a DataFrame."""
        return self._read_table('ideal_functions')


Function Selection and Least-Squares Calculation

In [5]:
import numpy as np

class FunctionSelector:
    """Picks, for each training series, the ideal function with the smallest squared error.

    Attributes:
        training_data: DataFrame holding the training series columns.
        ideal_functions: DataFrame holding an ``x`` column plus one column per
            candidate function.
    """

    # Training series evaluated when the caller does not name any.
    DEFAULT_Y_COLUMNS = ('y1', 'y2', 'y3', 'y4')
    # Column holding the independent variable; it is never a candidate function.
    X_COLUMN = 'x'

    def __init__(self, training_data, ideal_functions):
        self.training_data = training_data
        self.ideal_functions = ideal_functions

    def least_square_error(self, y_train, y_ideal):
        """Return the sum of squared differences between ``y_train`` and ``y_ideal``."""
        return np.sum((y_train - y_ideal) ** 2)

    def select_best_functions(self, y_columns=None):
        """Map each training column to the ideal-function column with the least error.

        Args:
            y_columns: iterable of training column names to evaluate.  Defaults
                to ``('y1', 'y2', 'y3', 'y4')``.

        Returns:
            dict mapping each training column name to the name of the best
            ideal-function column, or ``None`` when no candidate produced a
            finite error.
        """
        if y_columns is None:
            y_columns = self.DEFAULT_Y_COLUMNS

        # Exclude the x column by name rather than by position, so the result
        # does not depend on where x sits in the frame.
        candidates = [
            col for col in self.ideal_functions.columns if col != self.X_COLUMN
        ]

        best_functions = {}
        for y_col in y_columns:
            # NOTE(review): pandas aligns Series on index labels, so both
            # frames are assumed to share the same row index -- confirm.
            y_train = self.training_data[y_col]
            min_error = float('inf')
            best_func = None
            for f_col in candidates:
                error = self.least_square_error(y_train, self.ideal_functions[f_col])
                # Strict comparison: on a tie the earliest candidate wins.
                if error < min_error:
                    min_error = error
                    best_func = f_col
            best_functions[y_col] = best_func
        return best_functions


Test Data Matching and Deviation Handling

In [6]:
class TestMatcher:
    """Assigns test points to the previously chosen ideal functions.

    Attributes:
        test_data: DataFrame with ``x`` and ``y`` columns.
        best_functions: dict mapping a training column name to the chosen
            ideal-function column name.
        training_data: DataFrame holding the training series.
        ideal_functions: DataFrame with an ``x`` column and the ideal-function
            columns.
    """

    def __init__(self, test_data, best_functions, training_data, ideal_functions):
        self.test_data = test_data
        self.best_functions = best_functions
        self.training_data = training_data
        self.ideal_functions = ideal_functions

    def match_test_data(self):
        """Match every test point against the chosen ideal functions.

        A point matches a function when its absolute deviation from the
        function at the same ``x`` is at most ``sqrt(2)`` times the largest
        absolute deviation between that function and its training series.
        Functions are tried in ``best_functions`` order and the first match
        wins.  Points whose ``x`` is missing from ``ideal_functions``, or that
        match no function, are left out.

        Returns:
            list of ``(x, y, function_column, deviation)`` tuples.
        """
        # The acceptance threshold depends only on the training series and its
        # chosen function, so compute it once per pair rather than per test row.
        # Keyed by training column because two series may share one function.
        thresholds = {
            y_col: np.max(np.abs(self.training_data[y_col] - self.ideal_functions[func_col])) * np.sqrt(2)
            for y_col, func_col in self.best_functions.items()
        }

        results = []
        for _, test_row in self.test_data.iterrows():
            x_test = test_row['x']
            y_test = test_row['y']

            # The ideal row depends only on x; look it up once per test point.
            ideal_row = self.ideal_functions[self.ideal_functions['x'] == x_test]
            if ideal_row.empty:
                continue

            for y_col, func_col in self.best_functions.items():
                ideal_y = ideal_row[func_col].values[0]
                deviation = abs(y_test - ideal_y)

                if deviation <= thresholds[y_col]:
                    results.append((x_test, y_test, func_col, deviation))
                    break
        return results


Visualization using Bokeh

In [8]:
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource

class Visualizer:
    """Renders training data, ideal functions and matched test points with Bokeh.

    Attributes:
        training_data: DataFrame with ``x`` and ``y1``..``y4`` columns.
        ideal_functions: DataFrame with an ``x`` column followed by the
            ideal-function columns.
        test_results: sequence of ``(x, y, function_column, deviation)`` tuples.
    """

    def __init__(self, training_data, ideal_functions, test_results):
        self.training_data = training_data
        self.ideal_functions = ideal_functions
        self.test_results = test_results

    def visualize(self):
        """Write the plot to ``visualization.html`` and show it."""
        output_file("visualization.html")

        p = figure(title="Training Data and Ideal Functions", x_axis_label='x', y_axis_label='y')

        # Training series as solid lines.
        for y_col in ['y1', 'y2', 'y3', 'y4']:
            p.line(self.training_data['x'], self.training_data[y_col], legend_label=y_col, line_width=2)

        # Ideal functions as dashed lines; only the four columns after x are shown.
        for f_col in self.ideal_functions.columns[1:5]:
            p.line(self.ideal_functions['x'], self.ideal_functions[f_col], legend_label=f_col, line_dash="dashed")

        # Matched test points; each result tuple is (x, y, function, deviation).
        test_source = ColumnDataSource(data=dict(
            x=[r[0] for r in self.test_results],
            y=[r[1] for r in self.test_results],
            deviation=[r[3] for r in self.test_results]
        ))
        # scatter() draws circle markers by default; circle() with a screen-unit
        # size is deprecated in recent Bokeh releases.
        p.scatter('x', 'y', size=10, source=test_source, color="red", legend_label="Test Data")

        show(p)


Exception Handling

In [13]:
class DataLoadError(Exception):
    """Signals that a data set could not be loaded."""

class DatabaseHandler:
    """Owns the database engine and moves data between CSV files and tables.

    This redefinition replaces the handler from the database-setup cell and
    reports load failures as ``DataLoadError``.

    Attributes:
        engine: SQLAlchemy engine created from ``db_url``.
        Session: session factory bound to ``engine``.
    """

    def __init__(self, db_url):
        """Create the engine, create all mapped tables and build a session factory.

        Args:
            db_url: SQLAlchemy database URL, e.g. ``'sqlite:///data.db'``.
        """
        self.engine = create_engine(db_url)
        Base.metadata.create_all(self.engine)
        self.Session = sessionmaker(bind=self.engine)

    def load_data_to_db(self, csv_file, table_class):
        """Read ``csv_file`` and store its rows in the table mapped by ``table_class``.

        Args:
            csv_file: path of the CSV file to read.
            table_class: mapped class whose ``__tablename__`` names the target table.

        Raises:
            DataLoadError: if the file cannot be read or parsed, or the rows
                cannot be written to the database.  The original exception is
                chained as ``__cause__``.
        """
        try:
            df = pd.read_csv(csv_file)
            # 'replace' drops and recreates the table on every call, so
            # re-running a load never appends duplicate rows.
            df.to_sql(table_class.__tablename__, self.engine, if_exists='replace', index=False)
        except (OSError, ValueError, sa.exc.SQLAlchemyError) as exc:
            # OSError covers missing or unreadable files; ValueError covers
            # pandas parsing errors; SQLAlchemyError covers database failures.
            raise DataLoadError(
                f"Could not load '{csv_file}' into table "
                f"'{table_class.__tablename__}': {exc}"
            ) from exc

    def _read_table(self, table_name):
        """Return the full contents of ``table_name`` as a DataFrame."""
        # pandas reads through the engine directly; no ORM session is needed.
        return pd.read_sql_table(table_name, self.engine)

    def fetch_training_data(self):
        """Return every row of the ``training_data`` table as a DataFrame."""
        return self._read_table('training_data')

    def fetch_ideal_functions(self):
        """Return every row of the ``ideal_functions`` table as a DataFrame."""
        return self._read_table('ideal_functions')



Main Execution

In [15]:
# End-to-end pipeline: load the CSV files, pick the best ideal function for
# each training series, match the test points and plot everything.
if __name__ == "__main__":
    # SQLite database file, resolved relative to the current working directory.
    db_url = 'sqlite:///data.db'
    # Built outside the try block, so a failure here is not caught below.
    db_handler = DatabaseHandler(db_url)

    try:
        # Load the training and ideal-function CSV files into their tables;
        # the test data stays in memory only and is not written to the database.
        db_handler.load_data_to_db('train.csv', TrainingData)
        db_handler.load_data_to_db('ideal.csv', IdealFunctions)
        test_data = pd.read_csv('test.csv')

        # Read both tables back from the database as DataFrames.
        training_data = db_handler.fetch_training_data()
        ideal_functions = db_handler.fetch_ideal_functions()

        # For each training series, pick the ideal function with the smallest
        # sum of squared deviations.
        selector = FunctionSelector(training_data, ideal_functions)
        best_functions = selector.select_best_functions()

        # Assign each test point to one of the chosen functions where possible.
        matcher = TestMatcher(test_data, best_functions, training_data, ideal_functions)
        test_results = matcher.match_test_data()

        # Plot training data, ideal functions and the matched test points.
        visualizer = Visualizer(training_data, ideal_functions, test_results)
        visualizer.visualize()

    except Exception as e:
        # Any failure in the pipeline is reduced to a one-line message; the
        # traceback is not shown.
        print(f"An error occurred: {e}")


Unit Testing

In [None]:
import unittest
import numpy as np

class TestFunctionSelector(unittest.TestCase):
    """Unit tests for ``FunctionSelector``."""

    def test_least_square_error(self):
        """The error is the sum of squared differences: 0 + 0 + 0.5**2 = 0.25."""
        # least_square_error does not read the frames, so None is enough here.
        selector = FunctionSelector(None, None)
        y_train = np.array([1, 2, 3])
        y_ideal = np.array([1, 2, 2.5])
        error = selector.least_square_error(y_train, y_ideal)
        # Compare floating-point results with a tolerance, not exact equality.
        self.assertAlmostEqual(error, 0.25)

if __name__ == '__main__':
    # In a notebook kernel sys.argv holds the kernel's own arguments, and
    # unittest.main() finishes by calling sys.exit().  Pass a dummy argv and
    # exit=False so the tests run inside the cell and the kernel keeps running.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)
