In [1]:
from IPython.display import display, HTML
from warnings import warn
import pandas as pd
import numpy as np
import re

display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
csv = np.genfromtxt("https://raw.githubusercontent.com/2332575Y/CSV_search/main/data.csv",delimiter=",", dtype=str)
# Player ID has duplicate values, so I add an extra column "ID" which will be a unique, numerical value
header =  np.hstack(['id', [re.sub(r'\W+', '', x.lower()) for x in csv[0]]])
data = csv[1:]
data = np.hstack([np.arange(data.shape[0]).reshape(data.shape[0],1), data])
#unloads the the csv file
del csv

In [3]:
pd.set_option('display.max_columns', None)
def print_arr(arr):
    df = pd.DataFrame(arr, columns =header)
    display(df)

print_arr(data)

Unnamed: 0,id,playerid,yearid,stint,teamid,lgid,g,ab,r,h,2b,3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,0,abercda01,1871,1,TRO,,1,4,0,0,0,0,0,0,0,0,0,0,,,,,0
1,1,addybo01,1871,1,RC1,,25,118,30,32,6,0,0,13,8,1,4,0,,,,,0
2,2,allisar01,1871,1,CL1,,29,137,28,40,4,5,0,19,3,1,2,5,,,,,1
3,3,allisdo01,1871,1,WS3,,27,133,28,44,10,2,2,27,1,1,0,2,,,,,0
4,4,ansonca01,1871,1,RC1,,25,120,29,39,11,3,0,16,6,2,2,1,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112179,112179,zimmebr01,2022,1,TOR,AL,77,76,11,8,4,0,2,3,2,1,5,33,0,5,1,0,0
112180,112180,zimmebr01,2022,2,PHI,NL,9,16,4,4,1,0,0,0,0,0,0,4,0,0,0,0,0
112181,112181,zimmebr01,2022,3,TOR,AL,23,13,3,1,0,0,0,2,1,1,0,8,0,1,0,0,0
112182,112182,zimmebr02,2022,1,BAL,AL,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
class Repository:
    def __init__(self, dataset, header, index_all=True):
        self.indices = {}
        self.data = dataset
        self.header = header
        self.col_name_to_number = {header[i] : i for i in range(len(header))}
        self.reset()
        if index_all:
            self.index_all()
    
    def reset(self):
        self.selected_rows = set()
        self.first_query = True
        
    def get_col_number(self, column):
        if isinstance(column, int) and column<self.data.shape[1]:
            return column
        elif isinstance(column, str):
            column = re.sub(r'\W+', '', column.lower())
            if column in self.col_name_to_number:
                return self.col_name_to_number[column]
        raise KeyError('Invalid column name/number : %s' %str(column))
        
    def index_all(self):
        for i in range(len(self.header)):
            self.create_index(i)
            
    def create_index(self,column):
        column_number = self.get_col_number(column)
        if column_number == 0:
            return
        if column_number in self.indices:
            warn("Index already exists for %s" % self.header[column_number])
        else:
            self.indices[column_number]={}
            view = self.data[:,[0,column_number]]
            for i in range(view.shape[0]):
                row_id = view[i,0]
                key = view[i,1]
                if key not in self.indices[column_number]:
                    self.indices[column_number][key] = {int(row_id)}
                else:
                    self.indices[column_number][key].add(int(row_id))
                    
    def get_unique_values(self, column):
        column_number = self.get_col_number(column)
        if column_number in self.indices:
            return self.indices[column_number].keys()
        else:
            raise ValueError('Column %s does not have an index!' %str(column))
        
    def query(self, column, condition):
        column_number = self.get_col_number(column)
        if column_number in self.indices:
            col_values = list(filter(condition, self.indices[column_number]))
            rows = [self.indices[column_number][value] for value in col_values]
            if self.first_query:
                self.first_query = False
                self.selected_rows = self.selected_rows.union(*rows)
            else:
                self.selected_rows = self.selected_rows.intersection(*rows)
        else:
            raise ValueError('Column %s does not have an index! consider using a filter instead.' %str(column))
        return self
    
    def collect(self):
        result = self.data[list(self.selected_rows)]
        self.reset()
        return result

repo = Repository(data,header)

In [5]:
print_arr(repo.query("yearid",lambda x: int(x) == 1879).query("teamid",lambda x: x.lower() == "sr1").collect())

Unnamed: 0,id,playerid,yearid,stint,teamid,lgid,g,ab,r,h,2b,3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,1059,creamge01,1879,1,SR1,NL,15,60,3,13,2,0,0,3,,,1,2,,,,,
1,1062,deckefr01,1879,1,SR1,NL,3,10,0,1,0,0,0,0,,,0,3,,,,,
2,1096,holbebi01,1879,1,SR1,NL,59,229,11,46,0,0,0,21,,,1,20,,,,,
3,1065,dorgami01,1879,1,SR1,NL,59,270,38,72,11,5,1,17,,,4,13,,,,,
4,1130,osterch01,1879,1,SR1,NL,2,8,0,0,0,0,0,0,,,0,0,,,,,
5,1163,woodhre01,1879,1,SR1,NL,34,131,4,21,1,0,0,2,,,0,23,,,,,
6,1133,purcebl01,1879,1,SR1,NL,63,277,32,72,6,3,0,25,,,3,13,,,,,
7,1039,adamsge01,1879,1,SR1,NL,4,13,0,3,0,0,0,0,,,1,1,,,,,
8,1040,allenja01,1879,1,SR1,NL,11,48,7,9,2,1,0,3,,,1,5,,,,,
9,1071,farreja02,1879,1,SR1,NL,54,241,40,73,6,2,1,21,,,3,13,,,,,
