# In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Load the raw betting records.
df = pd.read_csv('bets.csv')

# Drop rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# NOTE: missing 'sport_category' values are deliberately NOT imputed here.
# Fitting an imputer on the full frame before the train/test split would let
# test rows influence the fill value (data leakage). 'sport_category' is one
# of the categorical features, and the categorical pipeline in this script
# already applies a 'most_frequent' SimpleImputer that is fitted on the
# training split only.

# Column groups routed to each preprocessing branch. Columns that appear in
# neither list are dropped by ColumnTransformer (default remainder='drop').
categorical_features = ['bet_type', 'sport', 'sport_category', 'estimated_risk']
# NOTE(review): 'gain', 'GGR', 'net_gain' and 'profitability' look like values
# that are only known once a bet has settled; if so they may leak the 'is_win'
# target -- confirm against the data dictionary before modelling.
numerical_features = ['odds', 'stake', 'gain', 'GGR', 'net_gain', 'profitability']

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# sparse_threshold=0 forces a dense ndarray result. With the default (0.3) the
# stacked output becomes a SciPy sparse matrix as soon as the one-hot columns
# make it sparse enough, and this script hands the result directly to
# pd.DataFrame(...), which needs a dense 2-D array to build numeric columns.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Separate the 'is_win' target from the predictor columns.
y = df['is_win']
X = df.drop(columns=['is_win'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the preprocessing parameters from the training rows only, then apply
# the already-fitted transformer to the held-out rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Write the processed feature matrices, then the targets, to CSV
# (no index column in any of the files).
for matrix, csv_path in (
    (X_train_processed, 'X_train_preprocessed.csv'),
    (X_test_processed, 'X_test_preprocessed.csv'),
):
    pd.DataFrame(matrix).to_csv(csv_path, index=False)

for target, csv_path in ((y_train, 'y_train.csv'), (y_test, 'y_test.csv')):
    target.to_csv(csv_path, index=False)
