In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Machine Learning Frameworks
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Gradient Boosting Libraries (optional)
import xgboost as xgb
import lightgbm as lgb

In [3]:
df = pd.read_csv('combined_data.csv')
df.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,dist,weightSt,weightLb,RPR,TR,OR,father,mother,gfather,runners,margin,weight,res_win,res_place,Year_horse,course,time,title,distance,condition,winningTime,metric,countryCode,class,Year_race,total_prize,min_age,max_age,prize_1,prize_2,prize_3,prize_4,prize_5,prize_6,prize_7,prize_8,prize_9,prize_10,prize_11,prize_12,prize_13,prize_14,prize_15,prize_16,prize_17,prize_18
0,271018,67106,22,0,484,False,14854,4059,1,1,18.712869,7,0,111.0,94.0,79.234051,31,113321,9419,12,1.521003,69,True,True,1990-01-01,115,1900-01-01 03:15:00,58423,3.125,14,398.3,5028.0,12,0,1990-01-01,4409.0,6.0,8.0,2922.5,875.0,420.0,192.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,271018,280298,22,0,377,False,2658,16080,2,12,18.712869,7,0,101.0,88.0,79.234051,6777,2947,5868,12,1.521003,69,False,True,1990-01-01,115,1900-01-01 03:15:00,58423,3.125,14,398.3,5028.0,12,0,1990-01-01,4409.0,6.0,8.0,2922.5,875.0,420.0,192.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,271018,164632,23,0,218,False,6074,16041,3,82,25.0,6,7,86.0,71.0,79.234051,9167,67504,550,12,1.521003,66,False,True,1990-01-01,115,1900-01-01 03:15:00,58423,3.125,14,398.3,5028.0,12,0,1990-01-01,4409.0,6.0,8.0,2922.5,875.0,420.0,192.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,271018,200213,24,0,377,False,13635,6371,4,117,45.0,7,0,66.0,65.0,79.234051,5828,63323,2475,12,1.521003,69,False,False,1990-01-01,115,1900-01-01 03:15:00,58423,3.125,14,398.3,5028.0,12,0,1990-01-01,4409.0,6.0,8.0,2922.5,875.0,420.0,192.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,271018,347084,24,0,277,False,16945,15076,5,255,75.0,7,0,72.310392,45.0,79.234051,3184,129340,5464,12,1.521003,69,False,False,1990-01-01,115,1900-01-01 03:15:00,58423,3.125,14,398.3,5028.0,12,0,1990-01-01,4409.0,6.0,8.0,2922.5,875.0,420.0,192.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
df.select_dtypes(include=['object']).columns

Index([], dtype='object')

Change the data type of the columns to datetime

In [13]:
df['Year_horse'] = pd.to_datetime(df['Year_horse'])
df['Year_race'] = pd.to_datetime(df['Year_race'])   
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S')


# Scaling Numerical Features

In [16]:
numeric_features = df.select_dtypes(include=['float64']).columns

# Scale the data
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])
df.head()

Unnamed: 0,rid,horseName,age,saddle,decimalPrice,isFav,trainerName,jockeyName,position,positionL,dist,weightSt,weightLb,RPR,TR,OR,father,mother,gfather,runners,margin,weight,res_win,res_place,Year_horse,course,time,title,distance,condition,winningTime,metric,countryCode,class,Year_race,total_prize,min_age,max_age,prize_1,prize_2,prize_3,prize_4,prize_5,prize_6,prize_7,prize_8,prize_9,prize_10,prize_11,prize_12,prize_13,prize_14,prize_15,prize_16,prize_17,prize_18
0,271018,67106,22,0,484,False,14854,4059,1,1,1.970662e-16,7,0,1.516586,1.987373,-3.099521e-15,31,113321,9419,12,1.745543,69,True,True,1990-01-01,115,1900-01-01 03:15:00,58423,2.209084,14,2.138635,2.090976,12,0,1990-01-01,-0.193147,2.868113,-1.374619,-0.184258,-0.185778,-0.183383,-0.165479,-0.149927,-0.087228,-0.054664,-0.045205,-0.02275,-0.019905,-0.007656,-0.006707,-0.017654,-0.014583,-0.009219,-0.008793,-0.007229,-0.005716
1,271018,280298,22,0,377,False,2658,16080,2,12,1.970662e-16,7,0,1.124598,1.70288,-3.099521e-15,6777,2947,5868,12,1.745543,69,False,True,1990-01-01,115,1900-01-01 03:15:00,58423,2.209084,14,2.138635,2.090976,12,0,1990-01-01,-0.193147,2.868113,-1.374619,-0.184258,-0.185778,-0.183383,-0.165479,-0.149927,-0.087228,-0.054664,-0.045205,-0.02275,-0.019905,-0.007656,-0.006707,-0.017654,-0.014583,-0.009219,-0.008793,-0.007229,-0.005716
2,271018,164632,23,0,218,False,6074,16041,3,82,0.3487421,6,7,0.536616,0.896818,-3.099521e-15,9167,67504,550,12,1.745543,66,False,True,1990-01-01,115,1900-01-01 03:15:00,58423,2.209084,14,2.138635,2.090976,12,0,1990-01-01,-0.193147,2.868113,-1.374619,-0.184258,-0.185778,-0.183383,-0.165479,-0.149927,-0.087228,-0.054664,-0.045205,-0.02275,-0.019905,-0.007656,-0.006707,-0.017654,-0.014583,-0.009219,-0.008793,-0.007229,-0.005716
3,271018,200213,24,0,377,False,13635,6371,4,117,1.458126,7,0,-0.24736,0.612325,-3.099521e-15,5828,63323,2475,12,1.745543,69,False,False,1990-01-01,115,1900-01-01 03:15:00,58423,2.209084,14,2.138635,2.090976,12,0,1990-01-01,-0.193147,2.868113,-1.374619,-0.184258,-0.185778,-0.183383,-0.165479,-0.149927,-0.087228,-0.054664,-0.045205,-0.02275,-0.019905,-0.007656,-0.006707,-0.017654,-0.014583,-0.009219,-0.008793,-0.007229,-0.005716
4,271018,347084,24,0,277,False,16945,15076,5,255,3.122202,7,0,0.0,-0.335984,-3.099521e-15,3184,129340,5464,12,1.745543,69,False,False,1990-01-01,115,1900-01-01 03:15:00,58423,2.209084,14,2.138635,2.090976,12,0,1990-01-01,-0.193147,2.868113,-1.374619,-0.184258,-0.185778,-0.183383,-0.165479,-0.149927,-0.087228,-0.054664,-0.045205,-0.02275,-0.019905,-0.007656,-0.006707,-0.017654,-0.014583,-0.009219,-0.008793,-0.007229,-0.005716


In [4]:
X = df.drop(['res_win', 'res_place'], axis=1)
y1 = df['res_win']
y2 = df['res_place']

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df[['res_win', 'res_place']], test_size=0.2, random_state=42)

# Initialize a base classifier
base_model = RandomForestClassifier(random_state=42)

# Wrap the classifier in MultiOutputClassifier
multi_target_model = MultiOutputClassifier(base_model)

# Train the model
multi_target_model.fit(X_train, y_train)

# Make predictions
y_pred = multi_target_model.predict(X_test)

# Evaluate the model
print("Accuracy (res_win):", accuracy_score(y_test['res_win'], y_pred[:, 0]))
print("Accuracy (res_place):", accuracy_score(y_test['res_place'], y_pred[:, 1]))
print("Classification Report (res_win):\n", classification_report(y_test['res_win'], y_pred[:, 0]))
print("Classification Report (res_place):\n", classification_report(y_test['res_place'], y_pred[:, 1]))

In [9]:
# Load dataset (replace with your data)
data = pd.read_csv("dataset.csv")

# Exploratory Data Analysis (EDA)
print(data.head())
print(data.info())

# Splitting data into features and target
X = data.drop("target_column", axis=1)  # Replace 'target_column' with your target variable
y = data["target_column"]

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a simple Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Saving the model
joblib.dump(model, "random_forest_model.pkl")

ValueError: only leading negative signs are allowed

In [8]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, target1, test_size=0.3, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, target2, test_size=0.3, random_state=42)

# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_pred2 = log_reg.predict(X_test2)

# Confusion Matrix
confusion_matrix(y_test, y_pred)
confusion_matrix(y_test2, y_pred2)

# Classification Report
print(classification_report(y_test, y_pred))
print(classification_report(y_test2, y_pred2))

ValueError: could not convert string to float: '2013-01-01'