In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import FunctionTransformer

In [2]:
df = pd.read_csv('Horse_Win.csv')
df.head()

Unnamed: 0,Tipster,Date,Track,Horse,Bet Type,Odds,Result,TipsterActive
0,Tipster A,24-07-2015,Ascot,Fredricka,Win,8.0,Lose,True
1,Tipster A,24-07-2015,Thirsk,Spend A Penny,Win,4.5,Lose,True
2,Tipster A,24-07-2015,York,Straightothepoint,Win,7.0,Lose,True
3,Tipster A,24-07-2015,Newmarket,Miss Inga Sock,Win,5.0,Lose,True
4,Tipster A,25-07-2015,Ascot,Peril,Win,4.33,Win,True


In [3]:
df.shape

(38248, 8)

In [4]:
count = 0
for i in df['Horse']:
    if i == "Fredricka":
        count = count + 1
print(count)

8


In [5]:
df.isna().sum()

Tipster          0
Date             0
Track            0
Horse            0
Bet Type         0
Odds             0
Result           0
TipsterActive    0
dtype: int64

In [6]:
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

In [7]:
df['date_year'] = df['Date'].dt.year

In [8]:
df['date_month_no'] = df['Date'].dt.month

In [9]:
df['date_day'] = df['Date'].dt.day

In [10]:
df['date_dow'] = df['Date'].dt.dayofweek

In [11]:
df['date_is_weekend'] = np.where(df['date_dow'].isin([0,1]), 1,0)

In [12]:
df

Unnamed: 0,Tipster,Date,Track,Horse,Bet Type,Odds,Result,TipsterActive,date_year,date_month_no,date_day,date_dow,date_is_weekend
0,Tipster A,2015-07-24,Ascot,Fredricka,Win,8.00,Lose,True,2015,7,24,4,0
1,Tipster A,2015-07-24,Thirsk,Spend A Penny,Win,4.50,Lose,True,2015,7,24,4,0
2,Tipster A,2015-07-24,York,Straightothepoint,Win,7.00,Lose,True,2015,7,24,4,0
3,Tipster A,2015-07-24,Newmarket,Miss Inga Sock,Win,5.00,Lose,True,2015,7,24,4,0
4,Tipster A,2015-07-25,Ascot,Peril,Win,4.33,Win,True,2015,7,25,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38243,Tipster E1,2016-04-02,Kempton,Solar Flair,Win,7.00,Lose,False,2016,4,2,5,0
38244,Tipster E1,2016-04-02,Doncaster,Express Himself,Each Way,12.00,Lose,False,2016,4,2,5,0
38245,Tipster E1,2016-04-02,Doncaster,Jack Dexter,Win,7.00,Lose,False,2016,4,2,5,0
38246,Tipster E1,2016-04-02,Kelso,Just Cameron,Win,4.33,Lose,False,2016,4,2,5,0


In [13]:
df = df.drop(columns=['Date'],axis=1)

In [14]:
df

Unnamed: 0,Tipster,Track,Horse,Bet Type,Odds,Result,TipsterActive,date_year,date_month_no,date_day,date_dow,date_is_weekend
0,Tipster A,Ascot,Fredricka,Win,8.00,Lose,True,2015,7,24,4,0
1,Tipster A,Thirsk,Spend A Penny,Win,4.50,Lose,True,2015,7,24,4,0
2,Tipster A,York,Straightothepoint,Win,7.00,Lose,True,2015,7,24,4,0
3,Tipster A,Newmarket,Miss Inga Sock,Win,5.00,Lose,True,2015,7,24,4,0
4,Tipster A,Ascot,Peril,Win,4.33,Win,True,2015,7,25,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
38243,Tipster E1,Kempton,Solar Flair,Win,7.00,Lose,False,2016,4,2,5,0
38244,Tipster E1,Doncaster,Express Himself,Each Way,12.00,Lose,False,2016,4,2,5,0
38245,Tipster E1,Doncaster,Jack Dexter,Win,7.00,Lose,False,2016,4,2,5,0
38246,Tipster E1,Kelso,Just Cameron,Win,4.33,Lose,False,2016,4,2,5,0


In [15]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Result']),df['Result'],test_size=0.2,random_state=42)

In [16]:
X_train

Unnamed: 0,Tipster,Track,Horse,Bet Type,Odds,TipsterActive,date_year,date_month_no,date_day,date_dow,date_is_weekend
4812,Tipster E,Yarmouth,Reaffirmed,Win,5.0,True,2014,6,26,3,0
34052,Tipster B1,Ascot,Polarisation,Win,9.0,False,2015,9,5,5,0
23400,Tipster X,Taunton,Elsafeer,Win,7.0,True,2013,11,28,3,0
21464,Tipster T,Newmarket,Riflescope,Win,26.0,True,2016,7,7,3,0
18200,Tipster R,Uttoxeter,Catching On,Win,5.0,True,2015,3,14,5,0
...,...,...,...,...,...,...,...,...,...,...,...
6265,Tipster E,Chepstow,Lac Sacre,Win,5.5,True,2015,5,19,1,1
11284,Tipster J,Shatin,Nuovo Record,Win,13.0,True,2016,4,24,6,0
38158,Tipster E1,SouthWell,Alco Sivola,Win,6.0,False,2015,10,22,3,0
860,Tipster A,Newmarket,Dougan,Win,7.5,True,2016,5,1,6,0


In [17]:
y_train

4812     Lose
34052    Lose
23400    Lose
21464    Lose
18200    Lose
         ... 
6265     Lose
11284    Lose
38158    Lose
860      Lose
15795    Lose
Name: Result, Length: 30598, dtype: object

In [18]:
df['Tipster'].unique()

array(['Tipster A', 'Tipster B', 'Tipster C', 'Tipster D', 'Tipster E',
       'Tipster F', 'Tipster G', 'Tipster H', 'Tipster I', 'Tipster J',
       'Tipster K', 'Tipster L', 'Tipster M', 'Tipster N', 'Tipster O',
       'Tipster P', 'Tipster Q', 'Tipster R', 'Tipster S', 'Tipster T',
       'Tipster U', 'Tipster V', 'Tipster W', 'Tipster X', 'Tipster Y',
       'Tipster Z', 'Tipster A1', 'Tipster B1', 'Tipster C1',
       'Tipster D1', 'Tipster E1'], dtype=object)

In [19]:
df['Track'].unique()

array(['Ascot', 'Thirsk', 'York', 'Newmarket', 'Newcastle', 'Lingfield',
       'Carlisle', 'Pontefract', 'Uttoxeter', 'Hamilton', 'Chepstow',
       'Salisbury', 'Catterick', 'Chelmsford', 'Brighton',
       'Wolverhampton', 'Newbury', 'Market Rasen', 'Tramore', 'Leicester',
       'Ffos Las', 'Worcester', 'Sandown', 'Deauville', 'Curragh',
       'Stratford', 'Goodwood', 'Beverley', 'Doncaster', 'Chester',
       'Leopardstown', 'Bath', 'Yarmouth', 'Ayr', 'Kempton', 'Haydock',
       'Nottingham', 'Dundalk', 'Sedgefield', 'Huntingdon', 'Wetherby',
       'Wincanton', 'Fakenham', 'Cork', 'Naas', 'Windsor', 'Exeter',
       'Galway', 'Navan', 'Ludlow', 'Cheltenham', 'Aintree', 'Listowel',
       'Limerick', 'Plumpton', 'Warwick', 'Thurles', 'Towcester',
       'Musselburgh', 'Hexham', 'Taunton', 'Punchestown', 'Fontwell',
       'SouthWell', 'Fairyhouse', 'Bangor', 'Tipperary', 'Longchamp',
       'Meydan', 'Gowran Park', 'Cartmel', 'Down Royal', 'Kelso',
       'Downpatrick', 'Ripon',

In [20]:
df['Horse'].unique()

array(['Fredricka', 'Spend A Penny', 'Straightothepoint', ...,
       'Connetable', 'Allchilledout', 'Dream Farr'], dtype=object)

In [21]:
df['Bet Type'].unique()

array(['Win', 'Each Way', 'win'], dtype=object)

In [22]:
df['Bet Type'] = df['Bet Type'].replace("win","Win")

In [23]:
df['TipsterActive'].unique()

array([ True, False])

In [24]:
df['Result'].unique()

array(['Lose', 'Win'], dtype=object)

In [25]:
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()
y_trans = label_encoder.fit_transform(y_train)

In [26]:
def encode_column(column):
    series_column = column.squeeze()  
    enc_column = series_column.map(series_column.value_counts() / len(series_column))
    return enc_column.to_frame()

In [27]:
!pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn -U

Looking in indexes: https://pypi.org/simple, https://pypi.anaconda.org/scipy-wheels-nightly/simple


In [28]:
from sklearn.compose import ColumnTransformer
trf1 = ColumnTransformer([
    ('Horse_encode', FunctionTransformer(encode_column, validate=False), ['Horse']),
    ('Track_encode', FunctionTransformer(encode_column, validate=False), ['Track']),
    ('Tipster_encode', FunctionTransformer(encode_column, validate=False), ['Tipster']),
    ('ohe',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[3,6]),    
], remainder ='passthrough').set_output(transform="pandas")

In [29]:
from sklearn.preprocessing import StandardScaler
trf2 = ColumnTransformer([
    ('scale',StandardScaler(),slice(0,13))
])

In [30]:
clf = LogisticRegression()

In [31]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('clf',clf)
])

In [32]:
pipe.fit(X_train,y_train)

In [33]:
from sklearn import set_config
set_config(display='diagram')

In [34]:
y_pred = pipe.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.8005228758169934

In [36]:
y_pred1 = pipe.predict(X_train)

In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train,y_pred1)

0.79848356101706

In [38]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.7980587733268389

In [39]:
param_grid = {
    "clf__penalty": ['l1', 'l2'],
    "clf__C": [0.001, 0.01, 0.1, 1, 10, 100]
}

In [40]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DRITI SINGHANIA\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DRITI SINGHANIA\anaconda3\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\DRITI SINGHANIA\anaconda3\lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\DRITI SINGHANIA\anaconda3\lib\site-packages\sklearn\base.py", line 1474, i

In [41]:
from sklearn import set_config
set_config(display='diagram')

In [42]:
grid.best_score_

0.7986142914975

In [43]:
grid.best_params_

{'clf__C': 0.001, 'clf__penalty': 'l2'}

In [44]:
# export 
import pickle
pickle.dump(pipe,open('lr_pipe5.pkl','wb'))