In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn import base
import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the engineered match features and discard the first column of the CSV
# (presumably a saved index column -- confirm against the file).
feat_df = pd.read_csv('/kaggle/input/ipl-features-eng/matches_fat_eng.csv').iloc[:, 1:]

# Name of the label column the models predict.
target = 'winner'
# Hold-out fraction; only the commented-out random split in this notebook refers to it.
test_size = 0.2
# Columns removed from the feature frame before modelling.
cols_to_drop = [
    'date',
    'id',
    'result',
    'dl_applied',
    'win_by_runs',
    'win_by_wickets',
    'player_of_match',
]

In [None]:
# Build the modelling frame in one pass: drop the unused columns, replace
# missing values with 0, and express population in millions.
feat_df_training = (
    feat_df
    .drop(columns=cols_to_drop)
    .fillna(0, axis=0)
    .assign(population=lambda frame: frame['population'] / 10**6)
)
feat_df_training.head()

In [None]:
def get_x_y(df,target):
    """Separate a frame into its feature columns and its label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the label column.
    target : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without ``target`` and ``y`` is the
        ``target`` column. ``df`` itself is left unchanged.
    """
    labels = df[target]
    features = df.drop(columns=target)
    return features, labels

> As we have a small dataset, we have to make sure that the percentage of positive samples is equal across all the splits (train/dev/test).

In [None]:
def train_test_split_df(df,target,test_season,start_season=None):
    """Split ``df`` chronologically using its ``Season`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Season`` column and the ``target`` column.
    target : str
        Name of the label column.
    test_season : number
        Season whose rows form the test set. Only rows from strictly earlier
        seasons are used for training.
    start_season : number, optional
        When given, training rows are further limited to seasons greater than
        ``start_season - 1``. ``None`` (the default) keeps every earlier season.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    season = df['Season']
    train_mask = season < test_season
    # Identity check: `!= None` is dispatched to the operand's __ne__, which
    # can be overloaded (e.g. element-wise on arrays); `is not None` cannot.
    if start_season is not None:
        train_mask = train_mask & (season > start_season - 1)
    train = df[train_mask]
    test = df[season == test_season]
    X_train, y_train = get_x_y(train, target)
    X_test, y_test = get_x_y(test, target)
    return X_train, y_train, X_test, y_test

In [None]:
# def train_test_split(df,target,test_size):
#     positive_examples = df[df[target]== 1]
#     negative_examples = df[df[target] == 0]
    
#     train_p,test_p = train_test_split_df(positive_examples,test_size)
#     train_n,test_n = train_test_split_df(negative_examples,test_size)
    
#     train = pd.concat([train_p, train_n], ignore_index=True)
#     test = pd.concat([test_p, test_n], ignore_index=True)
#     return train,test
 

In [None]:
# Hold out the 2019 season as the test set; every earlier season is used for training.
X_train,y_train,X_test,y_test = train_test_split_df(feat_df_training,target,2019)

> Now let's implement cross-validation using the split method above.

In [None]:
def roll_over(df,target,model,loss_func,start_season,isclone=True):
    """Score ``model`` with a rolling, season-by-season evaluation.

    Each fold calls ``train_test_split_df`` with a training window that starts
    at season ``s`` and a test season of ``s + 2``. ``s`` begins at
    ``start_season`` and advances by one per fold until the test season would
    pass the latest season found in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Season`` column and the ``target`` column.
    target : str
        Name of the label column.
    model : object
        Anything exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of
        the returned probabilities is what gets scored.
    loss_func : callable
        Called as ``loss_func(y_test, predicted_probabilities)``.
    start_season : number
        First season of the first training window.
    isclone : bool, default True
        When True the local model is replaced by ``base.clone(model)`` after
        every fold, so the object passed in is fitted on the first fold only
        and later folds use clones of it.

    Returns
    -------
    The unweighted average of the per-fold losses.
    """
    window_start = start_season
    last_season = max(df.Season)
    fold_losses = []
    # The season right after the window start must still be below the last
    # season, otherwise there is no later season left to test on.
    while window_start + 1 < last_season:
        test_season = window_start + 2
        X_train, y_train, X_test, y_test = train_test_split_df(
            df, target, test_season, window_start
        )
        model.fit(X_train, y_train)
        positive_proba = model.predict_proba(X_test)[:, 1]
        fold_losses.append(loss_func(y_test, positive_proba))
        if isclone:
            model = base.clone(model)
        window_start += 1
    return np.average(fold_losses)


In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier using the liblinear solver with a fixed random_state.
clf = LogisticRegression(random_state=0,solver='liblinear')


In [None]:
from sklearn.metrics import log_loss
# Metric passed to roll_over, which calls it as loss_func(y_test, predicted_probabilities).
loss_func = log_loss

In [None]:
# Average rolling-season loss for the current clf; the first training window starts at season 2008.
roll_over(feat_df_training,target,clf,loss_func,2008)

In [None]:
import lightgbm as lgb
# Rebind clf to a LightGBM classifier constructed with its default arguments.
clf = lgb.LGBMClassifier()

In [None]:
# Average rolling-season loss for clf (the LightGBM classifier when cells are run in order),
# with the first training window starting at season 2008.
roll_over(feat_df_training,target,clf,loss_func,2008)

In [None]:
class Base_Model:
    """Lookup baseline that reuses the previous season's outcomes.

    For each test row the prediction is the fraction of training rows from the
    preceding season (``Season - 1``) with the same ``duals_sets`` value whose
    label equals 1. When no such training row exists the prediction is 0.5.
    """

    def __init__(self):
        self.X_train = None
        self.y_train = None

    def fit(self,X_train,y_train):
        """Store the training features and labels; nothing is estimated."""
        self.X_train = X_train
        self.y_train = y_train

    def predict_proba(self,X_test):
        """Return an ``(n_rows, 2)`` array of ``[P(label != 1), P(label == 1)]``.

        Parameters
        ----------
        X_test : pandas.DataFrame
            Rows to score; must contain ``Season`` and ``duals_sets`` columns.

        Returns
        -------
        numpy.ndarray
            One row per row of ``X_test``. Each row sums to 1, and the array
            is two-dimensional even when ``X_test`` is empty.
        """
        # assign() builds a new frame, so the caller's training frame does not
        # silently gain a 'target' column.
        history = self.X_train.assign(target=self.y_train)
        probs = []
        for _, row in X_test.iterrows():
            same_key = history[
                (history['Season'] == row['Season'] - 1)
                & (history['duals_sets'] == row['duals_sets'])
            ]
            if len(same_key) > 0:
                p_win = int((same_key['target'] == 1).sum()) / len(same_key)
            else:
                # No matching history: fall back to an uninformed 50/50 guess.
                p_win = 0.5
            probs.append([1 - p_win, p_win])
        # reshape keeps the (n, 2) layout when there are no rows to score.
        return np.array(probs, dtype=float).reshape(-1, 2)


In [None]:
# Score the previous-season lookup baseline with the same rolling evaluation.
# isclone=False makes roll_over skip base.clone; Base_Model defines only fit/predict_proba
# (presumably it would not be accepted by scikit-learn's clone -- confirm).
roll_over(feat_df_training,target,Base_Model(),loss_func,2008,False)

In [None]:
class Base_Model_2:
    """Constant baseline that predicts a probability of 0.5 for every row."""

    def __init__(self):
        self.X_train = None
        self.y_train = None

    def fit(self,X_train,y_train):
        """Store the training data; the predictions do not depend on it."""
        self.X_train = X_train
        self.y_train = y_train

    def predict_proba(self,X_test):
        """Return an ``(n_rows, 2)`` array filled with 0.5.

        Both class columns are 0.5 so every row is a valid probability
        distribution. The stored training frame is not touched.
        """
        return np.full((len(X_test), 2), 0.5)


In [None]:
# Score the constant-0.5 baseline with the same rolling evaluation.
# isclone=False makes roll_over skip base.clone for this plain Python class.
roll_over(feat_df_training,target,Base_Model_2(),loss_func,2008,False)

> So, by just predicting a probability of 0.5 every time, we get a log loss of 0.69 (that is, -ln(0.5) = ln 2 ≈ 0.693). Any useful model has to beat this.

In [None]:
# Table of clf's feature importances, sorted from largest to smallest.
# NOTE(review): roll_over fits the object passed in on the first fold only and then rebinds its
# local `model` to clones, so these importances appear to come from the first training window -- confirm.
pd.DataFrame({'Value':clf.feature_importances_,'Feature':clf.feature_name_}).sort_values(by="Value",ascending=False)

> We can get rid of all 