In [1]:
!ls

cat_kfolds.ipynb  cat_train.csv        Untitled.ipynb
cat_test.csv	  cat_train_folds.csv


In [None]:
import pandas as pd

from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing

def run(fold, data_path="cat_train_folds.csv"):
    """Train and score a one-hot + logistic-regression model on one fold.

    The folds file is read from ``data_path``. Rows whose ``kfold`` value
    equals ``fold`` form the validation set; all other rows form the
    training set. Every column other than ``id``, ``kfold`` and ``target``
    is treated as a categorical feature.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to hold out for validation.
    data_path : str, optional
        Path of the CSV file that contains the ``kfold`` column.

    Returns
    -------
    float
        ROC AUC on the validation fold (also printed).

    Raises
    ------
    ValueError
        If the requested fold leaves the training or validation set empty.
    """
    df = pd.read_csv(data_path)

    # everything except the identifier, fold label and target is a feature
    features = [f for f in df.columns if f not in ("id", "kfold", "target")]

    # Fill NaN values with "NONE" and convert every feature to string, since
    # all of them are treated as categories. The fill must come BEFORE the
    # cast: astype(str) turns NaN into the literal "nan", after which
    # fillna has nothing left to fill. Plain column assignment (rather than
    # df.loc[:, col] = ...) lets the column change dtype cleanly.
    for col in features:
        df[col] = df[col].fillna("NONE").astype(str)

    # get training and validation data using folds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    if df_train.empty or df_valid.empty:
        raise ValueError(
            f"fold {fold!r} leaves an empty training or validation set"
        )

    # Fit the one-hot encoder on training + validation features together so
    # that every category seen in either split is known to the encoder.
    ohe = preprocessing.OneHotEncoder()
    ohe.fit(df[features])

    # transform training and validation data
    x_train = ohe.transform(df_train[features])
    x_valid = ohe.transform(df_valid[features])

    # fit logistic regression on the one-hot encoded training data
    model = linear_model.LogisticRegression()
    model.fit(x_train, df_train.target.values)

    # predict on validation data;
    # we need probability values as we are calculating AUC,
    # so take the probability of the positive class (column 1)
    valid_preds = model.predict_proba(x_valid)[:, 1]

    # get roc auc score
    auc = metrics.roc_auc_score(df_valid.target.values, valid_preds)

    # print auc and hand it back so callers can aggregate across folds
    print(auc)
    return auc
    
if __name__ == "__main__":
    # train and evaluate on the first fold only (fold id 0)
    run(fold=0)