In [101]:
import saspy
sas_session = saspy.SASsession()
sas_session

Using SAS Config named: oda
SAS Connection established. Subprocess id is 2366



Access Method         = IOM
SAS Config name       = oda
SAS Config file       = /home/armando/.virtualenvs/school-54vk9BCB/lib/python3.8/site-packages/saspy/sascfg_personal.py
WORK Path             = /saswork/SAS_work9170000187CB_odaws03-usw2.oda.sas.com/SAS_work6509000187CB_odaws03-usw2.oda.sas.com/
SAS Version           = 9.04.01M6P11072018
SASPy Version         = 4.4.0
Teach me SAS          = False
Batch                 = False
Results               = Pandas
SAS Session Encoding  = utf-8
Python Encoding value = utf-8
SAS process Pid value = 100299


In [102]:
%%SAS sas_session
libname cortex '~/my_shared_file_links/u39842936/Cortex Data Sets';


In [103]:
from scipy.stats import pearsonr
import pandas as pd
import random 
from scipy.stats import describe, pearsonr, zscore, f_oneway, yeojohnson, shapiro, probplot, levene
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

def explore(df: pd.DataFrame) -> pd.DataFrame:
    ex1, ex2, ex3 = random.sample(range(len(df)), 3)

    print("Dataframe total rows: ", len(df))
    df_info = pd.DataFrame(data = df.dtypes)
    not_missing_values_total = df.notnull().sum()
    not_missing_values_percent = round(not_missing_values_total/len(df)*100,2).astype(str)+" %"

    return pd.concat([df_info[0].rename("Data Type"),\
            df.T[ex1].rename("Example 1"),\
            df.T[ex2].rename("Example 2"),\
            df.T[ex3].rename("Example 3"),\
            not_missing_values_total.rename("Total Not Missing"), \
            not_missing_values_percent.rename("% of not missing values")], axis=1)
    
def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    """ Perform a forward-backward feature selection 
    based on p-value from statsmodels.api.OLS
    Arguments:
        X - pandas.DataFrame with candidate features
        y - list-like with the target
        initial_list - list of features to start with (column names of X)
        threshold_in - include a feature if its p-value < threshold_in
        threshold_out - exclude a feature if its p-value > threshold_out
        verbose - whether to print the sequence of inclusions and exclusions
    Returns: list of selected features 
    Always set threshold_in < threshold_out to avoid infinite looping.
    See https://en.wikipedia.org/wiki/Stepwise_regression for the details
    """
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded, dtype='float64')
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included    

def calculate_pvalues(df):
    df = df.dropna()._get_numeric_data()
    dfcols = pd.DataFrame(columns=df.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            pvalues[r][c] = round(pearsonr(df[r], df[c])[1], 4)
    return pvalues

In [104]:
#data1 = sas_session.sasdata2dataframe(
#table='hist',
#libref='cortex'
#)

#data2 = sas_session.sasdata2dataframe(
#table='target_rd1',
#libref='cortex'
#)

## Merge the Data

In [105]:
#data_merge = pd.merge(data1, data2, on=["ID"],how="right")
#data_merge.head()
#data_merge.to_csv("og.csv", index= False)
data_merge = pd.read_csv("og.csv")
explore(data_merge)

Dataframe total rows:  149457


Unnamed: 0,Data Type,Example 1,Example 2,Example 3,Total Not Missing,% of not missing values
ID,float64,2373126.0,2307578.0,2422205.0,149457,100.0 %
LastName,int64,22995.0,28401.0,15446.0,149457,100.0 %
FirstName,int64,2954.0,3576.0,2056.0,149457,100.0 %
Woman,float64,1.0,0.0,1.0,149457,100.0 %
Age,float64,55.0,61.0,19.0,149457,100.0 %
Salary,float64,43100.0,10700.0,212600.0,149457,100.0 %
Education,int64,0.0,0.0,2.0,149457,100.0 %
City,int64,1.0,0.0,0.0,149457,100.0 %
SeniorList,float64,5.0,1.0,3.0,149457,100.0 %
NbActivities,float64,0.0,0.0,0.0,149457,100.0 %


In [106]:
# display(describe(data_merge.select_dtypes(include ='number'), axis=0))
# display(data_merge.describe())

In [107]:
# for col in data_merge.select_dtypes(include ='number').columns:
#     print(col)
#     conteo = data_merge.groupby(by=col).size().reset_index(name="count")
#     moda = conteo.max().values
#     print(f"moda: {moda[0]} por haber {moda[1]} datos")
#     print(f"rango: {data_merge[col].max()-data_merge[col].min()}")
#     unique_col = list(data_merge[col].unique())
#     print("Unique elements qty: " + str(len(unique_col)))
#     print("Elements example:  " + str(unique_col[:10]) + "\n")
#     conteo.plot.scatter(x = col, y="count", figsize=[10,5])

In [108]:
# display(data_merge.select_dtypes(include ='object').describe(include = 'O').T)

In [109]:
# for col in data_merge.select_dtypes(include ='object').drop(["LastName", "FirstName"], axis=1).columns:
#     conteo = data_merge.groupby(by=col).size().reset_index(name="count")
#     conteo.plot.bar(x = col, y="count", figsize=[10,5])

In [110]:
# Missing Value - NOT SMART WAY
var_types = {"number" : "median", "object" : "mode"}
for var_type in list(var_types.keys()):
    for col in data_merge.select_dtypes(include=var_type).columns:
        data_merge[col] = data_merge[col].fillna(getattr(data_merge[col], var_types[var_type])())
            
for col in data_merge.select_dtypes(include="object"):
    data_merge[col] = data_merge[col].astype('category')
    data_merge[col] = data_merge[col].cat.codes
    

In [111]:
# stepwise_selection(data_merge.drop(["ID", "AmtThisYear", "GaveThisYear", "LastName", "FirstName"], axis=1), data_merge["AmtThisYear"] )

In [112]:
from sklearn.ensemble import RandomForestRegressor
# from sklearn.feature_selection import SelectFromModel
# sel = SelectFromModel(RandomForestRegressor())
# X = data_merge.drop(["ID", "AmtThisYear", "GaveThisYear", "LastName", "FirstName"], axis=1)
# sel.fit(X, data_merge["AmtThisYear"])

# importances = sel.estimator_.feature_importances_
# names = sel.estimator_.feature_names_in_

# indices = np.argsort(importances)[::-1]

# plt.figure(figsize=(10,5))
# plt.title("Feature importances")
# plt.bar(range(X.shape[1]), importances[indices], color="r", align="center")
# plt.xticks(range(X.shape[1]), [names[i] for i in indices])
# plt.xlim([-1, X.shape[1]])
# plt.show()

In [113]:
# print("Feature Importances")
# [names[i] for i in indices]

In [114]:
# import seaborn as sns

# corr = data_merge.drop(["ID", "GaveThisYear", "LastName", "FirstName"], axis=1).corr()
# sns.heatmap(corr,  xticklabels=corr.columns,  yticklabels=corr.columns)

In [115]:
# corr["AmtThisYear"].sort_values(ascending=False)

In [116]:
# calculate_pvalues(data_merge.drop(["ID", "GaveThisYear", "LastName", "FirstName"], axis=1))["AmtThisYear"].sort_values(ascending=True)

In [117]:
# data_merge.drop(["ID", "AmtThisYear"], axis=1).select_dtypes(include ='number').plot.kde(bw_method=1, subplots=True, layout=(6,6), sharex=False, figsize=[20,7])
# plt.show()

In [118]:
from sklearn.preprocessing import PowerTransformer

power = PowerTransformer(method='yeo-johnson', standardize=True)
X= data_merge.drop(["ID", "AmtThisYear"], axis=1)
X_cols = X.columns
X = power.fit_transform(X)
X = pd.DataFrame(X, columns = X_cols) 


In [119]:
# X.select_dtypes(include ='number').plot.kde(bw_method=1, subplots=True, layout=(6,6), sharex=False, figsize=[20,7])
# plt.show()

In [120]:
# print("Duplicates analysis considering Nans")
# duplicates = X.groupby(X.columns.tolist(), as_index=False, dropna=False).size()
# duplicates[duplicates["size"]>1].head(10)  # Size columns tells how many of them are

In [121]:
selection = [
'Salary',
'Age', 
'SeniorList',
'TotalGift',
'City',
'Woman',
'MinGift',
'NbActivities']

cols = selection +  ["AmtThisYear"] #['Age', 'Salary','MinGift', 'AmtLastYear','Woman', 'NbActivities', 'AmtThisYear']

In [122]:
data = X
data["AmtThisYear"] = data_merge["AmtThisYear"]


data = data[cols] 
data = data[(np.abs(zscore(data.select_dtypes(include ='number'))) < 2).all(axis=1)]

In [123]:
data.head()

Unnamed: 0,Salary,Age,SeniorList,TotalGift,City,Woman,MinGift,NbActivities,AmtThisYear
0,-0.547943,1.548348,-0.251501,0.007054,0.555585,0.913746,0.041151,-0.891909,20.0
1,0.752731,-0.756934,-0.591196,0.007054,1.165399,-1.094396,0.041151,-0.891909,30.0
2,0.555725,0.462704,-1.505526,0.007054,-1.237981,0.913746,0.041151,-0.891909,20.0
3,-0.988896,0.366613,-1.505526,0.007054,1.165399,0.913746,0.041151,-0.891909,75.0
4,1.964278,-0.208536,0.322972,0.007054,0.555585,0.913746,0.041151,-0.891909,20.0


In [124]:
len(data)

116040

In [125]:
# stepwise_selection(data.drop(["AmtThisYear"], axis=1), data["AmtThisYear"] )

In [126]:
# sel = SelectFromModel(RandomForestRegressor())
# X = data.drop(["AmtThisYear"], axis=1)
# sel.fit(X, data["AmtThisYear"])

# importances = sel.estimator_.feature_importances_
# names = sel.estimator_.feature_names_in_

# indices = np.argsort(importances)[::-1]

# plt.figure(figsize=(10,5))
# plt.title("Feature importances")
# plt.bar(range(X.shape[1]), importances[indices], color="r", align="center")
# plt.xticks(range(X.shape[1]), [names[i] for i in indices])
# plt.xlim([-1, X.shape[1]])
# plt.show()

## Data Partition

In [127]:
# The code below is an illustration on how to sample data on train and validation samples.
# You could use another library or a built-in function to perform sampling.

from sklearn.model_selection import train_test_split
train, validation = train_test_split(data, test_size=0.4, random_state=12345) 

#train.head()
train.sample(2)

Unnamed: 0,Salary,Age,SeniorList,TotalGift,City,Woman,MinGift,NbActivities,AmtThisYear
79849,-0.214727,-1.414913,-0.990957,0.007054,0.555585,0.913746,0.041151,0.698454,25.0
101910,0.296549,0.869547,-1.505526,0.007054,-1.237981,-1.094396,0.041151,-0.891909,25.0


In [128]:
from sklearn.model_selection import (GridSearchCV, KFold, cross_validate,)
from sklearn import decomposition
from sklearn.pipeline import Pipeline

## Prebuilt Models
***

### **Linear Regression Model**


> The [sk-learn library](https://scikit-learn.org/stable/index.html ) offers more advanced models.


In [129]:
# from sklearn import linear_model

# X_train = train.drop("AmtThisYear", axis=1)
# Y_train = train['AmtThisYear']
# X_valid = validation.drop("AmtThisYear", axis=1)
# Y_valid = validation['AmtThisYear']

# regr = linear_model.LinearRegression()
# regr.fit(X_train,Y_train)
# regr_predict=regr.predict(X_valid)

In [130]:
# #you can change the criteria

# import numpy as np
# from sklearn import metrics
# #MAE
# print(metrics.mean_absolute_error(Y_valid,regr_predict))
# #MSE
# print(metrics.mean_squared_error(Y_valid,regr_predict))
# #RMSE
# print(np.sqrt(metrics.mean_squared_error(Y_valid,regr_predict)))

## **logistic regression**

In [131]:
# from sklearn.linear_model import LogisticRegression

# X_train = train.drop("AmtThisYear", axis=1)
# Y_train = train['AmtThisYear']
# X_valid = validation.drop("AmtThisYear", axis=1)
# Y_valid = validation['AmtThisYear']

# LG_model = LogisticRegression().fit(X_train,Y_train)

# LG_predict = LG_model.predict(X_valid) #Predictions on Testing data

In [132]:
# #you can change the criteria
# #MAE
# print(metrics.mean_absolute_error(Y_valid,LG_predict))
# #MSE
# print(metrics.mean_squared_error(Y_valid,LG_predict))
# #RMSE
# print(np.sqrt(metrics.mean_squared_error(Y_valid,LG_predict)))

## **Regression Tree Model**

In [133]:
# from sklearn.tree import DecisionTreeRegressor

# X_train = train.drop("AmtThisYear", axis=1)
# Y_train = train['AmtThisYear']
# X_valid = validation.drop("AmtThisYear", axis=1)
# Y_valid = validation['AmtThisYear']

# DT_model = DecisionTreeRegressor(max_depth=5).fit(X_train,Y_train)

# DT_predict = DT_model.predict(X_valid) #Predictions on Testing data


In [134]:
# #you can change the criteria
# #MAE
# print(metrics.mean_absolute_error(Y_valid,DT_predict))
# #MSE
# print(metrics.mean_squared_error(Y_valid,DT_predict))
# #RMSE
# print(np.sqrt(metrics.mean_squared_error(Y_valid,DT_predict)))

## **random forest**

In [135]:
# X_train = train.drop("AmtThisYear", axis=1)
# Y_train = train['AmtThisYear']
# X_valid = validation.drop("AmtThisYear", axis=1)
# Y_valid = validation['AmtThisYear']

# RF_model = RandomForestRegressor().fit(X_train,Y_train)

# RF_predict = RF_model.predict(X_valid) #Predictions on Testing data

In [136]:
# #you can change the criteria
# #MAE
# print(metrics.mean_absolute_error(Y_valid,RF_predict))
# #MSE
# print(metrics.mean_squared_error(Y_valid,RF_predict))
# #RMSE
# print(np.sqrt(metrics.mean_squared_error(Y_valid,RF_predict)))

## **XGBOOST**

In [137]:
#!pip install xgboost

In [138]:
# from xgboost import XGBRegressor

X_train = train.drop("AmtThisYear", axis=1)
Y_train = train['AmtThisYear']
X_valid = validation.drop("AmtThisYear", axis=1)
Y_valid = validation['AmtThisYear']

# XG_model = XGBRegressor().fit(X_train,Y_train)

# XG_predict = XG_model.predict(X_valid) #Predictions on Testing data


In [139]:
# from sklearn import metrics
# #you can change the criteria
# #MAE
# print(metrics.mean_absolute_error(Y_valid,XG_predict))
# #MSE
# print(metrics.mean_squared_error(Y_valid,XG_predict))
# #RMSE
# print(np.sqrt(metrics.mean_squared_error(Y_valid,XG_predict)))

In [140]:
# !pip install optuna

In [141]:
import xgboost as xgb
import numpy as np
import optuna
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from optuna.samplers import TPESampler
from xgboost import XGBRegressor
from optuna.integration import XGBoostPruningCallback
from sklearn.model_selection import RepeatedKFold
from optuna import create_study

def objective(
    trial,
    X,
    y,
    random_state=22,
    n_splits=3,
    n_repeats=2,
    n_jobs=1,
    early_stopping_rounds=100,
):
    # XGBoost parameters
    params = {
        "verbosity": 0,  # 0 (silent) - 3 (debug)
        "objective": "reg:squarederror",
        "n_estimators": 100,
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.00001, 0.01),
        "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
        "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
        "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 10.0),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
        "seed": random_state,
        "n_jobs": n_jobs,
    }

    model = XGBRegressor(**params)
    pruning_callback = XGBoostPruningCallback(trial, "validation_0-rmse")
    rkf = RepeatedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    X_values = X.values
    y_values = y.values
    y_pred = np.zeros_like(y_values)
    for train_index, test_index in rkf.split(X_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A, y_B = y_values[train_index], y_values[test_index]
        model.fit(
            X_A,
            y_A,
            eval_set=[(X_B, y_B)],
            eval_metric="rmse",
            verbose=0,
            callbacks=[pruning_callback],
            early_stopping_rounds=early_stopping_rounds,
        )
        y_pred[test_index] += model.predict(X_B)
    y_pred /= n_repeats
    return np.sqrt(mean_squared_error(Y_train, y_pred))

In [142]:
sampler = TPESampler(seed=124, multivariate=True)
study = create_study(direction="minimize", sampler=sampler)
study.optimize(
    lambda trial: objective(
        trial,
        X_train,
        Y_train,
        random_state=124,
        n_splits=10,
        n_repeats=1,
        n_jobs=12,
        early_stopping_rounds=100,
    ),
    n_trials=2,
    n_jobs=1,
)

# display params
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

[32m[I 2022-12-02 05:34:14,405][0m A new study created in memory with name: no-name-b0fdf36d-19ce-4fe9-831b-d0e10b6dbd26[0m
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.00001, 0.01),
  "colsample_bytree": trial.suggest_loguniform("colsample_bytree", 0.2, 0.6),
  "subsample": trial.suggest_loguniform("subsample", 0.4, 0.8),
  "alpha": trial.suggest_loguniform("alpha", 0.01, 10.0),
  "lambda": trial.suggest_loguniform("lambda", 1e-8, 10.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 10.0),
  "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
[32m[I 2022-12-02 05:34:28,778][0m Trial 0 finished with value: 75.66165357057007 and parameters: {'max_depth': 4, 'learning_rate': 0.0017235125679566851, 'colsample_bytree': 0.375053263116791, 'subsample': 0.5495463565688764, 'alpha': 0.14259940359422604, 'lambda': 2.9235044300356874e-06, 'gamma': 0.010272157854456487, 'min_child_weight': 61.794528315016734}. Best is trial 0 with value: 75.6616

           max_depth : 6
       learning_rate : 0.002186548505763493
    colsample_bytree : 0.38625183421563675
           subsample : 0.4355826100654535
               alpha : 0.1474762542832591
              lambda : 5.955184543677973
               gamma : 0.00016158093114227923
    min_child_weight : 19.06965408897461
best objective value : 74.8400737080225


In [143]:
hp["verbosity"] = 0
hp["objective"] = "reg:squarederror"
hp["n_estimators"] = 100
hp["seed"] = 124
hp["n_jobs"] = 12
model = XGBRegressor(**hp)
rkf = RepeatedKFold(n_splits=10, n_repeats=1, random_state=124)
X_values = X_train.values
y_values = Y_train.values
y_pred = np.zeros_like(Y_valid.values)
for train_index, test_index in rkf.split(X_values):
    X_A, X_B = X_values[train_index, :], X_values[test_index, :]
    y_A, y_B = y_values[train_index], y_values[test_index]
    model.fit(
        X_A,
        y_A,
        eval_set=[(X_B, y_B)],
        eval_metric="rmse",
        early_stopping_rounds=100,
        verbose=0,
    )
    y_pred += model.predict(X_valid.values)
y_pred /= 1 * 10



In [144]:
#you can change the criteria
#MAE
print(metrics.mean_absolute_error(Y_valid,y_pred))
#MSE
print(metrics.mean_squared_error(Y_valid,y_pred))
#RMSE
print(np.sqrt(metrics.mean_squared_error(Y_valid,y_pred)))

36.09127002596464
5671.264705954713
75.30779976838198


### **Other models may also be helpful for this game**

Reference: https://scikit-learn.org/stable/supervised_learning.html

***


## Scoring New Data

### Prepare data for scoring

In [145]:
#data3 = sas_session.sasdata2dataframe(
#table='score_rd1',
#libref='cortex'
#)
#data4 = sas_session.sasdata2dataframe(
#table='score',
#libref='cortex'
#)

 ### Score new data based on your champion model
 
> Pick your champion model from previous steps and use it to predict next year donations. 
 
> In this case, the linear regression model performed better than the regression tree based on the MSE criterion.

In [146]:
#scoring_data = pd.merge(data3, data4, on=["ID"],how="right")
#scoring_data.head()
#scoring_data.to_csv("score.csv", index= False)

scoring_data = pd.read_csv("score.csv")

In [147]:

# Perform the same strategy for handling missing values for the score dataset.
# In this case, we will only replace missing values of the MinGift variable.

var_types = {"number" : "median", "object" : "mode"}
for var_type in list(var_types.keys()):
    for col in scoring_data.select_dtypes(include=var_type).columns:
        scoring_data[col] = scoring_data[col].fillna(getattr(scoring_data[col], var_types[var_type])())

for col in scoring_data.select_dtypes(include="object"):
    scoring_data[col] = scoring_data[col].astype('category')
    scoring_data[col] = scoring_data[col].cat.codes

X = scoring_data[selection]


power = PowerTransformer(method='yeo-johnson', standardize=True)
X_cols = X.columns
X = power.fit_transform(X)
X = pd.DataFrame(X, columns = X_cols) 
X.head()

Unnamed: 0,Salary,Age,SeniorList,TotalGift,City,Woman,MinGift,NbActivities
0,0.891117,-1.164335,-0.673512,3.117426,-1.256986,-1.034466,-2.375866,-0.709013
1,-0.878737,-0.346822,-0.027035,-0.009389,0.503416,0.966683,-0.012385,1.253741
2,0.212797,-0.40417,0.254729,-0.009389,0.503416,0.966683,-0.012385,-0.709013
3,-0.574869,1.524672,-0.33334,-0.009389,0.503416,0.966683,-0.012385,-0.709013
4,0.444567,-0.581024,1.000481,-0.009389,0.503416,0.966683,-0.012385,-0.709013


In [148]:
# In this case, based on MSE (Mean Squared Error) criterion,
# the linear regression model performed better than the regression tree.

regr_predict_end=model.predict(X)

scoring_data['Prediction'] = regr_predict_end
scoring_data.sort_values(by=['Prediction'], inplace=True,ascending=False)
scoring_data.head()

Unnamed: 0,ID,GaveLastYear,AmtLastYear,LastName,FirstName,Woman,Age,Salary,Education,City,SeniorList,NbActivities,Referrals,Recency,Frequency,Seniority,TotalGift,MinGift,MaxGift,Prediction
326105,2326106.0,0.0,0.0,5097,467,1.0,30.0,69300.0,2,1,5.0,2.0,0.0,1.0,2.0,5.0,1525.0,25.0,1500.0,11.367042
211752,2211753.0,0.0,0.0,88504,116,1.0,37.0,69300.0,2,1,7.0,1.0,2.0,1.0,2.0,2.0,550.0,150.0,400.0,11.344551
968158,2968159.0,1.0,25.0,6622,2293,1.0,35.0,71400.0,2,0,6.0,2.0,1.0,0.0,6.0,6.0,1715.0,25.0,600.0,11.314064
689592,2689593.0,0.0,0.0,44691,3759,1.0,38.0,215400.0,2,1,4.0,1.0,0.0,0.0,1.0,0.0,75.0,75.0,75.0,11.309727
996839,2996840.0,0.0,0.0,75073,1927,1.0,36.0,69600.0,2,1,4.0,0.0,1.0,1.0,1.0,1.0,750.0,750.0,750.0,11.30183


## Exporting Results to a CSV File

In [149]:
Result= scoring_data[['ID']]
#Result.to_csv('Round1_Output.csv', index=False)

In [150]:
# Define your cutoff and choose a number of rows to submit to the leaderboard

NB = 90000
submission = Result.head(NB)
submission = submission.astype({'ID':'int'})
submission.to_csv('Round1 Output.csv', index=False)