Our goal is to use this industrial scale dataset to predict if a customer will default in the future. We aim to put together a quick and rough solution.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn import metrics
import gc

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/amex-parquet/test_data.parquet
/kaggle/input/amex-parquet/train_data.parquet
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


This data set is too large to be loaded as a csv file. We convert the data from csv to parquet and then load the parquet format into a pandas dataframe. Next, we take a preview of our data.

In [1]:
import pandas as pd
df=pd.read_parquet('/kaggle/input/amex-parquet/train_data.parquet')
print(df.head())

                                         customer_ID         S_2       P_2  \
0  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-03-09  0.938469   
1  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-04-07  0.936665   
2  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-05-28  0.954180   
3  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-06-13  0.960384   
4  0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...  2017-07-16  0.947248   

       D_39       B_1       B_2       R_1       S_3      D_41       B_3  ...  \
0  0.001733  0.008724  1.006838  0.009228  0.124035  0.008771  0.004709  ...   
1  0.005775  0.004923  1.000653  0.006151  0.126750  0.000798  0.002714  ...   
2  0.091505  0.021655  1.009672  0.006815  0.123977  0.007598  0.009423  ...   
3  0.002455  0.013683  1.002700  0.001373  0.117169  0.000685  0.005531  ...   
4  0.002483  0.015193  1.000727  0.007605  0.117325  0.004653  0.009312  ...   

   D_137  D_138     D_139     D_140     D_141  D_1

We have about 5.5 million rows and 191 columns. There are a few columns that contain NaN values. We also note the customer ID and the date column. Lets split our labels from our dataset.

In [3]:
print(len(df.columns))
df_label = df.iloc[:,len(df.columns)-1]
df = df.iloc[:,0:len(df.columns)-1]
print(df_label)

191
0          0
1          0
2          0
3          0
4          0
          ..
5531446    0
5531447    0
5531448    0
5531449    0
5531450    0
Name: target, Length: 5531451, dtype: int64


Our dataframe contains over 5.5 million rows. It seems likely we will need to reduce the degrees of freedom due to our hardware limitations. Now lets take a look at our data types.

In [None]:
#inspecting data types
print(df.dtypes.value_counts())

There are four non-numeric data types. Let's convert these in order to use them in logistic regression model.

In [5]:
#visualizing columns

#hist = df1.hist(bins=10, figsize = (40,200), layout=(-1,4) )

Let's find the columns with a significant amount of NaN values. 

In [4]:
#inspecting NaN
temp = [column for column in df.columns if df[column].isnull().sum()/len(df) >= 0.9]
print(len(temp))

#drop columns with high freq of NaN
df.drop(temp, axis=1, inplace=True)

print( len(df.columns) )

18
173


In [8]:
#using only most recent transaction from each customer
temp = df.shape
df=df.set_index(['customer_ID'])
df=df.ffill()
df=df.bfill()
df=df.reset_index()

df=df.groupby('customer_ID').tail(1)
df=df.set_index(['customer_ID'])

#Drop date column since it is no longer useful
df.drop(['S_2'],axis=1,inplace=True)

print(temp, df.shape)

#inspecting NaN
print('Columns left with NaN:')
temp = [column for column in df.columns if df[column].isnull().sum()/len(df) >0]
print(len(temp))

(5531451, 173) (458913, 171)
Columns left with NaN:


In [9]:
df.head()
df = df.drop(['target'], axis=1)
keep = df.columns

In [10]:
print(df1.shape)
# Create correlation matrix
corr_matrix = df1.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find features with correlation greater than 0.9
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

# Drop features w/ high correl
df1.drop(to_drop, axis=1, inplace=True)

print(df1.shape)


(458913, 170)
(458913, 156)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [11]:
#Removing low variance columns in interest of ram
from sklearn.feature_selection import VarianceThreshold
from itertools import compress

temp = df1.drop(['D_63', 'D_64'], axis=1)

# Initialize and fit the method
vt = VarianceThreshold(threshold = float(0.1))
vt.fit(temp)

#columns with sufficient variance
keep = list(compress(temp.columns, vt.get_support()))

keep.append('D_63')
keep.append('D_64')

df1=df1[keep]

keep.append('customer_ID')
keep.append('S_2')
len(keep)

58

In [12]:
""""
#removing outliers
print(df1.shape)

df1 = df1[df1['R_6'] < df1['R_6'].quantile(0.97)]
print(df1['R_6'].max())
print(df1.shape)"""

'"\n#removing outliers\nprint(df1.shape)\n\ndf1 = df1[df1[\'R_6\'] < df1[\'R_6\'].quantile(0.97)]\nprint(df1[\'R_6\'].max())\nprint(df1.shape)'

In [13]:
"""#df1.iloc[:100000,7].value_counts()
print(df1.iloc[:,1].head())


#What type of variable for dates
df1['S_2'] = pd.to_datetime(df1['S_2'])
df1['S_2'] = pd.to_numeric(df1['S_2'])

#normalizing
#df1['S_2'] = (df1['S_2']-df1['S_2'].min())/(df1['S_2'].max() - df1['S_2'].min())
print(df1['S_2'].head())

df1['S_2'] = pd.to_timedelta(df1['S_2'])
print(df1.iloc[:,1].dt.total_seconds())
"""

"#df1.iloc[:100000,7].value_counts()\nprint(df1.iloc[:,1].head())\n\n\n#What type of variable for dates\ndf1['S_2'] = pd.to_datetime(df1['S_2'])\ndf1['S_2'] = pd.to_numeric(df1['S_2'])\n\n#normalizing\n#df1['S_2'] = (df1['S_2']-df1['S_2'].min())/(df1['S_2'].max() - df1['S_2'].min())\nprint(df1['S_2'].head())\n\ndf1['S_2'] = pd.to_timedelta(df1['S_2'])\nprint(df1.iloc[:,1].dt.total_seconds())\n"

In [14]:
#Hot ones
df1 = pd.get_dummies(df1)
print(df1.shape)
print(df1.columns)
print(df1['D_64_-1'].sum())
df1.drop(['D_64_-1'], axis=1, inplace = True)
print(df1.shape)

(458913, 64)
Index(['D_39', 'B_2', 'B_5', 'D_48', 'B_6', 'B_8', 'D_50', 'B_10', 'S_5',
       'S_6', 'B_12', 'R_5', 'D_60', 'D_61', 'D_65', 'B_16', 'B_17', 'B_18',
       'B_20', 'D_68', 'R_6', 'B_21', 'D_69', 'D_71', 'P_4', 'B_24', 'R_7',
       'B_26', 'D_78', 'R_8', 'S_16', 'R_10', 'D_81', 'R_14', 'D_84', 'B_30',
       'R_20', 'S_22', 'S_23', 'S_26', 'D_103', 'R_27', 'B_38', 'D_112',
       'B_40', 'S_27', 'D_114', 'D_117', 'D_120', 'D_126', 'D_128', 'D_129',
       'D_130', 'D_139', 'D_63_CL', 'D_63_CO', 'D_63_CR', 'D_63_XL', 'D_63_XM',
       'D_63_XZ', 'D_64_-1', 'D_64_O', 'D_64_R', 'D_64_U'],
      dtype='object')
1865
(458913, 63)


In [15]:
#Handling missing values
#my_imputer = SimpleImputer()
#df1.iloc[:,:] = my_imputer.fit_transform(df1.iloc[:,:])

In [16]:
X = df1.iloc[:, :].values.reshape(-1, len(df1.columns))
Y = df_label.iloc[:len(df1), 1].values.reshape(-1, 1)

In [17]:
"""
# create object for the class
log = LogisticRegression()
log.fit(X, Y) 
Y_pred = log.predict(X)

print(Y_pred, np.sum(Y_pred))
print(log.score(X, Y))"""

'\n# create object for the class\nlog = LogisticRegression()\nlog.fit(X, Y) \nY_pred = log.predict(X)\n\nprint(Y_pred, np.sum(Y_pred))\nprint(log.score(X, Y))'

In [18]:

#trying random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

model = RandomForestClassifier(n_estimators=400, max_features='sqrt', bootstrap=True, max_depth=30, min_samples_leaf=1, min_samples_split=5, n_jobs=-1)
#rf_random = GridSearchCV(estimator = rf, param_grid = random_grid, cv = 3, verbose=1, n_jobs = -1)
# Fit the random search model
model.fit(X,Y)



RandomForestClassifier(max_depth=30, max_features='sqrt', min_samples_split=5,
                       n_estimators=4, n_jobs=-1)

In [19]:
Y_pred = model.predict_proba(X)
Y_pred = Y_pred[:,1]
print(Y_pred.shape, Y_pred[25:50])

(458913,) [9.90654206e-01 5.00000000e-01 1.00000000e+00 3.97749028e-03
 1.66666667e-01 3.07503075e-04 3.62647544e-03 4.84404970e-04
 2.59262781e-03 1.00000000e+00 5.22629874e-05 9.00211013e-04
 7.50000000e-01 1.00000000e+00 1.53787027e-03 0.00000000e+00
 0.00000000e+00 1.49700599e-03 9.73774068e-04 2.35992354e-03
 1.00000000e+00 3.90719217e-03 8.57142857e-01 1.11234220e-03
 1.59841370e-03]


In [20]:
"""from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

param_random_gb = {'learning_rate': np.arange(0.05,0.55, 0.1), 'n_estimators' : [125,150,175], 'subsample' : np.arange(0.3,1.0, 0.1), 'max_depth':[3,4,5]}

mse_random = RandomizedSearchCV(estimator = XGBClassifier(), param_distributions = param_random_gb, n_iter = 10,scoring = 'neg_mean_squared_error', cv = 4, verbose = 1)



mse_random.best_params_={'subsample': 0.5, 'n_estimators': 175, 'max_depth': 3, 'learning_rate': 0.15}
mse_random.best_score_ = (0.32263831733224874)**2

mse_random.fit(X,Y)

#Run XGBoost model with the best parameters found
model=XGBClassifier(n_estimators=200,max_depth=3,learning_rate=0.15, subsample=0.5)
model.fit(x_train_split,y_train_split)
#Test the model
y_predict=model.predict(x_test_split)
print('XGBoost Classifier Accuracy: {:.3f}'.format(accuracy_score(y_test_split, y_predict)))"""

"from xgboost import XGBClassifier\nfrom sklearn.model_selection import RandomizedSearchCV\n\nparam_random_gb = {'learning_rate': np.arange(0.05,0.55, 0.1), 'n_estimators' : [125,150,175], 'subsample' : np.arange(0.3,1.0, 0.1), 'max_depth':[3,4,5]}\n\nmse_random = RandomizedSearchCV(estimator = XGBClassifier(), param_distributions = param_random_gb, n_iter = 10,scoring = 'neg_mean_squared_error', cv = 4, verbose = 1)\n\n\n\nmse_random.best_params_={'subsample': 0.5, 'n_estimators': 175, 'max_depth': 3, 'learning_rate': 0.15}\nmse_random.best_score_ = (0.32263831733224874)**2\n\nmse_random.fit(X,Y)\n\n#Run XGBoost model with the best parameters found\nmodel=XGBClassifier(n_estimators=200,max_depth=3,learning_rate=0.15, subsample=0.5)\nmodel.fit(x_train_split,y_train_split)\n#Test the model\ny_predict=model.predict(x_test_split)\nprint('XGBoost Classifier Accuracy: {:.3f}'.format(accuracy_score(y_test_split, y_predict)))"

In [21]:
"""def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
accuracy = evaluate(model, X, Y)

print('Accuracy:{:0.2f}%.'.format( 100 * accuracy))"""

"def evaluate(model, test_features, test_labels):\n    predictions = model.predict(test_features)\n    errors = abs(predictions - test_labels)\n    mape = 100 * np.mean(errors / test_labels)\n    accuracy = 100 - mape\n    print('Model Performance')\n    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))\n    print('Accuracy = {:0.2f}%.'.format(accuracy))\n    \naccuracy = evaluate(model, X, Y)\n\nprint('Accuracy:{:0.2f}%.'.format( 100 * accuracy))"

In [22]:
""""cm = metrics.confusion_matrix(Y, Y_pred)

plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'YlGnBu');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(round(log.score(X, Y),3))
plt.title(all_sample_title, size = 15);

print('Accuracy:',round(metrics.accuracy_score(Y, Y_pred),3))
metrics.roc_curve(Y, Y_pred)
print(metrics.roc_auc_score(Y, Y_pred))"""

'"cm = metrics.confusion_matrix(Y, Y_pred)\n\nplt.figure(figsize=(9,9))\nsns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = \'YlGnBu\');\nplt.ylabel(\'Actual label\');\nplt.xlabel(\'Predicted label\');\nall_sample_title = \'Accuracy Score: {0}\'.format(round(log.score(X, Y),3))\nplt.title(all_sample_title, size = 15);\n\nprint(\'Accuracy:\',round(metrics.accuracy_score(Y, Y_pred),3))\nmetrics.roc_curve(Y, Y_pred)\nprint(metrics.roc_auc_score(Y, Y_pred))'

In [23]:
#free up ram
del df1, df_label
gc.collect()

163

In [24]:
#run prediction on test data

print(len(keep))
#need to only load some columns due to ram limitations
df2=pd.read_parquet('/kaggle/input/amex-parquet/test_data.parquet', columns =keep)
print(df2.shape)

58
(11363762, 58)


In [25]:
df2.head()

Unnamed: 0,D_39,B_2,B_5,D_48,B_6,B_8,D_50,B_10,S_5,S_6,...,D_120,D_126,D_128,D_129,D_130,D_139,D_63,D_64,customer_ID,S_2
0,0.001912,0.814497,0.009517,0.626467,0.174591,1.003925,,-0.002919,0.002771,1.006875,...,,0.0,,,,,CR,,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19
1,0.005275,0.810848,0.026313,0.611682,0.177115,1.009301,,0.031222,0.006497,1.005728,...,,0.0,0.008782,0.005579,0.007598,0.000142,CR,,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25
2,0.003326,1.00462,0.060017,0.622228,1.417059,1.004635,,0.035532,0.003407,0.006408,...,,0.0,0.005602,0.009336,0.003608,7.4e-05,CR,,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25
3,0.009065,0.816549,0.0034,0.61562,0.04462,1.009093,,0.043155,0.007204,0.005841,...,,0.0,0.00869,0.006479,0.006133,0.004743,CR,,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20
4,0.238794,0.810456,0.141991,0.591795,0.039586,1.006634,,0.063187,0.00862,0.002622,...,1.0,0.0,0.005304,0.002027,0.006975,0.008133,CR,U,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15


In [26]:
#Handling missing values via imputation
#df2.iloc[:,:] = my_imputer.fit_transform(df2.iloc[:,:])

In [27]:
#using only most recent transaction from each customer
#Does this make sense for test data??
temp = df2.shape
df2=df2.set_index(['customer_ID'])
df2=df2.ffill()
df2=df2.bfill()
df2=df2.reset_index()

df2=df2.groupby('customer_ID').tail(1)
df2=df2.set_index(['customer_ID'])

#Drop date column since it is no longer useful
df2.drop(['S_2'],axis=1,inplace=True)

print(temp, df2.shape)

#inspecting NaN
print('Columns left with NaN:')
for i in range(len(df2.columns)):
    if (df2.iloc[:,i].isnull().sum()/len(df2) > 0):
        print(df2.columns[i], round(df2.iloc[:,i].isnull().sum()/len(df2),2))

(11363762, 58) (924621, 56)
Columns left with NaN:


In [28]:
#hot ones
df2 = pd.get_dummies(df2)
print(df2.shape)
print(df2.columns)

(924621, 63)
Index(['D_39', 'B_2', 'B_5', 'D_48', 'B_6', 'B_8', 'D_50', 'B_10', 'S_5',
       'S_6', 'B_12', 'R_5', 'D_60', 'D_61', 'D_65', 'B_16', 'B_17', 'B_18',
       'B_20', 'D_68', 'R_6', 'B_21', 'D_69', 'D_71', 'P_4', 'B_24', 'R_7',
       'B_26', 'D_78', 'R_8', 'S_16', 'R_10', 'D_81', 'R_14', 'D_84', 'B_30',
       'R_20', 'S_22', 'S_23', 'S_26', 'D_103', 'R_27', 'B_38', 'D_112',
       'B_40', 'S_27', 'D_114', 'D_117', 'D_120', 'D_126', 'D_128', 'D_129',
       'D_130', 'D_139', 'D_63_CL', 'D_63_CO', 'D_63_CR', 'D_63_XL', 'D_63_XM',
       'D_63_XZ', 'D_64_O', 'D_64_R', 'D_64_U'],
      dtype='object')


In [29]:
df2.head()

Unnamed: 0_level_0,D_39,B_2,B_5,D_48,B_6,B_8,D_50,B_10,S_5,S_6,...,D_139,D_63_CL,D_63_CO,D_63_CR,D_63_XL,D_63_XM,D_63_XZ,D_64_O,D_64_R,D_64_U
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.121385,1.009347,0.050187,0.517214,0.024945,1.00873,0.007209,0.033599,0.011795,0.003473,...,0.005912,0,0,1,0,0,0,0,0,1
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.126475,1.009245,0.135907,0.041712,0.18272,1.00864,0.007209,0.298735,0.009336,0.005253,...,0.004344,0,1,0,0,0,0,1,0,0
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.002724,0.810072,0.004851,0.522954,0.058534,1.009375,0.007209,0.129189,0.004455,0.001847,...,1.001246,0,0,1,0,0,0,0,0,1
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.324828,0.205678,0.022947,0.60252,0.023546,0.009775,0.007209,0.032124,0.320184,0.008872,...,1.008246,1,0,0,0,0,0,0,1,0
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.768016,0.038021,0.011126,0.959607,0.011244,1.0062,0.039149,0.022026,0.064127,0.009401,...,0.006623,0,1,0,0,0,0,0,1,0


In [30]:
X = df2.iloc[:, :].values.reshape(-1, len(df2.columns))

In [31]:
"""X = df2.iloc[:, :].values.reshape(-1, len(df2.columns))

Y_pred2 = log.predict(X)
print(Y_pred2)

df2 = df2.reset_index()

final = pd.DataFrame({"customer_ID":df2.customer_ID,"prediction":Y_pred2})

final.to_csv('submission.csv', index=False)
print(Y_pred2, np.sum(Y_pred2))

#score ended as ~50"""

'X = df2.iloc[:, :].values.reshape(-1, len(df2.columns))\n\nY_pred2 = log.predict(X)\nprint(Y_pred2)\n\ndf2 = df2.reset_index()\n\nfinal = pd.DataFrame({"customer_ID":df2.customer_ID,"prediction":Y_pred2})\n\nfinal.to_csv(\'submission.csv\', index=False)\nprint(Y_pred2, np.sum(Y_pred2))\n\n#score ended as ~50'

In [32]:
Y_pred2 = model.predict_proba(df2)
Y_pred2 = Y_pred2[:,1]
df2 = df2.reset_index()
print(Y_pred2, np.sum(Y_pred2))
final = pd.DataFrame({"customer_ID":df2.customer_ID,"prediction":Y_pred2})
print(final)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


[0.25       0.00109822 0.25134409 ... 0.5        0.41666667 0.        ] 237275.30707351278
                                              customer_ID  prediction
0       00000469ba478561f23a92a868bd366de6f6527a684c9a...    0.250000
1       00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...    0.001098
2       0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...    0.251344
3       00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...    0.400862
4       00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...    0.486111
...                                                   ...         ...
924616  ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...    0.583333
924617  ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...    0.500000
924618  ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...    0.500000
924619  ffffddef1fc3643ea179c93245b68dca0f36941cd83977...    0.416667
924620  fffffa7cf7e453e1acc6a1426475d5cb9400859f82ff61...    0.000000

[924621 rows x 2 columns]


In [33]:
final.to_csv('submission.csv', index=False)
print(final)

                                              customer_ID  prediction
0       00000469ba478561f23a92a868bd366de6f6527a684c9a...    0.250000
1       00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...    0.001098
2       0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...    0.251344
3       00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...    0.400862
4       00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...    0.486111
...                                                   ...         ...
924616  ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...    0.583333
924617  ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...    0.500000
924618  ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...    0.500000
924619  ffffddef1fc3643ea179c93245b68dca0f36941cd83977...    0.416667
924620  fffffa7cf7e453e1acc6a1426475d5cb9400859f82ff61...    0.000000

[924621 rows x 2 columns]
