In [19]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn import metrics
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and echo the full path of every file found.
for found_path in (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
):
    print(found_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-parquet/test_data.parquet
/kaggle/input/amex-parquet/train_data.parquet
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


In [None]:
# Load the training statements (parquet) and the training labels (csv).
df1 = pd.read_parquet('/kaggle/input/amex-parquet/train_data.parquet')
df_label = pd.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv')

# A bare expression is only rendered when it is the last line of a cell, so the
# shapes are printed explicitly; the label preview stays as the cell's output.
print('train:', df1.shape, 'labels:', df_label.shape)
df_label.head()

In [None]:
# Preview the first five rows of the training frame.
df1.head(n=5)

In [None]:
# Tally how many columns use each dtype, then list the object-typed columns.
dtype_tally = df1.dtypes.value_counts()
print(dtype_tally)
df1.select_dtypes(include=['object']).columns.tolist()

In [None]:
#visualizing columns

#hist = df1.hist(bins=10, figsize = (40,200), layout=(-1,4) )

In [None]:
# Columns whose share of missing values reaches this level are reported and dropped.
NAN_DROP_THRESHOLD = 0.1

print(df1.shape[1])

# Missing-value share per column, computed once and one column at a time so that
# no full-size boolean copy of the frame is materialised.
nan_share = pd.Series(
    {column: df1[column].isnull().mean() for column in df1.columns},
    dtype=float,
)

# The same mask drives both the report and the drop, so every dropped column is
# also listed (previously the report used `>` while the drop used `>=`).
sparse_share = nan_share[nan_share >= NAN_DROP_THRESHOLD]
for column, share in sparse_share.items():
    print(column, round(share, 2))

#drop columns with high freq of NaN
columns_to_drop = list(sparse_share.index)
df1 = df1.drop(columns=columns_to_drop)
print(df1.shape[1])

In [None]:
#using only most recent transaction from each customer
print(df1.shape)

# GroupBy.last() returns, for every column, the last non-null value inside each
# customer's own rows and indexes the result by customer_ID. That is the
# "forward-fill, then keep the final row" idea, but it never carries a value
# across a customer boundary the way a frame-wide ffill()/bfill() does.
# A column a customer never reported stays NaN here.
# sort=False keeps customers in order of first appearance.
# NOTE(review): like the previous tail(1), this assumes each customer's rows are
# already in chronological order - confirm against the parquet file.
df1 = df1.groupby('customer_ID', sort=False).last()

#Drop date column since it is no longer useful
df1 = df1.drop(columns=['S_2'])

print(df1.shape)
df1.head()

In [None]:
# Absolute pairwise correlation above which the later column of a pair is dropped.
CORR_DROP_THRESHOLD = 0.76

print(df1.shape)

# Create correlation matrix over the numeric columns only; string columns cannot
# be correlated and make DataFrame.corr() raise on recent pandas versions.
corr_matrix = df1.select_dtypes(include='number').corr().abs()

# Select upper triangle of correlation matrix (k=1 excludes the diagonal), so
# each pair is looked at once. The builtin `bool` is used because the `np.bool`
# alias no longer exists in current NumPy releases.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Find features whose correlation with an earlier column exceeds the threshold
to_drop = [
    column for column in upper.columns
    if (upper[column] > CORR_DROP_THRESHOLD).any()
]

# Drop features w/ high correl
df1 = df1.drop(columns=to_drop)

print(df1.shape)
df1.head()

In [None]:
#Removing low variance columns in interest of ram
from sklearn.feature_selection import VarianceThreshold
from itertools import compress

# D_63 and D_64 are set aside before the variance filter and re-attached after it.
held_out_cols = ['D_63', 'D_64']
temp = df1.drop(columns=held_out_cols)

# Fit the variance filter on the remaining columns
vt = VarianceThreshold(threshold=0.1)
vt.fit(temp)

# Column names that passed the filter, followed by the two held-out columns
keep = temp.columns[vt.get_support()].tolist()
keep.extend(held_out_cols)

df1 = df1[keep]
len(keep)


In [None]:
#df1 = df1.drop('target')

# Snapshot the surviving feature names as a plain list: `keep` is later handed to
# pd.read_parquet(columns=...), which is documented to take a list.
keep = list(df1.columns)

# Preview goes last so the notebook actually renders it.
df1.head()

In [None]:
#removing outliers
##print(df1.shape)

#df1 = df1[df1['R_6'] < df1['R_6'].quantile(0.97)]
#print(df1['R_6'].max())
#print(df1.shape)

In [None]:
#df1.iloc[:100000,7].value_counts()
#print(df1.iloc[:,1].head())


#What type of variable for dates
#df1['S_2'] = pd.to_datetime(df1['S_2'])
#df1['S_2'] = pd.to_numeric(df1['S_2'])

#normalizing
#df1['S_2'] = (df1['S_2']-df1['S_2'].min())/(df1['S_2'].max() - df1['S_2'].min())
#print(df1['S_2'].head())

#df1['S_2'] = pd.to_timedelta(df1['S_2'])
#print(df1.iloc[:,1].dt.total_seconds())

In [None]:
# One-hot encode the training frame with pandas' default get_dummies settings
df1 = pd.get_dummies(data=df1)

In [None]:
# Fill remaining gaps with SimpleImputer's default strategy (column mean) and
# write the result back into the frame in place.
my_imputer = SimpleImputer(strategy='mean')
imputed_values = my_imputer.fit_transform(df1.iloc[:, :])
df1.iloc[:, :] = imputed_values

In [None]:
# Feature matrix: every column of the prepared training frame.
X = df1.values

# Match labels to feature rows by customer_ID (df1's index) instead of by row
# position, so a reordered or shorter frame cannot be paired with the wrong
# targets without anyone noticing.
# NOTE(review): assumes train_labels.csv has a 'customer_ID' column in the same
# encoding as df1's index, followed by the target column - confirm.
labels_by_customer = df_label.set_index('customer_ID').iloc[:, 0]
Y = labels_by_customer.reindex(df1.index)
missing_labels = int(Y.isnull().sum())
if missing_labels:
    raise ValueError(
        '{0} customers in the training frame have no label in train_labels.csv'
        .format(missing_labels)
    )
# scikit-learn expects a 1-D target; a column vector triggers a conversion warning.
Y = Y.to_numpy()

# create object for the class
log = LogisticRegression()
log.fit(X, Y)
Y_pred = log.predict(X)

# NOTE: predictions and score below are computed on the training rows themselves.
print(Y_pred, np.sum(Y_pred))
print(log.score(X, Y))




In [None]:
# Confusion matrix and accuracy of the hard 0/1 predictions.
cm = metrics.confusion_matrix(Y, Y_pred)
accuracy = metrics.accuracy_score(Y, Y_pred)

plt.figure(figsize=(9,9))
# fmt="d": the cells are integer counts.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'YlGnBu');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Accuracy Score: {0}'.format(round(accuracy, 3))
plt.title(all_sample_title, size = 15);

print('Accuracy:', round(accuracy, 3))

# ROC / AUC are ranking metrics and need a continuous score: use the predicted
# probability of the second class (log.classes_[1]) rather than the thresholded
# labels, which would reduce the curve to a single operating point.
Y_score = log.predict_proba(X)[:, 1]
print(metrics.roc_auc_score(Y, Y_score))

# Plot the ROC curve instead of discarding the roc_curve() result.
fpr, tpr, _roc_thresholds = metrics.roc_curve(Y, Y_score)
plt.figure(figsize=(6,6))
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False positive rate');
plt.ylabel('True positive rate');
plt.title('ROC curve (training data)', size = 15);

In [None]:
# Release the confusion matrix, the labels and the training frame before the
# test set is loaded, then ask the garbage collector to reclaim the memory.
del cm
del df_label
del df1
gc.collect()

In [None]:
#run prediction on test data

print(len(keep))
#need to only load some columns due to ram limitations
# customer_ID is loaded as well so the test statements can be reduced to one row
# per customer, which is the unit the model was trained on. `keep` is converted
# to a plain list because read_parquet documents `columns` as a list.
test_columns = ['customer_ID'] + [column for column in keep if column != 'customer_ID']
df2 = pd.read_parquet('/kaggle/input/amex-parquet/test_data.parquet', columns=test_columns)

# Per customer, keep the last non-null value of every column (index: customer_ID).
# NOTE(review): assumes each customer's rows are in chronological order - confirm.
df2 = df2.groupby('customer_ID', sort=False).last()
print(df2.shape)

# One-hot encode
df2 = pd.get_dummies(df2)

# get_dummies only creates columns for categories present in *this* frame, so
# the test layout can differ from the training one. Re-align to the columns the
# training imputer was fitted on: missing dummies become all-zero columns and
# unseen ones are dropped.
train_columns = getattr(my_imputer, 'feature_names_in_', None)
if train_columns is not None:
    df2 = df2.reindex(columns=train_columns, fill_value=0)
elif df2.shape[1] != len(my_imputer.statistics_):
    raise ValueError(
        'test frame has {0} columns but the model was trained on {1}'
        .format(df2.shape[1], len(my_imputer.statistics_))
    )

In [None]:
#Handling missing values
# Fill test gaps with the statistics learned from the *training* data. Calling
# fit_transform here would re-fit the imputer on the test set, and doing so in
# hard-coded column slices breaks as soon as the column count changes.
train_columns = getattr(my_imputer, 'feature_names_in_', None)
if train_columns is None:
    # No column names were recorded at fit time: fall back to positional matching.
    if len(my_imputer.statistics_) != df2.shape[1]:
        raise ValueError(
            'test frame has {0} columns but the imputer was fitted on {1}'
            .format(df2.shape[1], len(my_imputer.statistics_))
        )
    train_columns = df2.columns
train_fill = pd.Series(my_imputer.statistics_, index=train_columns)

# One column at a time keeps peak memory low on the large test frame.
for column in df2.columns:
    if not df2[column].isnull().any():
        continue
    if column not in train_fill.index:
        raise KeyError(
            'column {0!r} has missing values but was not seen when the imputer was fitted'
            .format(column)
        )
    df2[column] = df2[column].fillna(train_fill[column])

In [None]:
# Score the prepared test frame with the fitted model and show the predicted
# labels together with how many of them are positive.
X = df2.to_numpy()
Y_pred2 = log.predict(X)

positive_count = Y_pred2.sum()
print(Y_pred2, positive_count)