In [6]:
import pandas as pd
import seaborn as sb
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression

In [13]:
# Loans to score: keep every column except `status`. Index.difference returns
# the remaining column names sorted by name.
df = pd.read_csv('loan_test.csv', sep=';')
df = df.loc[:, df.columns.difference(['status'])]

# Labelled loans: split the `status` target from the remaining columns, which
# are selected the same way as for the frame above.
df_train = pd.read_csv('loan_train.csv', sep=';')
train_status = df_train['status']
train_data = df_train.loc[:, df_train.columns.difference(['status'])]

In [14]:
# Preview the first five rows of the frame that will be scored.
df.head(5)

Unnamed: 0,account_id,amount,date,duration,loan_id,payments
0,4473,93960,970103,60,5895,1566
1,10365,260640,970104,36,7122,7240
2,5724,232560,970108,48,6173,4845
3,5591,221880,970121,60,6142,3698
4,2018,38520,970121,12,5358,3210


In [15]:
# Hyper-parameters for the classifier, gathered in one place.
logreg_params = {
    'solver': 'liblinear',
    'max_iter': 1000,
    'class_weight': 'balanced',
    'C': 0.25,
}

# fit() returns the estimator itself, so construction and training can be chained.
logreg = LogisticRegression(**logreg_params).fit(train_data, train_status)

# Predict a status label for every row of the test frame and show the raw array.
predictions = logreg.predict(df)

print(predictions)

[ 1  1  1  1  1  1  1 -1  1  1  1 -1 -1  1 -1 -1 -1  1 -1  1  1  1  1  1
  1  1  1  1 -1  1  1  1  1  1  1  1  1 -1  1  1  1  1 -1  1 -1  1 -1  1
  1  1 -1  1 -1 -1  1  1  1 -1  1  1 -1 -1  1  1  1  1  1  1 -1  1  1  1
  1  1  1  1 -1  1 -1  1 -1 -1  1  1 -1  1  1  1  1  1  1  1  1  1 -1 -1
 -1 -1  1 -1 -1  1  1  1  1 -1  1  1 -1  1  1  1  1  1  1 -1  1  1  1 -1
  1  1 -1 -1 -1 -1 -1  1  1  1 -1 -1  1 -1  1 -1  1 -1 -1 -1  1  1  1 -1
  1  1  1 -1  1  1 -1  1  1  1 -1  1  1  1  1  1 -1 -1  1 -1  1  1 -1  1
 -1 -1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1
  1  1  1  1  1  1  1  1 -1  1  1 -1  1  1 -1  1  1 -1  1  1 -1 -1  1  1
  1  1  1  1  1  1  1 -1 -1 -1 -1  1  1  1  1  1  1  1  1  1  1 -1  1  1
  1  1 -1  1  1  1  1 -1  1  1  1  1  1  1  1  1  1  1 -1  1  1  1  1  1
 -1  1  1  1 -1 -1 -1  1  1  1  1  1 -1 -1  1 -1  1  1  1  1  1  1  1  1
  1 -1  1  1  1  1  1 -1  1  1  1  1 -1 -1 -1  1  1 -1  1 -1  1  1  1 -1
 -1 -1 -1  1  1 -1 -1  1  1  1 -1  1  1 -1  1 -1  1

In [24]:
# Build the submission table: one row per test loan, pairing its loan_id with
# the corresponding entry of `predictions`. (The earlier assignment of
# df['account_id'] to df_final was overwritten immediately and has been removed.)
df_final = pd.DataFrame({"Id" : df['loan_id'], "Predicted" : predictions})

# Write the table without the pandas index so the file has only Id and Predicted.
df_final.to_csv('test.csv', index=False)
print(type(predictions))

<class 'numpy.ndarray'>


# Data Preprocessing

## Cleaning data

### Accounts details data

In [78]:
# Raw account table (semicolon-separated).
df_accounts = pd.read_csv('datasets/account.csv', sep=';')

# List the distinct statement-frequency labels before encoding them.
frequency_labels = df_accounts['frequency'].unique()
print('Frequency unique values:', frequency_labels)

Frequency unique values: ['monthly issuance' 'issuance after transaction' 'weekly issuance']


In [79]:
# Replace the frequency labels with integer codes:
# 'issuance after transaction' => 0, 'monthly issuance' => 1, 'weekly issuance' => 2
le = LabelEncoder()
le.fit(df_accounts['frequency'])
df_accounts['frequency'] = le.transform(df_accounts['frequency'])

In [80]:
# Convert the account `date` column to real datetimes: prefix the century to
# get an eight-character YYYYMMDD string, then parse it with an explicit format.
account_date_text = '19' + df_accounts['date'].astype('str')
df_accounts['date'] = pd.to_datetime(account_date_text, format="%Y%m%d")

# Persist the cleaned table (without the pandas index) and preview it.
df_accounts.to_csv('clean_datasets/account.csv', index=False)
df_accounts.head()

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,1,1993-01-01
1,3818,74,1,1993-01-01
2,704,55,1,1993-01-01
3,2378,16,1,1993-01-01
4,2632,24,1,1993-01-02


### Client details

**birth_number**
- YYMMDD format for men
- YY(MM+50)DD format for women, i.e. 50 is added to the month (e.g. `706213` is a woman born on 1970-12-13)

In [81]:
# Raw client table (semicolon-separated); preview the first rows.
clients_csv = 'datasets/client.csv'
df_clients = pd.read_csv(clients_csv, sep=';')
df_clients.head()

Unnamed: 0,client_id,birth_number,district_id
0,1,706213,18
1,2,450204,1
2,3,406009,1
3,4,561201,5
4,5,605703,5


#### Create column for client's sex and reformat birth date 

**sex**
- 0 - Woman
- 1 - Man

In [82]:
# `birth_number` packs the birth date as YYMMDD; for women the month field is
# stored as MM + 50, so a month field above 50 identifies a woman.
# Integer division keeps the month field exact (no float arithmetic).
birth_number = df_clients['birth_number']
is_woman = (birth_number // 100 % 100) > 50

# sex: 0 - Woman, 1 - Man
df_clients['sex'] = (~is_woman).astype('int64')

# Remove the +50 month offset for women, then zero-pad back to six digits so a
# two-digit year below 10 keeps its leading zero. Both sexes are parsed with
# the same explicit format instead of relying on format inference.
# NOTE(review): the '19' prefix assumes every client was born in the 1900s.
yymmdd = birth_number.where(~is_woman, birth_number - 5000).astype(str).str.zfill(6)
df_clients['birth_date'] = pd.to_datetime('19' + yymmdd, format="%Y%m%d")

# The encoded column is no longer needed once sex and birth_date exist.
df_clients = df_clients.drop(columns=['birth_number'])
df_clients.to_csv('clean_datasets/client.csv', index=False)
df_clients.head()

Unnamed: 0,client_id,district_id,sex,birth_date
0,1,18,0,1970-12-13
1,2,1,1,1945-02-04
2,3,1,0,1940-10-09
3,4,5,1,1956-12-01
4,5,5,0,1960-07-03


### Disposition data

In [91]:
# Raw disposition table (semicolon-separated); preview the first rows.
disp_csv = 'datasets/disp.csv'
df_disp = pd.read_csv(disp_csv, sep=';')
df_disp.head()

Unnamed: 0,disp_id,client_id,account_id,type
0,1,1,1,OWNER
1,2,2,2,OWNER
2,3,3,2,DISPONENT
3,4,4,3,OWNER
4,5,5,3,DISPONENT


In [86]:
# List the distinct disposition types before encoding them.
disp_types = df_disp['type'].unique()
print('All unique Disposition types: ', disp_types)

All unique Disposition types:  ['OWNER' 'DISPONENT']


#### Label encoding 'type' column

- 0 - OWNER
- 1 - DISPONENT

In [92]:
def _encode_disposition(kind):
    """Return 0 for "OWNER" and 1 for any other value."""
    if kind == "OWNER":
        return 0
    return 1


# Replace the textual disposition type with its integer code, element by element.
df_disp['type'] = df_disp['type'].map(_encode_disposition)
df_disp.head()

OWNER


Unnamed: 0,disp_id,client_id,account_id,type
0,1,1,1,0
1,2,2,2,0
2,3,3,2,1
3,4,4,3,0
4,5,5,3,1
