# Data Analysis 
## Link to data: https://www.kaggle.com/fedesoriano/company-bankruptcy-prediction

In [None]:
# Import packages
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from pprint import pprint
from pickle import dump
from random import sample as r_sample

# Pandas DataFrame print formatting: show every row and column, wrap output
# at 75 characters, centre the column headers and print 3 decimal places.
display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 75,
    'display.colheader_justify': 'center',
    'display.precision': 3,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Plotting packages
import matplotlib.pyplot as plt
import seaborn as sns

# Shared figure defaults: 10x5 inch canvas, then the 'fivethirtyeight' style sheet.
plt.rcParams.update({'figure.figsize': (10, 5)})
plt.style.use('fivethirtyeight')

## Load data

In [None]:
# Read the full dataset from the CSV file in the working directory.
raw_data = pd.read_csv(filepath_or_buffer='raw_data.csv')

## View and explore data

In [None]:
# Pretty-print every column name of the raw frame as a plain Python list.
pprint(raw_data.columns.tolist())

### Check for nan values

In [None]:
# True if at least one cell anywhere in the frame is missing.
raw_data.isna().to_numpy().any()

### Check max values

In [None]:
# Largest value of each column (reduction runs down the rows).
raw_data.max(axis=0)

### Check min values

In [None]:
# Smallest value of each column (reduction runs down the rows).
raw_data.min(axis=0)

### Plot hist of target values of bankruptcy

In [None]:
# Bar chart of how many rows fall into each value of the "Bankrupt?" target column.
dist_y_plot = raw_data["Bankrupt?"].value_counts().plot(kind='bar',rot=0)
plt.title("Spread of target values")
plt.ylabel("Count")
plt.xlabel("0=no and 1=yes")
# plt.show() renders the current figure; it does not take the Axes as an
# argument (its only parameter is the keyword `block`).
plt.show()

## Split into Train and Test

In [None]:
# Keep the original column labels; they are reused when the splits are saved.
col_names = raw_data.columns

# Column 0 is taken as the target, every remaining column as a feature.
target_part, feature_part = raw_data.iloc[:, 0], raw_data.iloc[:, 1:]
y = pd.DataFrame(target_part)
X = pd.DataFrame(feature_part)

print("X shape: {}".format(X.shape))
print("y shape: {}".format(y.shape))

In [None]:
# Preview the first five feature rows.
X.head(n=5)

In [None]:
# Preview the first five target rows.
y.head(n=5)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified - if the positive class is rare,
# consider passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=2)
print("Number of train samples: {}".format(len(y_train)))
print("Number of test samples: {}".format(len(y_test)))
print("X_train shape: {}".format(X_train.shape))
print("y_train shape: {}".format(y_train.shape))
print("X_test shape: {}".format(X_test.shape))
# This line reports the test *target* shape, so it is labelled y_test.
print("y_test shape: {}".format(y_test.shape))

## Scale input data 

In [None]:
# NOTE: this cell overwrites X_train / X_test with their scaled versions.
# Re-running it without first re-running the split cell would fit the scaler
# on already-scaled data and save the wrong scaler to disk.

# Declare scaler object
scaler = MinMaxScaler()

# Fit on the training split only, so no test-set statistics leak into the scaling
scaler.fit(X_train)

# Scale train
X_train = scaler.transform(X_train)

# Scale test with the ranges learned from train
X_test = scaler.transform(X_test)

# Save the fitted scaler; the context manager guarantees the file is flushed and closed
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

# To load the scaler again later (only unpickle files you trust):
# from pickle import load
# with open('scaler.pkl', 'rb') as scaler_file:
#     scaler = load(scaler_file)

## Balance train data 

### Initial test for sanity

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Baseline check: fit a default logistic regression on the current training
# split, using the single target column as a 1-D array.
lr = LogisticRegression().fit(X_train, y_train.values[:,0])

# Predict on the held-out split and summarise per-class precision / recall / F1.
predictions = lr.predict(X_test)
print(classification_report(y_test.values[:,0], predictions))

## Apply SMOTE to balance data

In [None]:
from imblearn.over_sampling import SMOTE

# Flatten the target to a 1-D array. np.asarray accepts both the single-column
# DataFrame produced by the split and the ndarray this cell assigns back to
# y_train, so the cell no longer fails with AttributeError when it is re-run.
y_train_flat = np.asarray(y_train).ravel()

# Counting on the 1-D array prints plain integers instead of one-element arrays.
print("Before OverSampling, counts of label '1': {}".format(int(np.sum(y_train_flat == 1))))
print("Before OverSampling, counts of label '0': {} \n".format(int(np.sum(y_train_flat == 0))))

# Only the training split is oversampled; the test split is left untouched.
# random_state is fixed (same seed value as the train/test split) so the
# synthetic minority samples are identical on every run.
oversample = SMOTE(random_state=2)
X_train, y_train = oversample.fit_resample(X_train, y_train_flat)

print('After OverSampling, the shape of train_X: {}'.format(X_train.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train.shape))

print("After OverSampling, counts of label '1': {}".format(int(np.sum(y_train == 1))))
print("After OverSampling, counts of label '0': {}".format(int(np.sum(y_train == 0))))

### Post-balancing test for sanity

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Refit a default logistic regression on the current training data
# (y_train is passed as-is here).
lr = LogisticRegression().fit(X_train, y_train)

# Evaluate on the test split and print the per-class report.
predictions = lr.predict(X_test)
print(classification_report(y_test.values[:,0], predictions))

## Save new data 

In [None]:
# Write the four splits to CSV without the index column, restoring the
# original column labels: all but the first for features, the first for the target.
feature_names = col_names[1:]
target_name = [col_names[0]]

outputs = (
    ("X_train.csv", X_train, feature_names),
    ("X_test.csv", X_test, feature_names),
    ("y_train.csv", y_train, target_name),
    ("y_test.csv", y_test, target_name),
)
for file_name, values, names in outputs:
    pd.DataFrame(values, columns=names).to_csv(file_name, index=False)