# Import packages

In [None]:
# import packages 
import tabula
import pdfplumber
import xgboost
import pickle
import matplotlib
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the PDF, reading x and y separately, and combine the first x and y pages into df

In [None]:
# define pdf inputs
# start_index  : index of the first table / label page, used to seed the combined df
# x_pages_end  : number of PDF pages holding the feature tables; also the lower
#                bound of the pdfplumber page slice used for the labels
# y_pages_end  : upper bound (exclusive) of the pdfplumber page slice used for the labels
start_index = 0
x_pages_end = 74
y_pages_end = 148
# page range handed to tabula; derived from x_pages_end so the two cannot drift apart
x_pages = '1-{}'.format(x_pages_end)
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere
pdf_path = '/home/ubuntu/2.AM/ds-challenge-1/data.pdf'
# column names assigned to every feature table parsed from the PDF
x_columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Weight',
    'Shucked Weight',
    'Viscera Weight',
    'Shell Weight',
]

In [None]:
# Load the PDF in two passes: tabula parses the feature tables (x) from the
# pages named by x_pages, and pdfplumber supplies the page objects in the
# slice [x_pages_end, y_pages_end) from which the age labels (y) are read.
x_pdf = tabula.read_pdf(
    pdf_path,
    pages=x_pages,
    encoding='utf-8',
    multiple_tables=True,
)
# NOTE(review): the pdfplumber document opened here is never closed; the page
# objects are still used by later cells, so it cannot be closed at this point.
y_pdf = pdfplumber.open(pdf_path).pages[x_pages_end:y_pages_end]

In [None]:
# Seed the combined frame from the first feature table and its label page.
# The first label page drops two leading and two trailing text tokens
# (later pages drop only one leading token) — presumably an extra header
# token on the first page; confirm against the PDF.
age = y_pdf[start_index].extract_text().split()[2:-2]

df = x_pdf[start_index].copy()
df.columns = x_columns
df['Age'] = age

# Format checking for the remaining pages

In [None]:
# Classify every feature table after the first one (index 0 is already in df):
#   correct_pages   - tables with no missing cells whose row count matches the labels
#   incorrect_pages - tables containing at least one missing cell (repaired later)
#   mis_match_pages - tables whose row count differs from the label count
correct_pages = []
incorrect_pages = []
mis_match_pages = []

# x_pages_end is the number of feature pages, so this visits tables 1..x_pages_end-1
for i in range(1, x_pages_end):
    # a table is well formed when no column contains a null
    if x_pdf[i].isnull().any().sum() == 0:
        correct_pages.append(i)
    else:
        incorrect_pages.append(i)

    # check that y_pdf has the same length as x_pdf for this page; three text
    # tokens per label page are not labels (the later `[1: -2]` slice drops them)
    len_x = x_pdf[i].shape[0]
    len_y = len(y_pdf[i].extract_text().split()) - 3
    if len_x != len_y:
        mis_match_pages.append(i)

# delete mis-match pages from correct_pages, keeping ascending page order
# explicitly (a set difference does not guarantee any ordering)
mis_match_lookup = set(mis_match_pages)
correct_pages = [page for page in correct_pages if page not in mis_match_lookup]

# Adjust incorrect pages

In [None]:
# Page 17: columns 2 and 3 are merged into column 2 by adding them after
# replacing NaN with 0 (presumably each row has its value in only one of the
# two columns — confirm), then the spare column is removed and the remaining
# columns are relabelled 0..7.
page_17 = x_pdf[17]
page_17[2] = page_17[2].fillna(0).astype(float) + page_17[3].fillna(0).astype(float)
del page_17[3]
page_17.columns = list(range(8))

# Pages 30, 46, 50, 58: a single row holds three whitespace-separated values
# in column 2; copy the third value into column 3 of that row.
for page_no, row_label in ((30, 8), (46, 50), (50, 8), (58, 33)):
    x_pdf[page_no].at[row_label, 3] = x_pdf[page_no][2][row_label].split()[2]

def _split_merged_column(page, left, right):
    """Split a column whose cells hold two whitespace-separated values.

    The first token of every cell in ``page[left]`` is written back to
    ``page[left]`` and the second token to ``page[right]``. Any further
    tokens in a cell are ignored. ``page`` is modified in place.

    Raises IndexError if a cell has fewer than two tokens, and
    AttributeError if a cell is not a string.
    """
    tokens = page[left].map(lambda cell: cell.split())
    page[left] = [parts[0] for parts in tokens]
    page[right] = [parts[1] for parts in tokens]


# deal with 2, 3 columns mixed
mis_23 = [9, 14]
for i in mis_23:
    _split_merged_column(x_pdf[i], 2, 3)

# deal with 3, 4 columns mixed
mis_34 = [8, 69]
for i in mis_34:
    _split_merged_column(x_pdf[i], 3, 4)

# deal with 2, 3, 4 columns mixed
mis_234 = [30, 50, 58]
for i in mis_234:
    # column 3 currently holds the values that belong in column 4; move them
    # first, because the split below overwrites column 3
    x_pdf[i][4] = x_pdf[i][3]
    _split_merged_column(x_pdf[i], 2, 3)

# deal with 3, 4 and 6, 7 columns mixed
mis_34_67 = [12, 67]
for i in mis_34_67:
    _split_merged_column(x_pdf[i], 3, 4)
    _split_merged_column(x_pdf[i], 6, 7)

# deal with 2, 4 and 6, 7 columns mixed
mis_24_67 = [46]
for i in mis_24_67:
    # same shift as above: preserve column 3 in column 4 before splitting
    x_pdf[i][4] = x_pdf[i][3]
    _split_merged_column(x_pdf[i], 2, 3)
    _split_merged_column(x_pdf[i], 6, 7)

# deal with page 51 (3, 4 and 5, 6 columns mixed)
mis_34_56 = [51]
for i in mis_34_56:
    _split_merged_column(x_pdf[i], 3, 4)
    _split_merged_column(x_pdf[i], 5, 6)

# Row 4 of page 51 is flagged with the sentinel -999 in columns 4, 6 and 7.
# Column 4 becomes 'Weight' once x_columns is applied, and the outlier filter
# later drops rows whose Weight equals -999.
x_pdf[51].at[4, 4] = -999
x_pdf[51].at[4, 6] = -999
x_pdf[51].at[4, 7] = -999

# Pages 20 and 45: remove one row each (labels 43 and 45 respectively) and
# renumber the remaining rows from 0.
for page_no, bad_row in ((20, 43), (45, 45)):
    x_pdf[page_no] = x_pdf[page_no].drop([bad_row]).reset_index(drop=True)

# Append correct pages to df

In [None]:
# Combine x and y for every correct page, then add them to df in one step.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copies the whole frame on every call; collecting the pages and calling
# pd.concat once gives the same result on every pandas version.
correct_frames = []
for i in correct_pages:
    sub_df = x_pdf[i].copy()
    sub_df.columns = x_columns
    # drop one leading and two trailing text tokens that are not labels
    age = y_pdf[i].extract_text().split()[1: -2]
    sub_df['Age'] = age
    correct_frames.append(sub_df)

df = pd.concat([df] + correct_frames, ignore_index=True)

# Append adjusted incorrect pages to df

In [None]:
# Combine x and y for every repaired (previously incorrect) page, then add
# them to df in one step. DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, and it copies the whole frame on every call; collecting the
# pages and calling pd.concat once gives the same result on every version.
repaired_frames = []
for i in incorrect_pages:
    sub_df = x_pdf[i].copy()
    sub_df.columns = x_columns
    # drop one leading and two trailing text tokens that are not labels
    age = y_pdf[i].extract_text().split()[1: -2]
    sub_df['Age'] = age
    repaired_frames.append(sub_df)

df = pd.concat([df] + repaired_frames, ignore_index=True)

# Data cleansing and feature engineering

In [None]:
# Cast the parsed columns to numeric dtypes and one-hot encode 'Sex'.
# The seven measurement columns become float and the age label becomes int.
float_cols = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']
df = df.astype({col: float for col in float_cols})
df = df.astype({'Age': int})

# Dummy columns (prefixed 'Sex_') are appended on the right and the original
# 'Sex' column is removed, so the remaining column order is unchanged.
sex_dum = pd.get_dummies(df['Sex'], prefix='Sex')
df = pd.concat([df.drop('Sex', axis=1), sex_dum], axis=1)

In [None]:
# Weight Error: total weight minus the shell, shucked and viscera weights
# (subtracted in that order).
df['Weight Error'] = (
    df['Weight']
    .sub(df['Shell Weight'])
    .sub(df['Shucked Weight'])
    .sub(df['Viscera Weight'])
)

# Interaction features are named '<left>_x_<right>'.
binary_cols = ['Sex_F', 'Sex_I', 'Sex_M']
continuous_cols = ['Length', 'Diameter', 'Height', 'Weight', 'Shucked Weight', 'Viscera Weight', 'Shell Weight']

# every sex dummy times every continuous measurement
for flag in binary_cols:
    for measure in continuous_cols:
        df['{}_x_{}'.format(flag, measure)] = df[flag] * df[measure]

# every ordered pair of distinct continuous measurements; both 'A_x_B' and
# 'B_x_A' are created, so each product appears twice under different names
for left in continuous_cols:
    for right in continuous_cols:
        if left == right:
            continue
        df['{}_x_{}'.format(left, right)] = df[left] * df[right]

In [None]:
# Keep only plausible rows, then replace any remaining missing values with 0:
#   - Height strictly between 0.01 and 1
#   - Weight not equal to the -999 sentinel
#   - shucked and shell weight each strictly below the total weight
height = df['Height']
total_weight = df['Weight']
keep_rows = (
    (height < 1)
    & (height > 0.01)
    & (total_weight != -999)
    & (df['Shucked Weight'] < total_weight)
    & (df['Shell Weight'] < total_weight)
)
df = df[keep_rows].fillna(0)

# Train test split

In [None]:
# Target is the 'Age' column; features are every other column of df, in order.
# 30% of the rows are held out for testing, with a fixed random_state.
X = df.drop('Age', axis=1).values
Y = df['Age'].values

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=32,
)

# Modeling and performance

In [None]:
# Four regressors are trained on the same training split; their predictions
# are averaged in a later cell.

# BayesianRidge
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version before upgrading.
BR = linear_model.BayesianRidge(n_iter=500)

# Ridge and Lasso pick their penalty from the same grid with 5-fold CV
cv_alphas = [1, 0.1, 0.01, 0.001]
Ridge = linear_model.RidgeCV(alphas=list(cv_alphas), cv=5)
Lasso = linear_model.LassoCV(alphas=list(cv_alphas), cv=5)

# XGBoost
xgb_params = {
    'colsample_bytree': 0.3,
    'eval_metric': 'rmse',
    'gamma': 0,
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_child_weight': 1,
    'n_estimators': 30,
    'reg_alpha': 0.1,
    'reg_lambda': 0.6,
    'subsample': 0.5,
    'seed': 32,
}
XGB = xgboost.XGBRegressor(**xgb_params)

# fit models (same order as they were defined)
for model in (BR, Ridge, Lasso, XGB):
    model.fit(X_train, Y_train)

In [None]:
def _average_and_round(br_pred, lasso_pred, xgb_pred, ridge_pred):
    """Return the four models' predictions averaged and rounded to whole numbers."""
    return np.round((br_pred + lasso_pred + xgb_pred + ridge_pred) / 4)


# per-model predictions on the training and test splits
BR_train_pred, BR_test_pred = BR.predict(X_train), BR.predict(X_test)
Ridge_train_pred, Ridge_test_pred = Ridge.predict(X_train), Ridge.predict(X_test)
Lasso_train_pred, Lasso_test_pred = Lasso.predict(X_train), Lasso.predict(X_test)
XGB_train_pred, XGB_test_pred = XGB.predict(X_train), XGB.predict(X_test)

# ensemble: unweighted mean of the four models, rounded
train_pred = _average_and_round(BR_train_pred, Lasso_train_pred, XGB_train_pred, Ridge_train_pred)
test_pred = _average_and_round(BR_test_pred, Lasso_test_pred, XGB_test_pred, Ridge_test_pred)

In [None]:
# Ensemble quality on both splits: MSE, RMSE (square root of the MSE),
# MAE and R^2.
train_mean_squared_error = mean_squared_error(Y_train, train_pred)
test_mean_squared_error = mean_squared_error(Y_test, test_pred)

train_rmse = np.sqrt(train_mean_squared_error)
test_rmse = np.sqrt(test_mean_squared_error)

train_mean_absolute_error = mean_absolute_error(Y_train, train_pred)
test_mean_absolute_error = mean_absolute_error(Y_test, test_pred)

train_r2 = r2_score(Y_train, train_pred)
test_r2 = r2_score(Y_test, test_pred)

# Check residuals

In [None]:
# Score every record (training and test rows together) with each model.
BR_pred, Ridge_pred, Lasso_pred, XGB_pred = (
    model.predict(X) for model in (BR, Ridge, Lasso, XGB)
)

# NOTE: `train_pred` is reused here and now holds the rounded ensemble
# prediction for ALL rows, not only the training split.
train_pred = np.round((BR_pred + Lasso_pred + XGB_pred + Ridge_pred) / 4)

# residual = actual age minus ensemble prediction
df['pred'] = train_pred
df['error'] = df['Age'] - df['pred']

# optional residual plots:
#df.plot.scatter(x = 'Height', y = 'error')
#df.plot.scatter(x = 'Weight', y = 'error')

# Save models

In [None]:
# save the four fitted models to one pickle file, keyed by short model name
regs = {'BR': BR, 'Ridge': Ridge, 'Lasso': Lasso, 'XGB': XGB}
# the context manager guarantees the file is flushed and closed even if
# pickling fails part-way
with open('/home/ubuntu/2.AM/regs.pkl', 'wb') as model_file:
    pickle.dump(regs, model_file)