# Point Colaboratory to Google Drive

In [1]:
# Mount Google Drive so that the notebook can read and write persistent files.
# `datapath` is the Drive folder in which the dataset, the models and all other
# files required for training and prediction are saved.
from google.colab import drive
drive.mount('/content/gdrive')
datapath = '/content/gdrive/My Drive/TESI/' 

Mounted at /content/gdrive


# Import

In [2]:
# Install a library not available in Colab
!pip install stop_words

Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32912 sha256=b9debe6109fc6f7ed69b284c7423e7401edde3da582a7bd77a26c0f935a6d645
  Stored in directory: /root/.cache/pip/wheels/fb/86/b2/277b10b1ce9f73ce15059bf6975d4547cc4ec3feeb651978e9
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import tqdm.notebook as tq
import time
from operator import itemgetter
import random
import datetime 
from pandas.api.types import is_numeric_dtype # used for one-hot encoding
import math
import tqdm.notebook as tq
import itertools

# Text mining
from stop_words import get_stop_words
from string import punctuation 
import re
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from wordcloud import WordCloud
from scipy import sparse
from scipy.sparse import hstack, vstack

# Sklearn utilities
# from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import KFold, ParameterGrid
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score

from collections import Counter
from imblearn.over_sampling import RandomOverSampler 
from imblearn.under_sampling import RandomUnderSampler
import pickle # to save and load sklearn models


# Sklearn models
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier


import warnings
warnings.filterwarnings("ignore")

# Neural Network and Deep Learning
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# import torchvision
# from torchvision import transforms



In [4]:
# # Check if the GPU is available
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# print(f"Training device: {device}")

# Pre-processing

### Import the dataset

In [5]:
# Read the dataset CSV from the Drive folder and print its (rows, columns) shape
dataset_path = datapath + 'dataset/data-1618311055681.csv'
data = pd.read_csv(dataset_path)
print("Shape of the dataset: {}".format(data.shape))

Shape of the dataset: (26950, 23)


### Merge note with workdesc

In [6]:
# Append the note to the work description wherever a note is present.
# A single .loc assignment replaces the chained indexing
# (data.workdesc[mask] = ...), which pandas flags with SettingWithCopyWarning
# because the write may land on a temporary copy instead of `data`.
has_note = data['note'].notna()
data.loc[has_note, 'workdesc'] = data.loc[has_note, 'workdesc'] + ' ' + data.loc[has_note, 'note']

### Drop columns with almost all null values and useless ones

In [7]:
# Discard the columns listed in the section title (mostly null or not useful)
columns_to_discard = ['description', 'nr_fattura', 'note', 'umid', 'app_ins', 'jobregistered']
data = data.drop(columns=columns_to_discard)

In [8]:
# Report how many missing values each variable (i.e., column) contains
print(f'Total number of rows: {data.shape[0]}')
print()
print('MISSING VALUES PER VARIABLE:\n')
data.isnull().sum()  # column-wise count of null entries

Total number of rows: 26950

MISSING VALUES PER VARIABLE:



id                   0
resid                0
jobid                0
jobtaskdt            0
pay                  0
workdesc             0
qty                  0
jobtaskid            0
flg_trasferta        0
data_ins             0
flg_prepagato        0
custid               0
flg_straordinario    0
flg_status           0
sede                 8
tipo_update          0
changed_on           0
dtype: int64

## Filter by rows


### 1) Remove flg_status = A

In [9]:
# Index labels of the rows whose status flag is 'A'; those rows are removed
status_a = data[data.flg_status == 'A'].index.tolist()
data = data.drop(status_a)

### 2) group by id and keep only last approved (the higher)

In [10]:
# Within each id keep only the first row (in the current row order) whose
# tipo_update is 'approvato'; every further 'approvato' row of that id is dropped.
# Rows with another tipo_update are left untouched.
# NOTE(review): the section title says "last approved (the higher)" while the
# first matching row is the one kept — confirm the row order of the export.
approved = data[data.tipo_update == 'approvato']
da_rimuovere = approved[approved.duplicated(subset='id', keep='first')].index.tolist()
data = data.drop(da_rimuovere)

# flg_status is not used after the status filter, so the column is removed
data = data.drop(columns='flg_status')

### 3) group by id and keep only the first cambio_commessa (the lower)

In [11]:
# For each id keep only its last row (in the current row order) and drop all
# the others, whatever their tipo_update.
# NOTE(review): the section title says "keep only the first cambio_commessa
# (the lower)" while the last row of each id is the one kept — confirm.
da_rimuovere = data[data.duplicated(subset='id', keep='last')].index.tolist()

data = data.drop(da_rimuovere)

In [12]:
# Number of rows per value of the target column tipo_update
freqs = data.tipo_update.value_counts()
freqs

approvato          20604
cambio_commessa     2023
Name: tipo_update, dtype: int64

In [13]:
# Fraction of 'cambio_commessa' rows over the two classes
freqs['cambio_commessa']/ (freqs['cambio_commessa'] + freqs['approvato'])

0.08940646130728776

## Manual feature engineering

Columns to be removed: id, changed_on

In [14]:
# Keep only the listed columns, in this order; every other column
# (e.g. id and changed_on) is left out
data = data[['workdesc', 'jobid', 'jobtaskid', 'resid', 'custid', 'qty', 'sede', 'jobtaskdt', 'data_ins', 'flg_trasferta', 'pay', 'flg_prepagato', 'flg_straordinario', 'tipo_update']]

### Day

In [15]:
def date_to_days(d, origin='2020-07-01'):
    """Return the number of whole days elapsed from ``origin`` to ``d``.

    Parameters
    ----------
    d : str
        Date formatted as ``YYYY-MM-DD``.
    origin : str, optional
        Reference date formatted as ``YYYY-MM-DD``. Defaults to
        ``'2020-07-01'``, the value that used to be hard-coded, so existing
        calls keep returning the same numbers.

    Returns
    -------
    int
        Day difference ``d - origin``; negative when ``d`` precedes ``origin``.
    """
    date_format = '%Y-%m-%d'
    elapsed = pd.to_datetime(d, format=date_format) - pd.to_datetime(origin, format=date_format)
    return elapsed.days

In [16]:
# Day number of jobtaskdt, as computed by date_to_days
data['day'] = data['jobtaskdt'].apply(date_to_days)

### Delay

In [17]:
# day1 is the day number of data_ins; delay is how many days data_ins
# comes after jobtaskdt (day1 - day)
data['day1'] = data['data_ins'].apply(date_to_days)
data['delay'] = data['day1'] - data['day']

In [18]:
# Keep the derived `day` and `delay` columns; jobtaskdt, data_ins and the
# intermediate day1 are left out
data = data[['workdesc', 'jobid', 'jobtaskid', 'resid', 'custid', 'qty', 'sede', 'day', 'delay', 'flg_trasferta', 'pay', 'flg_prepagato', 'flg_straordinario', 'tipo_update']]

### Area

In [19]:
# Table with the columns 'ID' and 'Gruppi Utenti'; group names are stripped of
# surrounding whitespace before they are used as lookup keys
resid_df_update = pd.read_csv(datapath + 'dataset/id-area.csv')
group_names = resid_df_update['Gruppi Utenti']
resid_df_update['Gruppi Utenti'] = group_names.apply(lambda name: name.strip())

In [20]:
# Translation table from the user-group names found in id-area.csv
# ('Gruppi Utenti') to the area labels stored in the `area` feature.
# NOTE(review): keys are looked up verbatim, so their spelling (e.g.
# 'Business Inteligence') presumably mirrors the CSV — verify before editing.
update = {'Acquisti' : 'ACQ',
          'Amministrazione' : 'AMMINISTRAZIONE', 
          'Autentica' : 'AMMINISTRAZIONE',
          'Business Inteligence' : 'BI',
          'Commerciale' : 'COMMERCIALE MTK',
          'Consulenti Esterni' : 'OTHER',
          'Database Administrator' : 'DBA',
          'Developers' : 'SOA',
          'Legal' : 'AMMINISTRAZIONE',
          'Marketing' : 'MARKETING',
          'Presidio' : 'PRESIDIO',
          'Service Design' : 'DESIGN',
          'Sistemisti' : 'SISTEMISTI'}

In [21]:
# Series whose index is the 'ID' column and whose values are the area labels
# obtained through `update`; a group name missing from `update` raises KeyError
mapping = resid_df_update['Gruppi Utenti'].apply(lambda x: update[x])
mapping.index = resid_df_update['ID']

In [22]:
def resid_to_area(r):
    """Return the area label for resource id ``r``.

    The label is read from the module-level ``mapping`` Series; ids that are
    not in its index yield ``'OTHER'``.
    """
    return mapping[r] if r in mapping.index else 'OTHER'

In [23]:
# Area of each row, looked up from its resource id
data['area'] = data['resid'].apply(resid_to_area)

In [24]:
# Same column selection as before, with the new `area` column placed after `resid`
data = data[['workdesc', 'jobid', 'jobtaskid', 'resid', 'area', 'custid', 'qty', 'sede', 'day', 'delay', 'flg_trasferta', 'pay', 'flg_prepagato', 'flg_straordinario', 'tipo_update']]

## Remove duplicates

In [25]:
# Keep the first occurrence of every fully identical row
data = data.drop_duplicates()

## Text cleaning

In [26]:
# Renumber the rows 0..n-1 and discard the old index labels
data = data.reset_index(drop=True)

In [27]:
# Define the lists of stop words (both Italian and English); punctuation marks
# are added below
stop_words = get_stop_words('it')
stop_words_en = get_stop_words('en')

# Final list: Italian stop words + English stop words + every ASCII
# punctuation character + the typographic apostrophe
stop_words_punct_eng = stop_words + stop_words_en + list(punctuation) + ["’"] 

Compute the number of distinct words that remain after removing punctuation and both Italian and English stop words.

In [28]:
def clean_text_iteng_url_mail(text):
    """Normalise a description: drop URLs, e-mails, punctuation, stop words and numbers.

    Steps, in order:
      1. replace URLs of the form ``scheme://...`` with a space;
      2. replace every whitespace-delimited token containing ``@`` with a space;
      3. replace punctuation characters with a space;
      4. lowercase and split on whitespace;
      5. discard tokens listed in the module-level ``stop_words_punct_eng``
         and tokens made only of digits.

    Parameters
    ----------
    text : str
        Raw description.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    # The patterns are raw strings: the previous non-raw literals relied on
    # invalid escape sequences such as "\w" and "\S", which Python reports as
    # a DeprecationWarning/SyntaxWarning.
    text = re.sub(r"\w+://\S+", " ", text)
    # No trailing whitespace is required any more, so an address placed at the
    # very end of the text is removed too (it used to survive and was then
    # split into separate words by the punctuation step).
    text = re.sub(r"\S*@\S*", " ", text)
    # Same character set as before: ASCII punctuation except the backslash.
    # NOTE(review): the backslash was probably meant to be included — confirm.
    text = re.sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]""", " ", text)
    # Set membership instead of a linear scan of the list for every word
    discarded = set(stop_words_punct_eng)
    noise_free_words = [word for word in text.lower().split()
                        if word not in discarded and not word.isdigit()]
    return " ".join(noise_free_words)

In [29]:
# Cleaned (not stemmed) version of every description
words_v_iteng_url_mail = data["workdesc"].apply(clean_text_iteng_url_mail)

In [30]:
# Vocabulary: set of the distinct words found in the cleaned descriptions
words_iteng_url_mail = set()
for cleaned_desc in words_v_iteng_url_mail:
    words_iteng_url_mail.update(cleaned_desc.lower().split())

In [31]:
stemmer = SnowballStemmer("italian")

def clean_text(text):
    """Clean a description and stem the words that are kept.

    Steps, in order:
      1. replace URLs of the form ``scheme://...`` with a space;
      2. replace every whitespace-delimited token containing ``@`` with a space;
      3. replace punctuation characters with a space;
      4. lowercase and split on whitespace;
      5. discard tokens listed in the module-level ``stop_words_punct_eng``
         and tokens made only of digits;
      6. stem each remaining token with the module-level ``stemmer``.

    Parameters
    ----------
    text : str
        Raw description.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (``""`` if none survive).
    """
    # The patterns are raw strings: the previous non-raw literals relied on
    # invalid escape sequences such as "\w" and "\S", which Python reports as
    # a DeprecationWarning/SyntaxWarning.
    text = re.sub(r"\w+://\S+", " ", text)
    # No trailing whitespace is required any more, so an address placed at the
    # very end of the text is removed too (it used to survive and was then
    # split into separate words by the punctuation step).
    text = re.sub(r"\S*@\S*", " ", text)
    # Same character set as before: ASCII punctuation except the backslash.
    # NOTE(review): the backslash was probably meant to be included — confirm.
    text = re.sub(r"""[!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]""", " ", text)
    # Set membership instead of a linear scan of the list for every word
    discarded = set(stop_words_punct_eng)
    noise_free_words = [stemmer.stem(word) for word in text.lower().split()
                        if word not in discarded and not word.isdigit()]
    return " ".join(noise_free_words)

In [32]:
# Cleaned and stemmed description, used later as the text feature
data["cleaned_workdesc"] = data["workdesc"].apply(clean_text)

## Handle categorical variables

In [33]:
# Identifier-like columns become strings (so they count as non-numeric),
# flag columns become integers
id_columns = ['jobid', 'jobtaskid', 'resid', 'custid', 'sede']
flag_columns = ['flg_trasferta', 'pay', 'flg_prepagato', 'flg_straordinario']
data[id_columns] = data[id_columns].astype(str)
data[flag_columns] = data[flag_columns].astype(int)
# Only the cleaned description is kept from here on
data = data.drop(columns=['workdesc'])

In [34]:
# Non-numeric columns, leaving out the label and the cleaned text
not_encoded = ('tipo_update', 'cleaned_workdesc')
categorical_features = [col for col in data.columns
                        if col not in not_encoded and not is_numeric_dtype(data[col])]

In [35]:
# One-hot encode the categorical columns with pandas' get_dummies(); the label
# and the text columns are excluded from the encoding
not_encoded = ('tipo_update', 'workdesc', 'cleaned_workdesc')
categorical_features = [col for col in data.columns
                        if col not in not_encoded and not is_numeric_dtype(data[col])]
data = pd.get_dummies(data, columns=categorical_features)

## Encoding Label

In [36]:
# Class names of the target; 'approvato' is encoded as 0 and every other value as 1
y_labels = ['approvato', 'cambio_commessa']

In [37]:
# Binary target: 0 for 'approvato', 1 for anything else.
# Not idempotent: running this cell a second time sets every label to 1.
data['tipo_update'] = (data['tipo_update'] != 'approvato').astype('int64')

In [38]:
# header.csv provides the reference list of column names (only its column
# names are used in the next cell)
header = pd.read_csv(datapath + 'dataset/header.csv')

In [39]:
# Align the columns to the reference header: columns missing from `data` are
# added and filled with 0, columns not listed in the header are dropped, and
# the column order follows the header
data = data.reindex(columns = header.columns, fill_value=0)

In [40]:
data

Unnamed: 0,qty,day,delay,flg_trasferta,pay,flg_prepagato,flg_straordinario,jobid_100,jobid_107,jobid_110,jobid_118,jobid_119,jobid_123,jobid_125,jobid_126,jobid_135,jobid_157,jobid_158,jobid_173,jobid_174,jobid_175,jobid_177,jobid_178,jobid_179,jobid_187,jobid_203,jobid_209,jobid_210,jobid_227,jobid_231,jobid_2457,jobid_2488,jobid_2509,jobid_2516,jobid_2528,jobid_2534,jobid_2538,jobid_2579,jobid_2580,jobid_2598,...,custid_4340,custid_4343,custid_4361,custid_4370,custid_4375,custid_4379,custid_4391,custid_44,custid_4404,custid_4410,custid_4414,custid_4415,custid_4429,custid_4430,custid_4435,custid_4439,custid_4447,custid_4449,custid_4453,custid_4461,custid_4466,custid_4469,custid_4475,custid_4480,custid_4484,custid_48,custid_59,custid_62,custid_9,sede_1.0,sede_2.0,sede_3.0,sede_4.0,sede_5.0,sede_6.0,sede_7.0,sede_8.0,sede_9.0,cleaned_workdesc,tipo_update
0,5.5,284,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,sophos aulss7,1
1,6.0,283,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,sophos aulss7,1
2,1.5,282,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,test attiv journeys ato uk ato us analis metri...,0
3,0.5,281,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,call team client fix problemat tracker domain,0
4,0.5,279,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,analis problemat configur pardot tracker domai...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22583,1.5,77,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,document fluss kettl,1
22584,4.0,72,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,document fluss kettl,1
22585,8.0,69,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,document fluss kettl,1
22586,8.0,71,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,upgrad server qlikview install component python,1


In [41]:
# Remove the rows whose description became empty after cleaning.
# The mask is no longer bound to a variable called `filter`, which shadowed the
# built-in filter() for the rest of the session.
data = data[data.cleaned_workdesc != '']

In [42]:
# data.to_csv(datapath + 'dataset/data9_test.csv', index=False)

# Dataset

In [43]:
# Keep only the rows that have a (non-missing) cleaned description
data = data[data.cleaned_workdesc.notna()]
data

Unnamed: 0,qty,day,delay,flg_trasferta,pay,flg_prepagato,flg_straordinario,jobid_100,jobid_107,jobid_110,jobid_118,jobid_119,jobid_123,jobid_125,jobid_126,jobid_135,jobid_157,jobid_158,jobid_173,jobid_174,jobid_175,jobid_177,jobid_178,jobid_179,jobid_187,jobid_203,jobid_209,jobid_210,jobid_227,jobid_231,jobid_2457,jobid_2488,jobid_2509,jobid_2516,jobid_2528,jobid_2534,jobid_2538,jobid_2579,jobid_2580,jobid_2598,...,custid_4340,custid_4343,custid_4361,custid_4370,custid_4375,custid_4379,custid_4391,custid_44,custid_4404,custid_4410,custid_4414,custid_4415,custid_4429,custid_4430,custid_4435,custid_4439,custid_4447,custid_4449,custid_4453,custid_4461,custid_4466,custid_4469,custid_4475,custid_4480,custid_4484,custid_48,custid_59,custid_62,custid_9,sede_1.0,sede_2.0,sede_3.0,sede_4.0,sede_5.0,sede_6.0,sede_7.0,sede_8.0,sede_9.0,cleaned_workdesc,tipo_update
0,5.5,284,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,sophos aulss7,1
1,6.0,283,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,sophos aulss7,1
2,1.5,282,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,test attiv journeys ato uk ato us analis metri...,0
3,0.5,281,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,call team client fix problemat tracker domain,0
4,0.5,279,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,analis problemat configur pardot tracker domai...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22583,1.5,77,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,document fluss kettl,1
22584,4.0,72,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,document fluss kettl,1
22585,8.0,69,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,document fluss kettl,1
22586,8.0,71,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,upgrad server qlikview install component python,1


## Split

In [44]:
def split_data(X, y):
    """Split features and labels into stratified training, validation and test sets.

    20% of the samples are held out as the test set; 25% of the remaining
    samples form the validation set. Both splits are stratified on the labels
    and use ``random_state=22``. The shapes of the feature sets are printed.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, X_test, y_val, y_test, X_train_val, y_train_val)``
        -- mind the order of the elements.
    """
    seed = 22

    # First cut: hold out the test set
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y)

    # Second cut: carve the validation set out of what is left
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=0.25, random_state=seed, stratify=y_train_val)

    print("Training Set shape: {}".format(X_train.shape))
    print("Validation Set shape: {}".format(X_val.shape))
    print("Training + Validation Set shape: {}".format(X_train_val.shape))
    print("Test Set shape: {}".format(X_test.shape))

    return X_train, y_train, X_val, X_test, y_val, y_test, X_train_val, y_train_val

## BOW

In [45]:
def vectorizer(X_train, X_val, X_test, max_features=1000, n_min=1, n_max=1, vect_type='cv'):
    """Replace the ``cleaned_workdesc`` column with bag-of-words features.

    The vectorizer is fitted on the training descriptions only and then
    applied to the validation and test descriptions.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Feature frames containing a ``cleaned_workdesc`` text column.
    max_features : int
        Maximum vocabulary size passed to the vectorizer.
    n_min, n_max : int
        Bounds of the n-gram range: (1, 1) means only unigrams, (1, 2) means
        unigrams and bigrams, (2, 2) only bigrams.
    vect_type : {'cv', 'tfidf'}
        ``'cv'`` uses ``CountVectorizer``, ``'tfidf'`` uses ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(X_train_v, X_val_v, X_test_v, vect)``: the three frames with a
        fresh 0..n-1 index, ``cleaned_workdesc`` removed and one column per
        vocabulary term appended, plus the fitted vectorizer.

    Raises
    ------
    ValueError
        If ``vect_type`` is neither ``'cv'`` nor ``'tfidf'`` (this used to
        surface later as an UnboundLocalError).
    """
    if vect_type not in ('cv', 'tfidf'):
        raise ValueError(f"vect_type must be 'cv' or 'tfidf', got {vect_type!r}")
    vect_class = CountVectorizer if vect_type == 'cv' else TfidfVectorizer
    vect = vect_class(max_features=max_features, ngram_range=(n_min, n_max))

    def add_text_features(frame, fit=False):
        # Turn the text column of `frame` into term columns appended to the other features
        texts = frame['cleaned_workdesc']
        term_matrix = vect.fit_transform(texts) if fit else vect.transform(texts)
        # get_feature_names() was removed in scikit-learn 1.2; fall back to it
        # only on releases that do not provide get_feature_names_out()
        if hasattr(vect, 'get_feature_names_out'):
            terms = vect.get_feature_names_out()
        else:
            terms = vect.get_feature_names()
        # The sparse matrix is materialised as a dense frame, as before
        term_frame = pd.DataFrame(term_matrix.toarray(), columns=terms)
        # Renumber the rows so that they line up with term_frame in the concat
        other_features = frame.reset_index(drop=True).drop(columns=['cleaned_workdesc'])
        return pd.concat([other_features, term_frame], axis=1)

    X_train_v = add_text_features(X_train, fit=True)
    X_val_v = add_text_features(X_val)
    X_test_v = add_text_features(X_test)

    return X_train_v, X_val_v, X_test_v, vect

## Under/Oversampling

In [46]:
def resampling(X_train_v, y_train, approach='none'):
    """Rebalance the training set with random over- and/or under-sampling.

    Parameters
    ----------
    X_train_v : array-like
        Training features.
    y_train : array-like
        Training labels.
    approach : {'none', 'over', 'under', 'mid'}
        ``'none'``  -- return the inputs unchanged;
        ``'over'``  -- ``RandomOverSampler(sampling_strategy='minority')``;
        ``'under'`` -- ``RandomUnderSampler(sampling_strategy='majority')``;
        ``'mid'``   -- ``RandomOverSampler(sampling_strategy=0.4212)`` followed
        by ``RandomUnderSampler(sampling_strategy='majority')``.
        Every sampler uses ``random_state=1``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)``.

    Raises
    ------
    ValueError
        If ``approach`` is not one of the supported values (this used to
        surface as an UnboundLocalError).
    """
    if approach == 'none':
        return X_train_v, y_train

    if approach == 'over':
        samplers = [RandomOverSampler(sampling_strategy='minority', random_state=1)]
    elif approach == 'under':
        samplers = [RandomUnderSampler(sampling_strategy='majority', random_state=1)]
    elif approach == 'mid':
        # NOTE(review): the origin of 0.4212 is not documented in the notebook —
        # confirm how the value was chosen.
        samplers = [RandomOverSampler(sampling_strategy=0.4212, random_state=1),
                    RandomUnderSampler(sampling_strategy='majority', random_state=1)]
    else:
        raise ValueError(
            f"approach must be one of 'none', 'over', 'under', 'mid', got {approach!r}")

    # Apply the samplers in sequence, each one on the output of the previous one
    X_train_v_r, y_train_r = X_train_v, y_train
    for sampler in samplers:
        X_train_v_r, y_train_r = sampler.fit_resample(X_train_v_r, y_train_r)

    return X_train_v_r, y_train_r

# Model tuning 

The procedure is the same for each model: train the model with several hyper-parameter combinations and evaluate each of them on the validation set. Select the configuration leading to the highest validation accuracy, and re-train the model with this combination on the merged training-validation set. Finally, evaluate it just once on the test set.

### Auxiliary functions

Let's write some auxiliary functions

In [47]:
def metrics(true_positive, true_negative, false_positive, false_negative):
    """Compute binary-classification metrics from confusion-matrix counts.

    Parameters
    ----------
    true_positive, true_negative, false_positive, false_negative : int
        Counts of the four confusion-matrix cells (Python or NumPy numbers).

    Returns
    -------
    dict
        Keys ``"Accuracy"``, ``"Weighted accuracy"`` (mean of sensitivity and
        specificity), ``"Precision"``, ``"Sensitivity"``, ``"Specificity"``,
        ``"MCC"`` and ``"F-score"``, each mapped to a Python float.

    Notes
    -----
    A ratio whose denominator is zero (e.g. precision when nothing is
    predicted positive) is reported as 0.0. Previously this raised
    ZeroDivisionError for Python ints and produced NaN/inf with a warning for
    NumPy integers.
    """
    # Plain floats: the four-way product in the MCC denominator cannot
    # overflow as it could with NumPy int64 counts
    tp = float(true_positive)
    tn = float(true_negative)
    fp = float(false_positive)
    fn = float(false_negative)

    def ratio(numerator, denominator):
        # Division that yields 0.0 when the denominator is zero
        return numerator / denominator if denominator else 0.0

    overall_accuracy = ratio(tp + tn, tp + tn + fn + fp)
    precision = ratio(tp, tp + fp)
    sensitivity = ratio(tp, tp + fn)
    specificity = ratio(tn, tn + fp)
    weighted_accuracy = (sensitivity + specificity) / 2
    mcc_numerator = (tp * tn) - (fp * fn)
    mcc_denominator_squared = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    mcc = ratio(mcc_numerator, math.sqrt(mcc_denominator_squared))
    f1_score = ratio(2 * (precision * sensitivity), precision + sensitivity)

    return {
            "Accuracy": overall_accuracy,
            "Weighted accuracy": weighted_accuracy,
            "Precision": precision,
            "Sensitivity": sensitivity,
            "Specificity": specificity,
            "MCC": mcc,
            "F-score": f1_score
            }


In [48]:
def metrics_results(true, pred):
    """Build the confusion matrix of binary predictions and compute their metrics.

    Parameters
    ----------
    true : array-like of shape (N,)
        Actual labels, 1 for the positive class and 0 for the negative class.
    pred : array-like of shape (N,)
        Predicted labels, encoded like ``true``.

    Returns
    -------
    tuple
        ``(conf_matrix, metrics_dict)``: a 2x2 DataFrame whose rows are the
        predicted classes and whose columns are the actual classes, and the
        value returned by ``metrics`` for the four counts.

    Raises
    ------
    ValueError
        If ``true`` and ``pred`` do not have the same shape.
    """
    # Compare position by position. Without the conversion, two pandas objects
    # with different indexes are aligned on their labels, and plain lists
    # compare as a whole (``[...] == 1`` is a single False).
    true = np.asarray(true)
    pred = np.asarray(pred)
    if true.shape != pred.shape:
        raise ValueError(
            f"true and pred must have the same shape, got {true.shape} and {pred.shape}")

    true_positive = np.sum(np.logical_and(true == 1, pred == 1))

    # remove from actual positive the TP to get FN
    false_negative = np.sum(true) - true_positive

    # remove from predicted positive the TP to get FP
    false_positive = np.sum(pred) - true_positive

    true_negative = np.sum(np.logical_and(true == 0, pred == 0))

    conf_matrix = pd.DataFrame(np.array([[true_positive, false_positive], [false_negative, true_negative]]),
                               index=['Positive pred', 'Negative pred'],
                               columns=['Actual positive', 'Actual negative'])

    return conf_matrix, metrics(true_positive, true_negative, false_positive, false_negative)

In [49]:
def accuracy(y_pred, y_true):
    '''
    Fraction of positions at which prediction and ground truth coincide.

    input y_pred: ndarray of shape (N,)
    input y_true: ndarray of shape (N,)
    '''
    agreement = (y_pred == y_true)
    # Booleans become 1.0/0.0, so their mean is the share of correct predictions
    return agreement.astype(float).mean()

In [50]:
# Features are all the columns except the last one; the label is tipo_update.
# NOTE(review): this assumes tipo_update is the last column, which depends on
# the column order taken from header.csv — confirm.
X_train, y_train, X_val, X_test, y_val, y_test, X_train_val, y_train_val = split_data(data.iloc[:,:-1], data.tipo_update)

Training Set shape: (13083, 1213)
Validation Set shape: (4361, 1213)
Training + Validation Set shape: (17444, 1213)
Test Set shape: (4361, 1213)


## Gradient Boosting

In [51]:
# Hyper-parameter grid explored by the random search
max_features_v = [250, 500, 1000, 1900, 2703, 3725, 4935]
n_min = 1 # , 2, 3]
n_max_v = [1, 2, 3]
vect_type_v = ['cv', 'tfidf']
approach_v = ['none', 'over', 'under', 'mid']
n_estimator_v = [1, 10, 50, 100]
max_depth_v = [10, 50, 100]

# Number of configurations drawn from the grid; one constant drives both the
# sampling and the progress bar (the literal 20 used to appear twice)
N_TRIALS = 20

super_acc = 0
super_bacc = 0
# Defined up front so that the names exist even if no trial improves on 0
optimal_hyp_acc = None
optimal_hyp_bacc = None

results = []

# RANDOM SEARCH
random.seed(22)
random_coarse_grid = list(itertools.product(max_features_v, n_max_v, vect_type_v, approach_v, n_estimator_v, max_depth_v))
# sampling combinations from itertools.product, we are sure no combination is repeated
sampled_configs = random.sample(random_coarse_grid, N_TRIALS)

# The context manager closes the progress bar even if a trial raises
with tq.tqdm(total = N_TRIALS, desc = 'Random search progress') as pbar:
    for (max_features, n_max, vect_type, approach, n_estimator, max_depth) in sampled_configs:
        # One description per trial, used both for the log line and for `results`
        config_desc = (f'max_features = {max_features}, n_max = {n_max}, vect_type = {vect_type},'
                       f' approach = {approach}, n_estimator = {n_estimator}, max_depth = {max_depth}')
        print(config_desc)

        X_train_v, X_val_v, X_test_v, vect = vectorizer(X_train, X_val, X_test, max_features, n_min, n_max, vect_type)
        X_train_v_r, y_train_r = resampling(X_train_v, y_train, approach)

        model_gb = GradientBoostingClassifier(n_estimators = n_estimator, max_depth = max_depth, random_state = 22)
        model_gb.fit(X_train_v_r, y_train_r)
        y_train_pred = model_gb.predict(X_train_v_r)
        y_valid_pred = model_gb.predict(X_val_v)

        # Arguments follow the accuracy(y_pred, y_true) signature (the result
        # is the same either way, the comparison being symmetric)
        acc = accuracy(y_train_pred, y_train_r)
        bacc = balanced_accuracy_score(y_train_r, y_train_pred)
        acc_v = accuracy(y_valid_pred, y_val)
        bacc_v = balanced_accuracy_score(y_val, y_valid_pred)

        # Track the best configuration for each of the two validation criteria
        if super_acc < acc_v:
            optimal_hyp_acc = (max_features, n_max, vect_type, approach, n_estimator, max_depth)
            super_acc = acc_v

        if super_bacc < bacc_v:
            optimal_hyp_bacc = (max_features, n_max, vect_type, approach, n_estimator, max_depth)
            super_bacc = bacc_v

        print(f'Training acc = {acc:.3f}, Training b_acc = {bacc:.3f}, Validation acc = {acc_v:.3f}, Validation b_acc = {bacc_v:.3f}')
        print()

        results.append((np.round(acc_v, 3), np.round(bacc_v, 3), config_desc))

        pbar.update(1)

print('RESULTS SORTED BY ACCURACY')
for r in sorted(results, key=lambda tup: tup[0], reverse = True):
    print(f'Val_acc = {r[0]:.3f}, Val_bal_acc = {r[1]:.3f} -> {r[2]}')
print()

print('RESULTS SORTED BY BALANCED ACCURACY')
for r in sorted(results, key=lambda tup: tup[1], reverse = True):
    print(f'Val_acc = {r[0]:.3f}, Val_bal_acc = {r[1]:.3f} -> {r[2]}')

Random search progress:   0%|          | 0/20 [00:00<?, ?it/s]

max_features = 4935, n_max = 3, vect_type = cv, approach = mid, n_estimator = 50, max_depth = 10
Training acc = 0.946, Training b_acc = 0.946, Validation acc = 0.899, Validation b_acc = 0.831

max_features = 4935, n_max = 2, vect_type = cv, approach = under, n_estimator = 100, max_depth = 50
Training acc = 1.000, Training b_acc = 1.000, Validation acc = 0.814, Validation b_acc = 0.822

max_features = 250, n_max = 3, vect_type = tfidf, approach = mid, n_estimator = 100, max_depth = 100
Training acc = 1.000, Training b_acc = 1.000, Validation acc = 0.895, Validation b_acc = 0.780

max_features = 500, n_max = 3, vect_type = cv, approach = over, n_estimator = 10, max_depth = 50
Training acc = 0.997, Training b_acc = 0.997, Validation acc = 0.924, Validation b_acc = 0.796

max_features = 250, n_max = 1, vect_type = tfidf, approach = none, n_estimator = 1, max_depth = 10
Training acc = 0.908, Training b_acc = 0.500, Validation acc = 0.908, Validation b_acc = 0.500

max_features = 2703, n_max

In [52]:
optimal_hyp_acc

(2703, 2, 'cv', 'none', 50, 50)

In [53]:
optimal_hyp_bacc

(500, 2, 'tfidf', 'mid', 50, 10)

# Final models

In [54]:
# The final models are fitted on every row of `data` (this overwrites the
# X_train / y_train produced by split_data).
# NOTE(review): the "Model tuning" text describes re-training on the merged
# training-validation set and evaluating once on the test set; with all rows
# used here, the metrics printed below are computed on data seen during
# training — confirm this is intended.
X_train, y_train = data.iloc[:,:-1], data.tipo_update

In [55]:
def vectorizer_train(X_train, max_features=1000, n_min=1, n_max=1, vect_type='cv'):
    """Fit a text vectorizer on ``X_train`` and append its features to the frame.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature frame containing a ``cleaned_workdesc`` text column.
    max_features : int
        Maximum vocabulary size passed to the vectorizer.
    n_min, n_max : int
        Bounds of the n-gram range: (1, 1) means only unigrams, (1, 2) means
        unigrams and bigrams, (2, 2) only bigrams.
    vect_type : {'cv', 'tfidf'}
        ``'cv'`` uses ``CountVectorizer``, ``'tfidf'`` uses ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(X_train_v, vect)``: the frame with a fresh 0..n-1 index,
        ``cleaned_workdesc`` removed and one column per vocabulary term
        appended, plus the fitted vectorizer.

    Raises
    ------
    ValueError
        If ``vect_type`` is neither ``'cv'`` nor ``'tfidf'`` (this used to
        surface later as an UnboundLocalError).
    """
    if vect_type not in ('cv', 'tfidf'):
        raise ValueError(f"vect_type must be 'cv' or 'tfidf', got {vect_type!r}")
    vect_class = CountVectorizer if vect_type == 'cv' else TfidfVectorizer
    vect = vect_class(max_features=max_features, ngram_range=(n_min, n_max))

    term_matrix = vect.fit_transform(X_train['cleaned_workdesc'])
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on releases that do not provide get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out'):
        terms = vect.get_feature_names_out()
    else:
        terms = vect.get_feature_names()
    # The sparse matrix is materialised as a dense frame, as before
    term_frame = pd.DataFrame(term_matrix.toarray(), columns=terms)

    # Renumber the rows so that they line up with term_frame in the concat
    other_features = X_train.reset_index(drop=True).drop(columns=['cleaned_workdesc'])
    X_train_v = pd.concat([other_features, term_frame], axis=1)

    return X_train_v, vect

In [56]:
def resampling_train(X_train_v, y_train, approach='none'):
    """Rebalance the training set; thin wrapper around ``resampling``.

    The body used to be a verbatim copy of ``resampling``. It now delegates to
    that function so that the two cannot drift apart.

    Parameters
    ----------
    X_train_v : array-like
        Training features.
    y_train : array-like
        Training labels.
    approach : {'none', 'over', 'under', 'mid'}
        Resampling strategy, interpreted by ``resampling``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``resampling``.
    """
    return resampling(X_train_v, y_train, approach)

#### Final model - best accuracy

In [57]:
max_features, n_max, vect_type, approach, n_estimator, max_depth = optimal_hyp_acc

In [58]:
X_train_v, vect = vectorizer_train(X_train, max_features, 1, n_max, vect_type)

In [59]:
# Save vectorizer; the `with` block closes the file handle once the dump is
# done (the handle returned by a bare open() was never closed)
with open(datapath + 'models/vect_best_acc.sav', 'wb') as vect_file:
    pickle.dump(vect, vect_file)

In [60]:
# Resample the training data with the selected approach, then fit the
# gradient-boosting model with the selected hyper-parameters.
# run_time_fit is the wall-clock duration of fit() in seconds.
X_train_v_r, y_train_r = resampling(X_train_v, y_train, approach)
model = GradientBoostingClassifier(n_estimators = n_estimator, max_depth = max_depth, random_state = 22)
start_time = time.time()
model.fit(X_train_v_r, y_train_r)
run_time_fit = (time.time() - start_time)

In [61]:
# Save the model to disk; the `with` block closes the file handle once the
# dump is done (the handle returned by a bare open() was never closed)
with open(datapath + 'models/model_best_acc.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

In [62]:
# Predict on the vectorized, non-resampled training features;
# run_time_predict is the wall-clock duration of predict() in seconds
start_time = time.time()
y_train_pred = model.predict(X_train_v)
run_time_predict = (time.time() - start_time)

In [63]:
print(f'Fitting time: {run_time_fit}')
print(f'Predicting time: {run_time_predict}')

Fitting time: 636.482958316803
Predicting time: 1.1090114116668701


In [64]:
# Confusion matrix and metrics of the model on the data it was fitted on
conf_matrix_rf, metric_results_rf = metrics_results(y_train, y_train_pred)
print('CLASSIFICATION METRICS')
for metric_name, metric_value in metric_results_rf.items():
    print(f'{metric_name}: {metric_value:.3}')

CLASSIFICATION METRICS
Accuracy: 1.0
Weighted accuracy: 0.999
Precision: 1.0
Sensitivity: 0.998
Specificity: 1.0
MCC: 0.999
F-score: 0.999


#### Final model - best balanced accuracy

In [65]:
max_features, n_max, vect_type, approach, n_estimator, max_depth = optimal_hyp_bacc

In [66]:
X_train_v, vect = vectorizer_train(X_train, max_features, 1, n_max, vect_type)

In [67]:
# Save vectorizer; the `with` block closes the file handle once the dump is
# done (the handle returned by a bare open() was never closed)
with open(datapath + 'models/vect_best_bacc.sav', 'wb') as vect_file:
    pickle.dump(vect, vect_file)

In [68]:
# Resample the training data with the selected approach, then fit the
# gradient-boosting model with the selected hyper-parameters.
# run_time_fit is the wall-clock duration of fit() in seconds.
X_train_v_r, y_train_r = resampling(X_train_v, y_train, approach)
model = GradientBoostingClassifier(n_estimators = n_estimator, max_depth = max_depth, random_state = 22)
start_time = time.time()
model.fit(X_train_v_r, y_train_r)
run_time_fit = (time.time() - start_time)

In [69]:
# Save the model to disk; the `with` block closes the file handle once the
# dump is done (the handle returned by a bare open() was never closed)
with open(datapath + 'models/model_best_bacc.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

In [70]:
# Predict on the vectorized, non-resampled training features;
# run_time_predict is the wall-clock duration of predict() in seconds
start_time = time.time()
y_train_pred = model.predict(X_train_v)
run_time_predict = (time.time() - start_time)

In [71]:
print(f'Fitting time: {run_time_fit}')
print(f'Predicting time: {run_time_predict}')

Fitting time: 148.850919008255
Predicting time: 0.3837394714355469


In [72]:
# Confusion matrix and metrics of the model on the data it was fitted on
conf_matrix_rf, metric_results_rf = metrics_results(y_train, y_train_pred)
print('CLASSIFICATION METRICS')
for metric_name, metric_value in metric_results_rf.items():
    print(f'{metric_name}: {metric_value:.3}')

CLASSIFICATION METRICS
Accuracy: 0.917
Weighted accuracy: 0.927
Precision: 0.528
Sensitivity: 0.939
Specificity: 0.914
MCC: 0.667
F-score: 0.676
