In [157]:
import numpy as np
import yaml
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [158]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [159]:
df = pd.read_csv("s3://datateam-ml/CVR_FSI/conversions.csv")
df2 = pd.read_csv("s3://datateam-ml/CVR_FSI/sms.csv")
df2 = df2[df2['event_type']=='sms']
data = pd.concat([df, df2])
features = yaml.safe_load(open("Attributes_yaml/features.yml"))
input_col = features['input_col']+features['target']
data = data[input_col]
data = data.reindex(columns = input_col)

## PIPELINE BUILDING

In [160]:
#important libraries
import scipy as sci
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split

In [161]:
class processing():
    
    features = yaml.safe_load(open("Attributes_yaml/features.yml"))
    input_col = features['input_col']
    num = features['num_features']
    cat = features['cat_features']
    target = features['target']
    low_cat = features['low_cat']
    
    def __init__(self, data):
        self.data = data
    def map_values(self,column = 'event_type'):
        self.data['event_type'] = self.data['event_type'].apply(lambda x: 1 if x == 'install' else 0)
        
    def fill_na(self):
        for item in self.data[processing.num]:
            self.data[item] = self.data[item].fillna(self.data[item].mean())
        for item in self.data[processing.cat]:
            self.data[item] = self.data[item].fillna(self.data[item].value_counts().index[0])
    def hash_list(self):
        self.hash_features = []
        for item in processing.cat:
            if item not in processing.low_cat:
                self.hash_features.append(item)
    def pipeline(self, hash_size):
        self.num_pipeline = Pipeline(steps= [('imputer', SimpleImputer(strategy='mean')), ('std_scaler', MinMaxScaler())])
        self.cat_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
                                       ('one_hot_encoding', OneHotEncoder(handle_unknown = "ignore", sparse = False))])
        self.hash_pipeline = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
                                  ('hasher', FeatureHasher(n_features=hash_size, input_type='string'))])
        
    
    def build_pipe(self, hash_size = 0, test_size = 0): 
        self.fill_na()
        self.map_values()
        self.data.drop(['msisdn'],axis=1,inplace=True)
        self.hash_list()
        self.pipeline(hash_size)
        
        self.full_pipeline = ColumnTransformer(
        transformers=[
            ('num', self.num_pipeline, processing.num),
            ('cat', self.cat_pipeline, processing.low_cat),
            ('hash', self.hash_pipeline, self.hash_features)
        ])
        
        self.X = self.data.drop(processing.target, axis=1)
        self.y = self.data[processing.target].copy()
        
        self.full_pipeline.fit(self.X)
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=test_size, stratify = self.y)
        
        self.X_train = self.full_pipeline.transform(self.X_train)
        self.X_test = self.full_pipeline.transform(self.X_test)
        
        print(self.X_train.shape)
        return self.X, self.y, self.X_train, self.X_test, self.y_train, self.y_test, self.full_pipeline

In [162]:
processed = processing(data)

In [163]:
X, y, X_train, X_test, y_train, y_test, full_pipeline = processed.build_pipe(hash_size = 48, test_size = 0.2)

(212245, 65)


## BUILDING THE MODEL USING PYTORCH

In [164]:
#CONVERT TRAIN AND TEST DATA TO TENSORS

EPOCHS = 35
BATCH_SIZE = 64
LEARNING_RATE = 0.001

class trainData(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)


train_data = trainData(torch.FloatTensor(X_train), 
                       torch.FloatTensor(y_train.values))
## test data    
class testData(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = testData(torch.FloatTensor(X_test))

In [165]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [166]:
class binaryClassification(nn.Module):
    def __init__(self):
        super(binaryClassification, self).__init__()
        # Number of input features is 62.
        self.layer_1 = nn.Linear(65, 128) 
        self.layer_2 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(128)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [115]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [167]:
model = binaryClassification()
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

binaryClassification(
  (layer_1): Linear(in_features=65, out_features=128, bias=True)
  (layer_2): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (batchnorm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [168]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [169]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        
        y_pred = model(X_batch)
        
        loss = criterion(y_pred, y_batch)#.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch)#.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.20715 | Acc: 93.393
Epoch 002: | Loss: 0.18059 | Acc: 94.249
Epoch 003: | Loss: 0.17802 | Acc: 94.246
Epoch 004: | Loss: 0.17686 | Acc: 94.256
Epoch 005: | Loss: 0.17587 | Acc: 94.259
Epoch 006: | Loss: 0.17506 | Acc: 94.272
Epoch 007: | Loss: 0.17456 | Acc: 94.257
Epoch 008: | Loss: 0.17392 | Acc: 94.260
Epoch 009: | Loss: 0.17305 | Acc: 94.276
Epoch 010: | Loss: 0.17242 | Acc: 94.287
Epoch 011: | Loss: 0.17174 | Acc: 94.277
Epoch 012: | Loss: 0.17134 | Acc: 94.287
Epoch 013: | Loss: 0.17087 | Acc: 94.292
Epoch 014: | Loss: 0.17030 | Acc: 94.312
Epoch 015: | Loss: 0.16983 | Acc: 94.322
Epoch 016: | Loss: 0.16961 | Acc: 94.316
Epoch 017: | Loss: 0.16947 | Acc: 94.323
Epoch 018: | Loss: 0.16869 | Acc: 94.341
Epoch 019: | Loss: 0.16867 | Acc: 94.323
Epoch 020: | Loss: 0.16820 | Acc: 94.340
Epoch 021: | Loss: 0.16778 | Acc: 94.352
Epoch 022: | Loss: 0.16702 | Acc: 94.356
Epoch 023: | Loss: 0.16707 | Acc: 94.361
Epoch 024: | Loss: 0.16685 | Acc: 94.341
Epoch 025: | Los

In [170]:
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [171]:
confusion_matrix(y_test, y_pred_list)
print(classification_report(y_test, y_pred_list))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     50031
           1       0.45      0.03      0.05      3031

   micro avg       0.94      0.94      0.94     53062
   macro avg       0.70      0.51      0.51     53062
weighted avg       0.92      0.94      0.92     53062



## SAVING AND LOADING MODEL

In [172]:
print("Our model: \n\n", model, '\n')
print("The state dict keys: \n\n", model.state_dict().keys())

Our model: 

 binaryClassification(
  (layer_1): Linear(in_features=65, out_features=128, bias=True)
  (layer_2): Linear(in_features=128, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (batchnorm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
) 

The state dict keys: 

 odict_keys(['layer_1.weight', 'layer_1.bias', 'layer_2.weight', 'layer_2.bias', 'layer_out.weight', 'layer_out.bias', 'batchnorm1.weight', 'batchnorm1.bias', 'batchnorm1.running_mean', 'batchnorm1.running_var', 'batchnorm1.num_batches_tracked', 'batchnorm2.weight', 'batchnorm2.bias', 'batchnorm2.running_mean', 'batchnorm2.running_var', 'batchnorm2.num_batches_tracked'])


In [173]:
torch.save(model.state_dict(), '/home/ec2-user/SageMaker/FSI_SMS_TO_CONVERSION/model_and_pipeline/checkpoint.pth')

In [174]:
state_dict = torch.load('model_and_pipeline/checkpoint.pth')
print(state_dict.keys())

odict_keys(['layer_1.weight', 'layer_1.bias', 'layer_2.weight', 'layer_2.bias', 'layer_out.weight', 'layer_out.bias', 'batchnorm1.weight', 'batchnorm1.bias', 'batchnorm1.running_mean', 'batchnorm1.running_var', 'batchnorm1.num_batches_tracked', 'batchnorm2.weight', 'batchnorm2.bias', 'batchnorm2.running_mean', 'batchnorm2.running_var', 'batchnorm2.num_batches_tracked'])


In [None]:
model.load_state_dict(state_dict)

## TO AVOID INPUT AND HIDDEN LAYER ISSUES

In [175]:
checkpoint = {'input_size': 65,
              'output_size': 1,
              'hidden_layers': [each.out_features for each in model.hidden_layers],
              'state_dict': model.state_dict()}

torch.save(checkpoint, 'checkpoint.pth')

AttributeError: 'binaryClassification' object has no attribute 'hidden_layers'

In [None]:
def load_checkpoint(filepath):
    checkpoint = torch.load(filepath)
    model = fc_model.Network(checkpoint['input_size'],
                             checkpoint['output_size'],
                             checkpoint['hidden_layers'])
    model.load_state_dict(checkpoint['state_dict'])
    
    return model

In [None]:
model = load_checkpoint('checkpoint.pth')
print(model)