In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from matplotlib import style
import matplotlib.pyplot as plt
import random
import torch
from torch.autograd import Variable
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from sklearn.model_selection import GridSearchCV
style.use("ggplot")

def plot(data,title):
    plt.plot(data)
    plt.title(title,loc='center')
    
def plot_train_dev(train,dev,title):
    plt.figure(figsize=(15,10))
    plt.subplot(221)
    plot(train,"Training "+title)
    plt.subplot(222)
    plot(dev,"Dev "+title)
    plt.show()

## 1. Process Data

In [3]:
## reading clean data
data = pd.read_csv('cleaned_4.csv')
data.head()
data.describe()
data.columns

Index(['Id', 'StageName', 'Status_Reason__c', 'RecordType.Name',
       'RICE_Supported__c', 'CreatedDate', 'AccountId', 'Lead_Faculty__c',
       'Parent_Opportunity__c', 'RecordType.Name.1', 'Industry',
       'Business_Type__c', 'ParentId', 'RecordType', 'CreatedDate_month',
       'Is_External__c'],
      dtype='object')

In [4]:
## one-hot-encoding

a = pd.get_dummies(data["Status_Reason__c"], prefix='Status_Reason')
b = pd.get_dummies(data["RecordType.Name"], prefix='RecordType')
c = pd.get_dummies(data["RICE_Supported__c"], prefix='RICE_Supported')
d = pd.get_dummies(data["AccountId"], prefix='AccountId')
e = pd.get_dummies(data["Lead_Faculty__c"], prefix='Lead_Faculty')
f = pd.get_dummies(data["RecordType.Name.1"], prefix='RecordType_ind')
g = pd.get_dummies(data["Industry"], prefix='Industry')
h = pd.get_dummies(data["Business_Type__c"], prefix='Business_Type')
i = pd.get_dummies(data["RecordType"], prefix='RecordType_mixed')
j = pd.get_dummies(data["Is_External__c"], prefix='Is_External')

org = data[["StageName","Parent_Opportunity__c","ParentId"]]

scale1 = data["CreatedDate"].div(2020)
scale2 = data["CreatedDate_month"].div(12)

df = pd.concat([org,scale1,scale2,a,b,c,d,e,f,g,h,i,j], axis=1, ignore_index=True)

In [5]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2189,2190,2191,2192,2193,2194,2195,2196,2197,2198
0,0,0,0,0.999505,0.916667,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,1,0,1,0.999010,0.916667,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,1,0,1,0.999010,0.916667,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
3,1,0,0,0.999505,0.916667,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,1,0,1,0.998515,0.916667,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6047,1,1,0,1.000000,0.583333,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
6048,0,0,0,1.000000,0.583333,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6049,1,0,0,1.000000,0.583333,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6050,1,0,0,1.000000,0.583333,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# split xy
dflist = df.values.tolist()
random.shuffle(dflist)
x = torch.Tensor([i[1:] for i in dflist])
y = torch.Tensor([i[1] for i in dflist])

In [58]:
# train/dev split
batch_size = 64

from sklearn.model_selection import train_test_split
x_train, x_dev, y_train, y_dev = train_test_split(x, y, test_size=0.33, random_state=42)

train_loader = torch.utils.data.DataLoader(dataset=tuple(zip(x_train,y_train)), batch_size=batch_size, shuffle=True)
dev_loader = torch.utils.data.DataLoader(dataset=tuple(zip(x_dev,y_dev)), batch_size=batch_size, shuffle=False)

## 2. Neural Network Model

In [84]:
class Net(torch.nn.Module):

    def __init__(self, n_features, n_classes):
        super(Net, self).__init__()
        self.layer1 = torch.nn.Linear(n_features,2)
        self.layer2 = torch.nn.Linear(2, n_classes)
        self.out_act = torch.nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = self.layer2(x)
        out = self.out_act(x)
        return out

In [86]:
class Net(torch.nn.Module):
    
    def __init__(self, n_features, n_classes):
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(n_features, 50)
        self.relu1 = torch.nn.ReLU()
        self.dout = torch.nn.Dropout(0.2)
        self.fc2 = torch.nn.Linear(50, 100)
        self.prelu = torch.nn.PReLU(1)
        self.out = torch.nn.Linear(100, n_classes)
        self.out_act = torch.nn.Sigmoid()
        
    def forward(self, x):
        a1 = self.fc1(x)
        h1 = self.relu1(a1)
        dout = self.dout(h1)
        a2 = self.fc2(dout)
        h2 = self.prelu(a2)
        a3 = self.out(h2)
        y = self.out_act(a3)
        return y

In [117]:
def test_net(test_loader, epoch):
    correct = 0
    total = 0
    test_loss = 0.
    test_acc = 0.
    
    for features, labels in test_loader:
        features = Variable(features)
        labels = Variable(labels)
        
        outputs = model(features)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        
        acc = torch.mean(torch.eq(torch.round(outputs), labels).float()).item()
        test_acc += acc
    
    return test_loss/len(test_loader), test_acc/len(test_loader)


def train_net(train_loader,dev_loader):
    train_rolling_loss = []
    dev_rolling_loss = []
    train_rolling_acc = []
    dev_rolling_acc = []

    for epoch in tqdm(range(int(epochs))):
        train_loss = 0.
        train_acc = 0.
        
        for i, (features, labels) in enumerate(train_loader):
            features = Variable(features)
            labels = Variable(labels)
            
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
            acc = torch.mean(torch.eq(torch.round(outputs), labels).float()).item()
            train_acc += acc
            
        #print(train_acc, len(train_loader))
        
        train_rolling_acc.append(train_acc/len(train_loader))
        train_rolling_loss.append(train_loss/len(train_loader))

        ## calculating loss on dev set
        
        dev_loss,dev_acc = test_net(dev_loader, epoch)
        dev_rolling_loss.append(dev_loss)
        dev_rolling_acc.append(dev_acc)
        
        
    print("Train Accuracy: ",train_rolling_acc[-1],", Dev Accuracy: ",dev_rolling_acc[-1])
    print("Train Loss: ",train_rolling_loss[-1],", Dev Loss: ",dev_rolling_loss[-1])
    
    # plotting graphs on train loss and dev loss
    #plot_train_dev(train_rolling_loss,dev_rolling_loss,"loss")
    #plot_train_dev(train_rolling_acc,dev_rolling_acc,"accuracy")
    
    #plot_train_dev(train_rolling_loss,train_rolling_acc,"loss/accu")
    
    return train_rolling_acc[-1],dev_rolling_acc[-1]
    

In [119]:
#### RUN MODEL
epochs = 50

model = Net(int(x_train.shape[1]), 1) # n_features, n_classes
criterion = torch.nn.BCELoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)

train_net(train_loader,dev_loader)

                                               

Train Accuracy:  0.916216777507649 , Dev Accuracy:  0.91357421875
Train Loss:  0.1861496042286934 , Dev Loss:  0.18803891714196652




(0.916216777507649, 0.91357421875)

In [111]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10)

total_acc = []
for fold, (train_index, test_index) in enumerate(kfold.split(x, y)):
        ### Dividing data into folds
    x_train_fold = x[train_index]
    x_test_fold = x[test_index]
    y_train_fold = y[train_index]
    y_test_fold = y[test_index]

    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    test = torch.utils.data.TensorDataset(x_test_fold, y_test_fold)
    train_loader = torch.utils.data.DataLoader(train, batch_size = batch_size, shuffle = False)
    test_loader = torch.utils.data.DataLoader(test, batch_size = batch_size, shuffle = False)
    
    model = Net(int(x_train.shape[1]), 1) # n_features, n_classes
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
    
    train_accu, dev_accu = train_net(train_loader,dev_loader)
    total_acc.append(dev_accu)
    


  0%|          | 0/50 [00:00<?, ?it/s]         

Train Accuracy:  0.891124636627907 , Dev Accuracy:  0.90472412109375
Train Loss:  0.20644757220911425 , Dev Loss:  0.17698822845704854


  0%|          | 0/50 [00:00<?, ?it/s]         

Train Accuracy:  0.8997433684593024 , Dev Accuracy:  0.911651611328125
Train Loss:  0.2118855356130489 , Dev Loss:  0.18641184456646442


  0%|          | 0/50 [00:00<?, ?it/s]         

Train Accuracy:  0.9009681271259175 , Dev Accuracy:  0.9090423583984375
Train Loss:  0.19893749118890874 , Dev Loss:  0.18019701819866896


  0%|          | 0/50 [00:00<?, ?it/s]         

Train Accuracy:  0.8959376947131268 , Dev Accuracy:  0.9100799560546875
Train Loss:  0.21650379211750143 , Dev Loss:  0.1904235442634672


  0%|          | 0/50 [00:00<?, ?it/s]         

Train Accuracy:  0.8972719516171965 , Dev Accuracy:  0.9127960205078125
Train Loss:  0.21285619785965876 , Dev Loss:  0.18703452590852976


  0%|          | 0/50 [00:00<?, ?it/s]         

Train Accuracy:  0.8974138938410338 , Dev Accuracy:  0.9127960205078125
Train Loss:  0.21988790759513543 , Dev Loss:  0.19153769011609256


  0%|          | 0/50 [00:00<?, ?it/s]         

Train Accuracy:  0.8945409832305686 , Dev Accuracy:  0.911285400390625
Train Loss:  0.2145338446594948 , Dev Loss:  0.1883515384979546


  0%|          | 0/50 [00:00<?, ?it/s]         

Train Accuracy:  0.9058395842480105 , Dev Accuracy:  0.9093780517578125
Train Loss:  0.199056031269043 , Dev Loss:  0.185287645785138


  0%|          | 0/50 [00:00<?, ?it/s]         

Train Accuracy:  0.9114150748003361 , Dev Accuracy:  0.91357421875
Train Loss:  0.21587678634150084 , Dev Loss:  0.19971789815463126


                                               

Train Accuracy:  0.9154900333216024 , Dev Accuracy:  0.912811279296875
Train Loss:  0.19219747787817967 , Dev Loss:  0.19306443678215146


