In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [7]:
requests = pd.read_csv('requests.csv')
failures = pd.read_csv('failure.csv')
requests.head()

Unnamed: 0,Row,Country,Gender,Age,Income,Client_Ip,Date
0,1,Venezuela,Female,0-16,10k-20k,11.128.127.220,10/29/23 2:00
1,2,Venezuela,Female,0-16,10k-20k,11.128.127.220,10/29/23 2:00
2,3,Qatar,Male,46-55,20k-40k,96.212.217.245,10/29/23 9:00
3,4,Qatar,Male,46-55,20k-40k,96.212.217.245,10/29/23 9:00
4,5,Trinidad and Tobago,Male,17-25,60k-100k,149.121.160.201,10/29/23 16:00


In [8]:
requests.dropna()
failures.dropna()
requests.drop(['Row'], axis=1, inplace=True)
# append failure file to requests file
files = failures['File']
requests['File'] = files
requests.drop_duplicates(inplace=True)
print(requests.shape)

(60497, 7)


In [9]:
countries = requests['Country']
client_ip = requests['Client_Ip']

def ip_to_number(ip):
    parts = ip.split('.')
    return int(parts[0]) * (256 * 256 * 256) + int(parts[1]) * (256 * 256) + int(parts[2]) * 256 + int(parts[3])

client_ip = client_ip.apply(lambda x: int(''.join([i for i in x if i.isdigit()])))

X_train, X_test, y_train, y_test = train_test_split(client_ip, countries, test_size=0.2)

# reshape to 2D array
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)

# Use a decision tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print(f"Accuracy: {clf.score(X_test, y_test)}")

#import random forest classifier
country_model = RandomForestClassifier(n_estimators=100, random_state=0)
country_model.fit(X_train, y_train)
print(f"Accuracy: {country_model.score(X_test, y_test)}")

Accuracy: 0.9961157024793389
Accuracy: 0.9961157024793389


### Income Model

In [103]:
features = requests.drop(['Date', 'Income', 'File', 'Client_Ip'], axis=1)
income = requests['Income']

from sklearn.preprocessing import LabelEncoder
# use LabelEncoder to convert categorical data to numeric
for col in features.columns:
    if features[col].dtype == type(object):
        le = LabelEncoder()
        features[col] = le.fit_transform(features[col])
for col in features.columns:
    features[col] = features[col].astype('category')
income = LabelEncoder().fit_transform(income)


# features['Country'] = pd.Categorical(features['Country']).codes
# features['Gender'] = pd.Categorical(features['Gender']).codes
# features['Age'] = pd.Categorical(features['Age']).codes
# income = pd.Categorical(income).codes

X_train, X_test, y_train, y_test = train_test_split(features, income, test_size=0.2)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.2)

print(X_train.shape)
print(X_train)
print(len(X_train['Country'].unique()))
#Use a decision tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print(f"Accuracy: {clf.score(X_test, y_test)}")

(38717, 3)
      Country Gender Age
45602     154      1   5
66250      70      0   1
28214      72      1   1
90359     174      0   5
59124     129      0   5
...       ...    ...  ..
41522     131      0   7
20495     182      1   0
39409     138      0   7
35144     138      0   4
47570     178      0   7

[38717 rows x 3 columns]
197
Accuracy: 0.22520661157024793


In [85]:
# categorical embedding for columns with more than 2 unique values
embedded_cols = {n: len(col.cat.categories) for n, col in features.items() if len(col.cat.categories) > 2}
embedded_cols_names = embedded_cols.keys()
len(features.columns) - len(embedded_cols_names)

# embedding sizes
embedded_size = [(n_categories, min(50, (n_categories + 1) // 2)) for _, n_categories in embedded_cols.items()]
embedded_size

[(197, 50), (8, 4)]

In [88]:
import torch 
import torchvision
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch import nn, optim

In [89]:
class CountriesDataset(Dataset):
    def __init__(self, X, y, embedded_col_names):
        X1 = X.copy()
        self.X1 = X.loc[:, embedded_col_names].copy().values.astype(np.int64)
        self.X2 = X.drop(columns=embedded_col_names).copy().values.astype(np.float32)
        self.y = y
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return [self.X1[idx], self.X2[idx]], self.y[idx]
train_ds = CountriesDataset(X_train, y_train, embedded_cols_names)
valid_ds = CountriesDataset(X_validation, y_validation, embedded_cols_names)

In [95]:
class CountriesOutcomeModel(nn.Module):
    def __init__(self, embedded_size, n_cont):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(categories, size) for categories, size in embedded_size])
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb, self.n_cont = n_emb, n_cont
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 100)
        self.lin2 = nn.Linear(100, 50)
        self.lin3 = nn.Linear(50, 2)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(100)
        self.bn3 = nn.BatchNorm1d(50)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)
    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeds)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)
        return x
    
model = CountriesOutcomeModel(embedded_size, 1)

In [None]:
def get_optimzer(model, lr = 0.001, wd = 0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimzer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optimzer

def train_model(model, optimizer, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = y.shape[0]
        output = model(x[0], x[1])
        loss = F.cross_entropy(output, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

def val_loss(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for x1, x2, y in valid_dl:
        current_batch_size = y.shape[0]
        out = model(x1, x2)
        loss = F.cross_entropy(out, y)
        sum_loss += current_batch_size*(loss.item())
        total += current_batch_size
        pred = torch.max(out, 1)[1]
        correct += (pred == y).float().sum().item()
    print("valid loss %.3f and accuracy %.3f" % (sum_loss/total, correct/total))
    return sum_loss/total, correct/total


batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimzer(model, lr = lr, wd = wd)
    for i in range(epochs):
        loss = train_model(model, optim, train_dl)
        print("training loss %.3f" % loss)
        val_loss(model, valid_dl)
train_loop(model, epochs=100, lr=0.01, wd=0.0001)


In [65]:
class Dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = Dataset(X_train, y_train)
val_dataset = Dataset(X_validation, y_validation)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=True)

print(len(train_loader))
print(train_loader.dataset)

605
<__main__.Dataset object at 0x16800c950>


In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(5, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 2)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.softmax(self.fc3(x), dim=1)

model = Net()
criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 100 
train_losses, val_losses = [], []
elem1 = train_loader.dataset.X

for e in range(epochs):
    running_loss = 0
    for X, y in train_loader:
        optimizer.zero_grad() 
        output = model(X.float())
        y = y.long()

        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    else:
        val_loss = 0
        accuracy = 0
    
        with torch.no_grad():
            model.eval()
            for X, y in val_loader:
                output = model(X.float())
                val_loss += criterion(output, y)
                
                top_p, top_class = output.topk(1, dim=1)
                equals = top_class == y.view(*top_class.shape)
                accuracy += torch.mean(equals.type(torch.FloatTensor))
        
        model.train()
        
        train_losses.append(running_loss/len(train_loader))
        val_losses.append(val_loss/len(val_loader))
        
        print("Epoch: {}/{}.. ".format(e+1, epochs),
             "Training Loss: {:.3f}.. ".format(running_loss/len(train_loader)),
             "Validation Loss: {:.3f}.. ".format(val_loss/len(val_loader)),
             "Validation Accuracy: {:.3f}".format(accuracy/len(val_loader)))

plt.plot(train_losses, label='Training loss')
plt.plot(val_losses, label='Validation loss')
plt.legend(frameon=False)