○ One model should use client IP to predict the country from which the request originated. You can use any model you want but you should be able to achieve at least 99% accuracy for this exercise.

○ The second model should use any of the available fields to predict income. Once again you get to choose what kind of model you want to use. You should aim for 80+% accuracy for this second model but report any problems you run into, as fitting a model for this data is subject to the vagaries of the random seed you may have picked.

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import re

In [36]:
requests = pd.read_csv('requests.csv')
requests.head()

Unnamed: 0,Row,Country,Gender,Age,Income,Client_Ip,Date
0,1,Venezuela,Female,0-16,10k-20k,11.128.127.220,10/29/23 2:00
1,2,Venezuela,Female,0-16,10k-20k,11.128.127.220,10/29/23 2:00
2,3,Qatar,Male,46-55,20k-40k,96.212.217.245,10/29/23 9:00
3,4,Qatar,Male,46-55,20k-40k,96.212.217.245,10/29/23 9:00
4,5,Trinidad and Tobago,Male,17-25,60k-100k,149.121.160.201,10/29/23 16:00


In [48]:
failure = pd.read_csv('failure.csv')
failure.head()

Unnamed: 0,Row,Date,File,Status
0,1,10/29/23 20:00,4511.html,200
1,2,10/29/23 20:00,4511.html,200
2,3,10/29/23 15:00,3339.html,400
3,4,10/29/23 15:00,3339.html,400
4,5,10/29/23 8:00,4171.html,200


In [49]:
# cleaning data, normalization, encoding, removing useless columns
# drop rows with missing values
requests = requests.dropna()
failure = failure.dropna()

In [50]:
# we dont need the row number and date

requests = requests.drop(['Row', 'Date'], axis=1)
failure = failure.drop(['Row', 'Date'], axis=1)

KeyError: "['Row', 'Date'] not found in axis"

In [51]:
print(requests, failure, failure['Status'].unique())

                    Country    Age    Income        Client_Ip  Gender_Female  \
0                 Venezuela   0-16   10k-20k   11.128.127.220              1   
1                 Venezuela   0-16   10k-20k   11.128.127.220              1   
2                     Qatar  46-55   20k-40k   96.212.217.245              0   
3                     Qatar  46-55   20k-40k   96.212.217.245              0   
4       Trinidad and Tobago  17-25  60k-100k  149.121.160.201              0   
...                     ...    ...       ...              ...            ...   
104822              Tunisia  46-55     250k+    126.176.13.60              0   
104823                Haiti  36-45   20k-40k    10.209.232.99              1   
104824                Haiti  36-45   20k-40k    10.209.232.99              1   
104825     Marshall Islands  17-25   20k-40k    28.116.44.207              0   
104826     Marshall Islands  17-25   20k-40k    28.116.44.207              0   

        Gender_Male  
0                

In [52]:
# one hot encoding for genders in requests
requests = pd.get_dummies(requests, columns=['Gender'])

# one hot encoding for status codes in failures
failure = pd.get_dummies(failure, columns=['Status'])

KeyError: "None of [Index(['Gender'], dtype='object')] are in the [columns]"

In [53]:
requests

Unnamed: 0,Country,Age,Income,Client_Ip,Gender_Female,Gender_Male
0,Venezuela,0-16,10k-20k,11.128.127.220,1,0
1,Venezuela,0-16,10k-20k,11.128.127.220,1,0
2,Qatar,46-55,20k-40k,96.212.217.245,0,1
3,Qatar,46-55,20k-40k,96.212.217.245,0,1
4,Trinidad and Tobago,17-25,60k-100k,149.121.160.201,0,1
...,...,...,...,...,...,...
104822,Tunisia,46-55,250k+,126.176.13.60,0,1
104823,Haiti,36-45,20k-40k,10.209.232.99,1,0
104824,Haiti,36-45,20k-40k,10.209.232.99,1,0
104825,Marshall Islands,17-25,20k-40k,28.116.44.207,0,1


In [11]:
# number_of_unique_ages = len(requests['Age'].unique())
# number_of_unique_incomes = len(requests['Income'].unique())

# # digitize the age and income columns
# pattern = re.compile(r'(\d+)[-, +]')
# length_df = len(requests)
# #print(requests)

# for row in range(length_df):
#     age = requests['Age'][row]
#     income = requests['Income'][row]
#     cleaned = re.split(r"[-+]", age)
#     cleaned = [a for a in  cleaned if a != '']
#     if len(cleaned) == 1:
#         average_age = int(cleaned[0])
#     else:
#         average_age = (int(cleaned[0]) + int(cleaned[1])) / 2
#     requests['Age'][row] = average_age


In [None]:
age_bin = ['0-16', '17-25', '26-35', '36-45', '46-55', '56-65', '66-75', '76+']
age_bins = [0, 16, 25, 35, 45, 55, 65, 75, 100]
requests['Age'] = pd.cut(requests['Age'], bins=age_bins, labels=age_bin)
print(requests)

                    Country    Age    Income        Client_Ip  Gender_Female  \
0                 Venezuela   0-16   10k-20k   11.128.127.220              1   
1                 Venezuela   0-16   10k-20k   11.128.127.220              1   
2                     Qatar  46-55   20k-40k   96.212.217.245              0   
3                     Qatar  46-55   20k-40k   96.212.217.245              0   
4       Trinidad and Tobago  17-25  60k-100k  149.121.160.201              0   
...                     ...    ...       ...              ...            ...   
104822              Tunisia  46-55     250k+    126.176.13.60              0   
104823                Haiti  36-45   20k-40k    10.209.232.99              1   
104824                Haiti  36-45   20k-40k    10.209.232.99              1   
104825     Marshall Islands  17-25   20k-40k    28.116.44.207              0   
104826     Marshall Islands  17-25   20k-40k    28.116.44.207              0   

        Gender_Male  
0                

In [12]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# import re

# # Split data into features and labels
# X = requests.drop(['Country'], axis=1)
# y = requests['Country']

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
import numpy as np
from numpy.random import randint,rand,seed,normal,permutation
import torch
import torchvision
import torch.nn.functional as F
from torch.utils.data import random_split,Dataset,DataLoader
from torchvision import datasets, transforms
from torch import nn, optim

import torchvision.transforms as T
from scipy.special import softmax

In [15]:
from sklearn.datasets import make_blobs

number_of_unique_countries = len(requests['Country'].unique())
length_df = len(requests)
print(length_df, number_of_unique_countries)

Xblob, yblob = make_blobs(n_samples=length_df, centers=number_of_unique_countries, random_state=0)
Xblob = torch.tensor(Xblob).float()
yblob = torch.tensor(yblob).long()
print(Xblob.size(), yblob.size())
print(Xblob[:5], yblob[:5])

plt.figure(1)
colors = ['red', 'green', 'blue', 'yellow', 'orange', 'purple', 'pink', 'black', 'brown', 'gray']

for k, col in enumerate(colors):
    cluster_data = yblob == k
    plt.scatter(Xblob[cluster_data, 0], Xblob[cluster_data, 1], c=col, marker='o', s=10)



104827 197


In [19]:
from sklearn.model_selection import train_test_split

X = requests.drop(['Country'], axis=1)
y = requests['Country']
# 20% of the set is for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Split the 80% into 60% training and 20% validation 
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=43)
# final split is 60% training, 20% validation, 20% testing

print(X_train.shape, X_val.shape, X_test.shape)

(62895, 5) (20966, 5) (20966, 5)


In [30]:
# class IPModel(nn.Module):
#     def __init__(self):
#         super(IPModel, self).__init__()
#         self.hidden1 = nn.Linear(2, 10)
#         self.hidden2 = nn.Linear(10, 4)
    
#     def forward(self, x):
#         x = self.hidden1(x)
#         x = F.sigmoid(x)
#         x = self.hidden2(x)
#         return x
# IP_model = IPModel()
# print(IP_model)

IPModel(
  (hidden1): Linear(in_features=2, out_features=10, bias=True)
  (hidden2): Linear(in_features=10, out_features=4, bias=True)
)


In [None]:
# num_epochs = 100
# loss_function = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(IP_model.parameters(), lr=0.001)

# for epoch in (range(num_epochs)):
#     IP_model.train()
#     for i, (X, y) in enumerate(zip(Xblob, yblob)):
#         optimizer.zero_grad()
#         output = IP_model(X)
#         loss = loss_function(output, y)
#         loss.backward()
#         optimizer.step() 

In [43]:
from sklearn import tree

countries = requests['Country']
client_ip = requests['Client_Ip']

X_train, X_test, y_train, y_test = train_test_split(client_ip, countries, test_size=0.2, random_state=42)

In [47]:
clf = tree.DecisionTreeClassifier()

print(client_ip)
clf = clf.fit(X_train, y_train)

0          11.128.127.220
1          11.128.127.220
2          96.212.217.245
3          96.212.217.245
4         149.121.160.201
               ...       
104822      126.176.13.60
104823      10.209.232.99
104824      10.209.232.99
104825      28.116.44.207
104826      28.116.44.207
Name: Client_Ip, Length: 104827, dtype: object


ValueError: could not convert string to float: '125.248.87.108'