In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

## Setting the seeds for reproducibility

In [2]:
# Modules needed by this cell (re-importing an already loaded module is a no-op).
import os
import random

import numpy as np

# One seed shared by every pseudo-random source configured below.
seed_value = 0

# Python's built-in generator and NumPy's legacy global generator.
random.seed(seed_value)
np.random.seed(seed_value)

# Publish the hash seed through the environment as well.
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it here
# presumably only reaches child processes, not the running kernel -- confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)

## Creating the datasets

### Creating the training set

In [3]:
# Load the training CSV from <two levels above cwd>/preprocessing/data/6_5000/.
# The path is assembled with os.path instead of splitting/joining on '/', so it is
# also built correctly on platforms whose path separator is not '/'.
cwd = os.getcwd()
project_root = os.path.dirname(os.path.dirname(cwd))
train_file_path = os.path.join(project_root, 'preprocessing', 'data', '6_5000', 'train_6_5000.csv')
train_df = pd.read_csv(train_file_path)

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
train_df.head(n=5)

Unnamed: 0,label,byte1,byte2,byte3,byte4,byte5,byte6,byte7,byte8,byte9,...,byte1471,byte1472,byte1473,byte1474,byte1475,byte1476,byte1477,byte1478,byte1479,byte1480
0,Facebook,1,187,244,116,158,250,26,183,171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,YouTube,1,187,156,22,89,170,254,4,137,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,YouTube,1,187,236,103,231,92,105,103,177,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,YouTube,1,187,187,69,162,233,57,169,80,...,3.0,174.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Facebook,133,14,1,187,123,55,245,197,86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
## replace every label with its integer category code
train_df['label'] = pd.Categorical(train_df['label']).codes

### Creating the validation set

In [6]:
# Load the validation CSV from <two levels above cwd>/preprocessing/data/6_5000/
# (`cwd` comes from the training-set cell). os.path is used instead of splitting/joining
# on '/', so the path is also built correctly where the separator is not '/'.
validation_file_path = os.path.join(
    os.path.dirname(os.path.dirname(cwd)),
    'preprocessing', 'data', '6_5000', 'val_6_5000.csv',
)
validation_df = pd.read_csv(validation_file_path)

In [7]:
## converting the labels into integer codes that match the TRAINING encoding
# Encoding each split on its own only agrees with the training codes when the split
# contains exactly the same set of label names; a class missing from (or extra in) this
# split would silently shift the codes. The training label names are therefore re-read
# from the training CSV (train_df['label'] already holds integer codes by now) and used
# as the shared vocabulary.
if not pd.api.types.is_integer_dtype(validation_df['label']):  # already encoded on a cell re-run
    train_label_names = (
        pd.read_csv(train_file_path, usecols=['label'])['label']
        .astype('category')
        .cat.categories
    )
    validation_codes = pd.Categorical(validation_df['label'], categories=train_label_names).codes
    # Categorical marks values outside `categories` (and missing values) with code -1.
    if (validation_codes == -1).any():
        raise ValueError("validation set contains labels that are missing from the training set")
    validation_df['label'] = validation_codes

### Creating the testing set

In [8]:
# Load the test CSV from <two levels above cwd>/preprocessing/data/6_5000/
# (`cwd` comes from the training-set cell). os.path is used instead of splitting/joining
# on '/', so the path is also built correctly where the separator is not '/'.
test_file_path = os.path.join(
    os.path.dirname(os.path.dirname(cwd)),
    'preprocessing', 'data', '6_5000', 'test_6_5000.csv',
)
test_df = pd.read_csv(test_file_path)

In [9]:
## converting the labels into integer codes that match the TRAINING encoding
# Encoding each split on its own only agrees with the training codes when the split
# contains exactly the same set of label names; a class missing from (or extra in) this
# split would silently shift the codes. The training label names are therefore re-read
# from the training CSV (train_df['label'] already holds integer codes by now) and used
# as the shared vocabulary.
if not pd.api.types.is_integer_dtype(test_df['label']):  # already encoded on a cell re-run
    train_label_names = (
        pd.read_csv(train_file_path, usecols=['label'])['label']
        .astype('category')
        .cat.categories
    )
    test_codes = pd.Categorical(test_df['label'], categories=train_label_names).codes
    # Categorical marks values outside `categories` (and missing values) with code -1.
    if (test_codes == -1).any():
        raise ValueError("test set contains labels that are missing from the training set")
    test_df['label'] = test_codes

## Training the SVC using one-vs-rest

In [10]:
# Linear support-vector classifier: dual formulation switched off, at most 5000 iterations.
lin_clf = svm.LinearSVC(dual=False, max_iter=5000)

In [11]:
# Training features: every column except 'label', divided by 255.0
# (presumably scales byte values 0-255 into [0, 1] -- TODO confirm the value range).
X_train = train_df.drop(columns='label') / 255.0

In [12]:
# Training targets: the 'label' column of the training frame.
y_train = train_df.label

In [13]:
# Fit the classifier on the training features/targets; the fitted estimator is the cell output.
lin_clf.fit(X=X_train, y=y_train)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=5000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

## Evaluating the model's performance on the validation set

In [14]:
# Validation features: every column except 'label', divided by 255.0 like the training features.
X_validation = validation_df.drop(columns='label') / 255.0

In [15]:
# Validation targets: the 'label' column of the validation frame.
y_validation = validation_df.label

In [16]:
# Predicted class codes for every validation row.
y_pred_validation = lin_clf.predict(X=X_validation)

In [17]:
# Fraction of validation rows whose predicted code equals the true code.
validation_accuracy = accuracy_score(y_true=y_validation, y_pred=y_pred_validation)
print("Validation Accuracy:", validation_accuracy)

Validation Accuracy: 0.7922916666666666


## Evaluating the model's performance on the test set

In [18]:
# Test features: every column except 'label', divided by 255.0 like the training features.
X_test = test_df.drop(columns='label') / 255.0

In [19]:
# Test targets: the 'label' column of the test frame.
y_test = test_df.label

In [20]:
# Predicted class codes for every test row.
y_pred_test = lin_clf.predict(X=X_test)

In [21]:
# Fraction of test rows whose predicted code equals the true code.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.788


# Evaluating resource consumption
We will evaluate the computational efficiency of the model by measuring the amount of memory and the time needed to make a single prediction.

## Creating a sample packet

In [22]:
# Synthetic input for the resource measurements: 1480 draws from random.random()
# (floats in [0, 1)), reshaped into a single row so it can be passed to predict().
sample_packet = np.array([random.random() for _ in range(1480)]).reshape(1, -1)

## Memory

In [23]:
import tracemalloc

In [24]:
# Measure the memory traced by tracemalloc while predicting a single packet.
# get_traced_memory() returns (current, peak) sizes in bytes; both are converted to MB.
tracemalloc.start()
try:
    # Stored under its own name so the test-set predictions in `y_pred_test` are not overwritten.
    sample_prediction = lin_clf.predict(sample_packet)
    current_bytes, peak_bytes = tracemalloc.get_traced_memory()
finally:
    # Always switch tracing off, even if predict() raises, so it does not stay enabled
    # for the cells that follow.
    tracemalloc.stop()

current = current_bytes / 10**6
peak = peak_bytes / 10**6
print("The amount of memory needed to make a single preiction")
print(f"Current memory usage is {current}MB; Peak was {peak}MB")

The amount of memory needed to make a single preiction
Current memory usage is 0.00111MB; Peak was 0.013338MB


## Time


In [25]:
import time

# Time one predict() call on the sample packet.
# perf_counter() is the high-resolution clock meant for measuring short intervals;
# time.time() is wall-clock time, which can be coarse and can jump if the system clock
# is adjusted, making it unreliable for sub-millisecond measurements.
start = time.perf_counter()
# Stored under its own name so the test-set predictions in `y_pred_test` are not overwritten.
sample_prediction = lin_clf.predict(sample_packet)
end = time.perf_counter()
print("The time taken to make a prediction: {}".format((end - start)))

The time taken to make a prediction: 0.0003771781921386719
