# Projeto utilizando Clustering


### Instalação dos componentes relacionados ao Spark

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [2]:
import os
import findspark

# Point the JVM and Spark environment variables at the installed locations.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
# Hand findspark the same absolute path stored in SPARK_HOME rather than a
# relative directory name, so locating Spark does not depend on the notebook's
# current working directory.
findspark.init(os.environ["SPARK_HOME"])

## Importação das Bibliotecas

In [3]:
import pyspark
from pyspark.sql import SparkSession

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler

from pyspark.ml.clustering import KMeans

## Criando uma sessão e importando a base 

In [4]:
# Create (or reuse) the Spark session named 'clustering'.
session_builder = SparkSession.builder.appName('clustering')
spark = session_builder.getOrCreate()

In [5]:
# Location of the CSV file to analyse.
# NOTE(review): this path presumably requires Google Drive to be mounted
# beforehand; no mount step is visible in this notebook -- confirm.
PATH = '/content/drive/MyDrive/Colab Notebooks/Estudos/Spark/Clustering/base/hack_data.csv'

# Read the CSV using its header row and let Spark infer the column types.
csv_reader = spark.read.options(header = True, inferSchema = True)
dataset = csv_reader.csv(PATH)

# Display the first row as a quick sanity check.
dataset.head()

Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)

In [6]:
# Print summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = dataset.describe()
summary_stats.show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

In [7]:
# List the column names available in the loaded dataset.
dataset.columns

['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

## Transformando os dados

### Vetorizando os dados

In [8]:
# Columns fed to the model: every dataset column except 'Location'.
feat_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

# Assembler that packs the selected columns into one vector column, 'features'.
vec_assembler = VectorAssembler(outputCol = 'features', inputCols = feat_cols)

In [9]:
# Apply the assembler to the dataset, producing the 'features' vector column.
final_data = vec_assembler.transform(dataset)

### Padronizando os valores

In [10]:
# Scaler that reads 'features' and writes 'scaledFeatures', scaling by the
# standard deviation (withStd) without centring on the mean (withMean).
scaler = StandardScaler(
    withMean = False,
    withStd = True,
    inputCol = 'features',
    outputCol = 'scaledFeatures',
)

In [11]:
# Fit the scaler on the assembled feature vectors.
scalerModel = scaler.fit(final_data)

In [12]:
# Apply the fitted scaler; the result is the input for every K-Means fit below.
cluster_final_data = scalerModel.transform(final_data)

## Modelo

In [13]:
# Build K-Means estimators that split the data into 2 and 3 groups for testing.
# A fixed seed keeps the centroid initialisation, and therefore the resulting
# clusters and costs, repeatable across kernel restarts.

k_means3 = KMeans(featuresCol = 'scaledFeatures', k = 3, seed = 42)
k_means2 = KMeans(featuresCol = 'scaledFeatures', k = 2, seed = 42)

In [14]:
# Fit both estimators on the scaled data (k = 3 first, then k = 2).
model_k3, model_k2 = [
    estimator.fit(cluster_final_data)
    for estimator in (k_means3, k_means2)
]

In [15]:
# Compute the sum of squared errors of each fitted model on the scaled data.
# NOTE(review): computeCost appears to be deprecated as of Spark 2.4 -- confirm
# the replacement before moving this notebook to a newer Spark release.

wsssw_k3, wsssw_k2 = [
    fitted.computeCost(cluster_final_data)
    for fitted in (model_k3, model_k2)
]

In [16]:
# Report the cost of each model, separated by a horizontal rule.
cost_message = 'O valor da soma dos erros quadrados = {}'

print('With k=3')
print(cost_message.format(wsssw_k3))
print('-' * 60)
print('With k=2')
print(cost_message.format(wsssw_k2))

With k=3
O valor da soma dos erros quadrados = 434.75507308487647
------------------------------------------------------------
With k=2
O valor da soma dos erros quadrados = 601.7707512676716


### Analisando a melhor quantidade de clusters

In [17]:
# Fit K-Means for k = 2..9 and print the cost of each fit so the values can be
# compared. The fixed seed keeps every run repeatable across kernel restarts.
for i in range(2, 10):
    k_means = KMeans(featuresCol = 'scaledFeatures', k = i, seed = 42)
    model = k_means.fit(cluster_final_data)
    wssse = model.computeCost(cluster_final_data)

    print('With k = {}'.format(i))
    print('Within Set Sum of Squared Erros = {}'.format(wssse))
    print('-' * 60)

With k = 2
Within Set Sum of Squared Erros = 601.7707512676716
------------------------------------------------------------
With k = 3
Within Set Sum of Squared Erros = 434.75507308487647
------------------------------------------------------------
With k = 4
Within Set Sum of Squared Erros = 267.1336116887891
------------------------------------------------------------
With k = 5
Within Set Sum of Squared Erros = 252.96510328402076
------------------------------------------------------------
With k = 6
Within Set Sum of Squared Erros = 232.74699654312008
------------------------------------------------------------
With k = 7
Within Set Sum of Squared Erros = 223.33347954318356
------------------------------------------------------------
With k = 8
Within Set Sum of Squared Erros = 209.19327182734682
------------------------------------------------------------
With k = 9
Within Set Sum of Squared Erros = 203.55896242419647
------------------------------------------------------------


### Analisando a quantidade de ataques por cluster, que deve ser igual entre os hackers

In [18]:
# Build and fit K-Means models with 4 and 6 clusters on the scaled data.
# A fixed seed keeps the cluster assignments repeatable across kernel restarts.
k_means4 = KMeans(featuresCol = 'scaledFeatures', k = 4, seed = 42)
k_means6 = KMeans(featuresCol = 'scaledFeatures', k = 6, seed = 42)

model_k4 = k_means4.fit(cluster_final_data)
model_k6 = k_means6.fit(cluster_final_data)

In [19]:
# Number of rows assigned to each cluster by the k = 2 model.
assignments_k2 = model_k2.transform(cluster_final_data)
assignments_k2.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [20]:
# Number of rows assigned to each cluster by the k = 4 model.
assignments_k4 = model_k4.transform(cluster_final_data)
assignments_k4.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   79|
|         3|   84|
|         2|   88|
|         0|   83|
+----------+-----+



In [21]:
# Number of rows assigned to each cluster by the k = 6 model.
assignments_k6 = model_k6.transform(cluster_final_data)
assignments_k6.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   40|
|         3|   49|
|         5|   48|
|         4|   84|
|         2|   79|
|         0|   34|
+----------+-----+



# Resultados 
Com os resultados mostrados acima, concluímos que foram 2 hackers: com k = 2 os ataques se dividem igualmente entre os clusters (167 em cada um), o que não acontece com k = 4 nem com k = 6.