In [4]:
import os
# Show where Spark is installed; this lookup raises KeyError if SPARK_HOME is not set.
print(os.environ['SPARK_HOME'])
# Directory holding the input data. Later cells build file paths by plain string
# concatenation ("file://" + dataset_path + file name), so the trailing slash matters.
dataset_path="/home/ubuntu/challenge_1/"

/usr/local/software/spark


In [4]:
#import findspark
#findspark.init()
import pyspark

In [3]:
from pyspark.sql import SparkSession

# Obtain the notebook's SparkSession: getOrCreate() returns an already running
# session when one exists, otherwise it starts one with the settings below.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Adolfo-Dataset")
    .getOrCreate()
)

In [None]:
# Display the version reported by the session created above.
spark.version

### 2. Data Loading

Data inspection shows that the data does not have a header.

In [6]:
# Ordered (column name, converter) pairs describing one input record.
# The position of a pair is the index of the comma-separated field it applies to:
# fields paired with float become numbers, fields paired with str stay text.
_RECORD_LAYOUT = (
    ("duration", float),
    ("protocol_type", str),
    ("service", str),
    ("flag", str),
    ("src_bytes", float),
    ("dst_bytes", float),
    ("land", str),
    ("wrong_fragment", float),
    ("urgent", float),
    ("hot", float),
    ("num_failed_logins", float),
    ("logged_in", str),
    ("num_compromised", float),
    ("root_shell", float),
    ("su_attempted", float),
    ("num_root", float),
    ("num_file_creations", float),
    ("num_shells", float),
    ("num_access_files", float),
    ("num_outbound_cmds", float),
    ("is_host_login", str),
    ("is_guest_login", str),
    ("count", float),
    ("srv_count", float),
    ("serror_rate", float),
    ("srv_serror_rate", float),
    ("rerror_rate", float),
    ("srv_rerror_rate", float),
    ("same_srv_rate", float),
    ("diff_srv_rate", float),
    ("srv_diff_host_rate", float),
    ("dst_host_count", float),
    ("dst_host_srv_count", float),
    ("dst_host_same_srv_rate", float),
    ("dst_host_diff_srv_rate", float),
    ("dst_host_same_src_port_rate", float),
    ("dst_host_srv_diff_host_rate", float),
    ("dst_host_serror_rate", float),
    ("dst_host_srv_serror_rate", float),
    ("dst_host_rerror_rate", float),
    ("dst_host_srv_rerror_rate", float),
    ("connection", str),
)


def readLine(line):
    """ Turn one comma-separated record of the input data into a typed Row
    Args:
        line (str): a line (row) of the input data file
    Returns:
        Row : row object with one keyword field per entry of _RECORD_LAYOUT,
        in that order (features first, the target `connection` last).
        The schema is attached here by converting each text field with the
        converter listed next to its name (float or str).
    Raises:
        IndexError : the line has fewer fields than _RECORD_LAYOUT
        ValueError : a field paired with float cannot be parsed as a number
    """
    tokens = re.split(",", line)

    # Pick out every expected position before converting anything, so a short
    # line fails with IndexError ahead of any float() call. Extra trailing
    # fields are simply never looked at.
    selected = [tokens[position] for position in range(len(_RECORD_LAYOUT))]

    # Convert in layout order; the dict keeps that order for the keyword call.
    typed = {
        name: convert(text)
        for (name, convert), text in zip(_RECORD_LAYOUT, selected)
    }
    return Row(**typed)

In [7]:
# ---------
# Option 1 : read the file as an RDD of text lines and parse each line into a
#            Row with readLine, then convert the RDD of Rows to a DataFrame
# ---------
from pyspark.sql import Row
import re

sc = spark.sparkContext
rdd = sc.textFile("file://" + dataset_path + "full.data")

# readLine is handed to map as-is; it is applied to every line of the file
df = rdd.map(readLine).toDF()

In [17]:
# ---------
# Option 2 :  use the SparkSession CSV reader and let it infer the column types.
#             The file has no header, so the columns come out auto-named
#             (_c0, _c1, ...) until a header is added.
# ---------

df2 = (
    spark.read
    .option("inferSchema", "true")
    .csv("file://" + dataset_path + "full.data")
)

In [16]:
# Print the first 5 rows of the CSV-loaded DataFrame as a text table.
df2.show(5)

+---+---+----+---+---+-----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|_c0|_c1| _c2|_c3|_c4|  _c5|_c6|_c7|_c8|_c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|   _c41|
+---+---+----+---+---+-----+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+-------+
|  0|tcp|http| SF|215|45076|  0|  0|  0|  0|   0|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   1|   1|0.00|0.00|0.00|0.00|1.00|0.00|0.00|   0|   0|0.00|0.00|0.00|0.00|0.00|0.00|0.00|0.00|normal.|
|  0|tcp|http| SF|162| 4528|  0|  0|  0|  0|   0|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   2|   2|0.00|0.00|0.00|0.00|1.00|0.00|0.00|   1|   1|1.00|0.00