In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/Github/

/content/drive/MyDrive/Github


In [None]:
%cd "CS5488_Group_Project"

/content/drive/MyDrive/Github/CS5488_Group_Project


# pyspark SVM

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RepeatedKFold

## 1. Feature Engineering

### 1.1 Read cleaned data file

In [None]:
df = pd.read_csv("fake_job_postings_most_freq_text_cleaned.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,c_title,c_benefits,c_requirements,c_company_profile,c_description,location,telecommuting,has_company_logo,has_questions,employment_type,required_experience,industry,function,fraudulent
0,0,marketing intern,see job description,experience content management system major plu...,food created groundbreaking award winning cook...,food fast growing james beard award winning on...,"US, NY, New York",0,1,0,Other,Internship,Information Technology and Services,Marketing,0
1,1,customer service cloud video production,get usthrough part second team gain experience...,expect key responsibility communicate client s...,second world cloud video production service se...,organised focused vibrant awesome passion cust...,"NZ, , Auckland",0,1,0,Full-time,Not Applicable,Marketing and Advertising,Customer Service,0
2,2,commissioning machinery assistant cma,see job description,implement pre commissioning commissioning proc...,valor service provides workforce solution meet...,client located houston actively seeking experi...,"US, IA, Wever",0,1,0,Full-time,Mid-Senior level,Information Technology and Services,Information Technology,0
3,3,account executive washington dc,culture anything corporate—we collaborative cr...,education bachelor ’ master ’ gi business admi...,passion improving quality life geography heart...,company esri – environmental system research i...,"US, DC, Washington",0,1,0,Full-time,Mid-Senior level,Computer Software,Sales,0
4,4,bill review manager,full benefit offered,qualification rn license state texasdiploma ba...,spotsource solution llc global human capital m...,job title itemization review managerlocation f...,"US, FL, Fort Worth",0,1,1,Full-time,Mid-Senior level,Hospital & Health Care,Health Care Provider,0


### 1.2 Add words# as features

In [None]:
cols = ["c_title", "c_company_profile", "c_description", "c_requirements", "c_benefits"]
for c in cols:
    df[c] = df[c].fillna("") #replace nan

def extract_features(df):    
    for c in cols:
        #df[c+"_len"] = df[c].apply(lambda x : len(str(x))) #
        df[c+"_wc"] = df[c].apply(lambda x : len(str(x.split()))) #

    
extract_features(df)

### 1.3 Different types in special columns

In [6]:
cat_cols = ["employment_type", "required_experience","industry", "function"]
#,"industry", "function"
for c in cat_cols:
    encoded = pd.get_dummies(df[c]) 
    encoded.rename(columns=lambda x:x+c, inplace=True)
    df = pd.concat([df, encoded], axis=1) 

### 1.4 Drop unnecessary columns

In [7]:
drop_cols = ['c_title', 'location','c_company_profile', 'c_description', 'c_requirements', 'c_benefits']
drop_cols += cat_cols
df = df.drop(drop_cols, axis = 1)
df.head()

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,fraudulent,c_title_wc,c_company_profile_wc,c_description_wc,c_requirements_wc,c_benefits_wc,...,Public Relationsfunction,Purchasingfunction,Quality Assurancefunction,Researchfunction,Salesfunction,Sciencefunction,Strategy/Planningfunction,Supply Chainfunction,Trainingfunction,Writing/Editingfunction
0,0,0,1,0,0,23,896,944,893,29,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,55,1192,2092,1424,1111,...,0,0,0,0,0,0,0,0,0,0
2,2,0,1,0,0,50,847,356,1471,29,...,0,0,0,0,0,0,0,0,0,0
3,3,0,1,0,0,44,637,2734,1563,909,...,0,0,0,0,1,0,0,0,0,0
4,4,0,1,1,0,29,1779,1604,850,30,...,0,0,0,0,0,0,0,0,0,0


### 1.5 Data normalization

In [8]:
# normalization
cols = ["c_title_wc", "c_company_profile_wc", "c_description_wc", "c_requirements_wc", "c_benefits_wc"]
#dd = df
for c in cols:
    df[c] =  (df[c] - df[c].min()) / (df[c].max() - df[c].min())
#norm_duration = (df - data.duration.min()) / (data.duration.max() - data.duration.min())
df

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,fraudulent,c_title_wc,c_company_profile_wc,c_description_wc,c_requirements_wc,c_benefits_wc,...,Public Relationsfunction,Purchasingfunction,Quality Assurancefunction,Researchfunction,Salesfunction,Sciencefunction,Strategy/Planningfunction,Supply Chainfunction,Trainingfunction,Writing/Editingfunction
0,0,0,1,0,0,0.087912,0.140381,0.059421,0.081088,0.006138,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0.263736,0.187017,0.131836,0.129414,0.252103,...,0,0,0,0,0,0,0,0,0,0
2,2,0,1,0,0,0.236264,0.132661,0.022330,0.133691,0.006138,...,0,0,0,0,0,0,0,0,0,0
3,3,0,1,0,0,0.203297,0.099575,0.172333,0.142064,0.206183,...,0,0,0,0,1,0,0,0,0,0
4,4,0,1,1,0,0.120879,0.279502,0.101053,0.077175,0.006365,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17875,0,1,1,0,0.175824,0.266583,0.085851,0.117947,0.162310,...,0,0,0,0,1,0,0,0,0,0
17876,17876,0,1,1,0,0.098901,0.345833,0.084148,0.065071,0.138668,...,0,0,0,0,0,0,0,0,0,0
17877,17877,0,0,0,0,0.417582,0.038286,0.090015,0.105024,0.006138,...,0,0,0,0,0,0,0,0,0,0
17878,17878,0,0,1,0,0.087912,0.009926,0.030089,0.046414,0.048875,...,0,0,0,0,0,0,0,0,0,0


### 1.6 Create TF IDF Features

Read data from results of TFIDF processing step based on Hadoop

In [9]:
tfidf_path = "TFIDF_Hadoop/output3/part-r-00000"
tfidf = pd.read_csv(tfidf_path,sep='\s+|@',names=['word', 'doc','score'])
tfidf.head()

  return func(*args, **kwargs)


Unnamed: 0,word,doc,score
0,a,15381,0.006684
1,a,5220,0.009767
2,a,12270,0.006848
3,a,13242,0.011507
4,a,10171,0.007099


Get and select the top 1000 most frequent words

In [10]:
loc=tfidf['word'].value_counts()
loc=loc[:1000].keys()
loc = loc.tolist()
loc

['experience',
 'work',
 'team',
 'job',
 'company',
 'year',
 'skill',
 'help',
 'service',
 'u',
 'amp',
 'time',
 'see',
 'business',
 'new',
 'working',
 'description',
 'required',
 'customer',
 'communication',
 'looking',
 'opportunity',
 'management',
 'degree',
 'environment',
 'position',
 'client',
 'development',
 'technology',
 'get',
 'product',
 'ability',
 'based',
 'high',
 'candidate',
 'people',
 'knowledge',
 'including',
 'solution',
 'full',
 'need',
 'well',
 'provide',
 'process',
 'professional',
 'preferred',
 'project',
 'one',
 'office',
 'strong',
 'level',
 'url',
 'industry',
 'support',
 'also',
 'responsibility',
 'quality',
 'part',
 'best',
 'world',
 'offer',
 'role',
 'make',
 'must',
 'secure',
 'growing',
 'design',
 'application',
 'system',
 'benefit',
 'day',
 'excellent',
 'great',
 'software',
 'fast',
 'manager',
 'able',
 'safe',
 'employee',
 'join',
 'requirement',
 'within',
 'written',
 'sale',
 'web',
 'training',
 'data',
 'university

In [11]:
#loc = loc.tolist()
tfidf = tfidf[tfidf['word'].isin(loc)]
#tfidf = tfidf[lambda x: x['word'] == loc]
#tfidf=tfidf.loc(tfidf['word'].isin(loc))
tfidf

Unnamed: 0,word,doc,score
1369,ability,17288,0.005571
1370,ability,17285,0.001312
1371,ability,4971,0.001959
1372,ability,3640,0.003516
1373,ability,17280,0.004771
...,...,...,...
3101059,young,5800,0.003406
3101060,young,14488,0.003705
3101061,young,3867,0.007078
3101062,young,16427,0.003388


Create a spreadsheet-style pivot table as a DataFrame from tfidf

In [12]:
tfidf = tfidf.pivot_table(index="doc", columns="word",fill_value=0).reset_index()
tfidf.columns = tfidf.columns.droplevel(0)
tfidf

word,Unnamed: 1,ability,able,abroad,access,account,accounting,accuracy,accurate,achieve,...,workplace,world,worldwide,would,write,writing,written,year,york,young
0,0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.013825,0.0
1,1,0.000752,0.001079,0.000000,0.0,0.001630,0.000000,0.000000,0.0,0.000000,...,0.000000,0.009792,0.000000,0.001609,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,2,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,3,0.000833,0.000000,0.000000,0.0,0.019844,0.000000,0.000000,0.0,0.002052,...,0.000000,0.004335,0.005209,0.000000,0.002553,0.000000,0.001249,0.000926,0.000000,0.0
4,4,0.000000,0.001628,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000632,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17852,17875,0.000831,0.000000,0.000000,0.0,0.005401,0.000000,0.000000,0.0,0.000000,...,0.002859,0.003245,0.000000,0.000000,0.000000,0.000000,0.001246,0.000000,0.000000,0.0
17853,17876,0.001839,0.001318,0.000000,0.0,0.003984,0.025059,0.006498,0.0,0.000000,...,0.000000,0.000000,0.000000,0.001967,0.000000,0.002199,0.000000,0.001022,0.000000,0.0
17854,17877,0.001616,0.002316,0.000000,0.0,0.000000,0.011007,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001796,0.000000,0.0
17855,17878,0.000000,0.009452,0.005085,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [13]:
tfidf['']=tfidf[''].astype(int)
tfidf = tfidf.rename({'': 'Unnamed: 0'}, axis=1)


Merge the features of tfidf with other features

In [14]:
df = df.merge(tfidf, on='Unnamed: 0',how='right')
df.fillna(0)
df

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,fraudulent,c_title_wc,c_company_profile_wc,c_description_wc,c_requirements_wc,c_benefits_wc,...,workplace,world,worldwide,would,write,writing,written,year,york,young
0,0,0,1,0,0,0.087912,0.140381,0.059421,0.081088,0.006138,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.013825,0.0
1,1,0,1,0,0,0.263736,0.187017,0.131836,0.129414,0.252103,...,0.000000,0.009792,0.000000,0.001609,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,2,0,1,0,0,0.236264,0.132661,0.022330,0.133691,0.006138,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,3,0,1,0,0,0.203297,0.099575,0.172333,0.142064,0.206183,...,0.000000,0.004335,0.005209,0.000000,0.002553,0.000000,0.001249,0.000926,0.000000,0.0
4,4,0,1,1,0,0.120879,0.279502,0.101053,0.077175,0.006365,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000632,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17852,17875,0,1,1,0,0.175824,0.266583,0.085851,0.117947,0.162310,...,0.002859,0.003245,0.000000,0.000000,0.000000,0.000000,0.001246,0.000000,0.000000,0.0
17853,17876,0,1,1,0,0.098901,0.345833,0.084148,0.065071,0.138668,...,0.000000,0.000000,0.000000,0.001967,0.000000,0.002199,0.000000,0.001022,0.000000,0.0
17854,17877,0,0,0,0,0.417582,0.038286,0.090015,0.105024,0.006138,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001796,0.000000,0.0
17855,17878,0,0,1,0,0.087912,0.009926,0.030089,0.046414,0.048875,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


### 1.7 Process the names of the column 

In [15]:
df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]#
#The isalnum() method returns True if all characters in the string are alphanumeric (either alphabets or numbers). If not, it returns False.

idd, target = "Unnamed__0", "fraudulent"
features = [f for f in df.columns if f not in [idd, target]]



In [16]:
#df_float = df
for col in df.columns:
    df[col] = df[col].astype(float)
df.dtypes

Unnamed__0          float64
telecommuting       float64
has_company_logo    float64
has_questions       float64
fraudulent          float64
                     ...   
writing             float64
written             float64
year                float64
york                float64
young               float64
Length: 1190, dtype: object

In [17]:
df.head()

Unnamed: 0,Unnamed__0,telecommuting,has_company_logo,has_questions,fraudulent,c_title_wc,c_company_profile_wc,c_description_wc,c_requirements_wc,c_benefits_wc,...,workplace,world,worldwide,would,write,writing,written,year,york,young
0,0.0,0.0,1.0,0.0,0.0,0.087912,0.140381,0.059421,0.081088,0.006138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013825,0.0
1,1.0,0.0,1.0,0.0,0.0,0.263736,0.187017,0.131836,0.129414,0.252103,...,0.0,0.009792,0.0,0.001609,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,1.0,0.0,0.0,0.236264,0.132661,0.02233,0.133691,0.006138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,1.0,0.0,0.0,0.203297,0.099575,0.172333,0.142064,0.206183,...,0.0,0.004335,0.005209,0.0,0.002553,0.0,0.001249,0.000926,0.0,0.0
4,4.0,0.0,1.0,1.0,0.0,0.120879,0.279502,0.101053,0.077175,0.006365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000632,0.0,0.0


In [18]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 40 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 57.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845514 sha256=66e80f6a6f9df317fdd890f7303e19c048141b3b4249744f870413c49709b40b
  Stored in directory: /root/.cache/pip/wheels/42/59/f5/79a5bf931714dcd201b26025347785f087370a10a3329a899c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


### 1.8 Create SparkSession and convert Pandas Dataframe to PySpark DataFrame

In [19]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[4]") \
    .appName("svm_1") \
    .getOrCreate()

sparkDF=spark.createDataFrame(df) 
sparkDF.printSchema()
#sparkDF.show()

root
 |-- Unnamed__0: double (nullable = true)
 |-- telecommuting: double (nullable = true)
 |-- has_company_logo: double (nullable = true)
 |-- has_questions: double (nullable = true)
 |-- fraudulent: double (nullable = true)
 |-- c_title_wc: double (nullable = true)
 |-- c_company_profile_wc: double (nullable = true)
 |-- c_description_wc: double (nullable = true)
 |-- c_requirements_wc: double (nullable = true)
 |-- c_benefits_wc: double (nullable = true)
 |-- Contractemployment_type: double (nullable = true)
 |-- Full_timeemployment_type: double (nullable = true)
 |-- Otheremployment_type: double (nullable = true)
 |-- Part_timeemployment_type: double (nullable = true)
 |-- Temporaryemployment_type: double (nullable = true)
 |-- Associaterequired_experience: double (nullable = true)
 |-- Directorrequired_experience: double (nullable = true)
 |-- Entry_levelrequired_experience: double (nullable = true)
 |-- Executiverequired_experience: double (nullable = true)
 |-- Internshiprequir

### 1.9 Created the feature vector

In [20]:
#Library that contains the functions for building vectors
from pyspark.ml.linalg import Vectors  
from pyspark.ml.feature import VectorAssembler 
#Created the feature vector

idd, target = "Unnamed__0", "fraudulent"
features = [f for f in df.columns if f not in [idd, target]]
vector_assembler = VectorAssembler(inputCols=features, outputCol="features")
#vector_assembler = VectorAssembler(inputCols=["telecommuting","has_company_logo","c_company_profile_wc"], outputCol="features")
df_temp = vector_assembler.transform(sparkDF)
df_temp.show(5)

+----------+-------------+----------------+-------------+----------+-------------------+--------------------+--------------------+-------------------+--------------------+-----------------------+------------------------+--------------------+------------------------+------------------------+----------------------------+---------------------------+------------------------------+----------------------------+-----------------------------+-----------------------------------+---------------------------------+------------------+-------------------------+--------------------------------------+-----------------+-------------------------+-------------------------------+------------------+----------------------------+---------------+---------------------+-----------------------+--------------------------+---------------------------------------+-----------------------+-----------------+-----------------------------------+-------------------------+------------------------------+--------------------

### 1.10 select label and feature columns

In [21]:
from pyspark.sql.functions import col
df_formatted = df_temp.select(col("fraudulent"),col("features"))
df_formatted.printSchema()
df_formatted.show(5)

root
 |-- fraudulent: double (nullable = true)
 |-- features: vector (nullable = true)

+----------+--------------------+
|fraudulent|            features|
+----------+--------------------+
|       0.0|(1188,[1,3,4,5,6,...|
|       0.0|(1188,[1,3,4,5,6,...|
|       0.0|(1188,[1,3,4,5,6,...|
|       0.0|(1188,[1,3,4,5,6,...|
|       0.0|(1188,[1,2,3,4,5,...|
+----------+--------------------+
only showing top 5 rows



In [22]:
df_SVM=df_formatted.selectExpr('features',"fraudulent as label")
df_SVM.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(1188,[1,3,4,5,6,...|  0.0|
|(1188,[1,3,4,5,6,...|  0.0|
|(1188,[1,3,4,5,6,...|  0.0|
|(1188,[1,3,4,5,6,...|  0.0|
|(1188,[1,2,3,4,5,...|  0.0|
+--------------------+-----+
only showing top 5 rows



### 1.11 Splits between training and testing data

In [23]:
#Splits between training and testing data
(train, test) = df_SVM.randomSplit([0.7, 0.3])
train.show(10)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(1188,[0,1,2,3,4,...|  0.0|
|(1188,[0,1,2,3,4,...|  0.0|
|(1188,[0,1,2,3,4,...|  0.0|
|(1188,[0,1,2,3,4,...|  0.0|
|(1188,[0,1,2,3,4,...|  0.0|
|(1188,[0,1,2,3,4,...|  0.0|
|(1188,[0,1,2,3,4,...|  0.0|
|(1188,[0,1,2,3,4,...|  0.0|
|(1188,[0,1,2,3,4,...|  0.0|
|(1188,[0,1,2,3,4,...|  0.0|
+--------------------+-----+
only showing top 10 rows



## 2. Defines the SVM Model

### 2.1 seperate training data and testing data

In [24]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel  #Library for SVM Model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator  #Used to find performance metrics
from pyspark.mllib.linalg import Vectors  #Dense vectors
from pyspark.mllib.util import MLUtils
df_train = MLUtils.convertVectorColumnsFromML(train, "features")
df_test = MLUtils.convertVectorColumnsFromML(test, "features")

In [25]:
df_train.show(5,False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### 2.2 Apply the label

In [26]:
from pyspark.mllib.regression import LabeledPoint  #Creates the "line" (characteristics and label) to be used

trainingData = df_train.rdd.map(lambda row:LabeledPoint(row.label,row.features))  #Apply the label to the training
testingData = df_test.rdd.map(lambda row:LabeledPoint(row.label,row.features))  #Apply the label to the test

In [27]:
for xs in trainingData.take(10):
    print(xs)

(0.0,(1188,[0,1,2,3,4,5,6,7,8,13,42,163,188,189,191,195,196,199,222,228,238,240,246,256,273,278,282,292,293,294,295,299,311,320,322,323,325,333,338,339,340,341,342,343,344,346,348,366,372,384,387,397,398,401,404,410,412,419,420,427,428,429,431,455,459,466,471,482,484,490,495,497,502,509,510,520,522,523,530,533,538,539,548,549,560,564,580,583,584,587,589,592,593,598,605,609,621,625,630,644,652,654,660,673,676,677,681,683,686,705,716,717,720,721,723,727,734,735,738,739,742,758,766,769,774,775,780,787,794,796,797,798,802,804,811,817,831,833,836,837,842,846,861,891,893,896,898,904,908,912,927,929,935,936,938,954,955,961,962,963,964,968,969,971,985,1002,1003,1019,1020,1024,1026,1027,1032,1038,1049,1053,1063,1064,1071,1074,1081,1084,1087,1095,1096,1097,1103,1108,1113,1123,1124,1129,1130,1133,1134,1138,1139,1143,1158,1162,1174,1177,1179,1181,1183],[1.0,1.0,1.0,0.22527472527472528,0.190168583582795,0.08988834920835173,0.15862759373862395,0.07183450784269152,1.0,1.0,1.0,1.0,0.00094979,0.0013613

### 2.3 Model build

In [28]:
#Model build
modelSVM = SVMWithSGD.train(trainingData, iterations=100)

### 2.4 Performing the prediction

In [29]:
#Performing the prediction
labelsAndPreds = testingData.map(lambda p: (p.label, modelSVM.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(testingData.count())
print("Error in prediction: ",trainErr)

Error in prediction:  0.05112668055292558
