In [24]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RepeatedKFold

In [25]:
df = pd.read_csv("fake_job_postings_most_freq_text_cleaned.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,c_title,c_benefits,c_requirements,c_company_profile,c_description,location,telecommuting,has_company_logo,has_questions,employment_type,required_experience,industry,function,fraudulent
0,0,marketing intern,see job description,experience content management system major plu...,food created groundbreaking award winning cook...,food fast growing james beard award winning on...,"US, NY, New York",0,1,0,Other,Internship,Information Technology and Services,Marketing,0
1,1,customer service cloud video production,get usthrough part second team gain experience...,expect key responsibility communicate client s...,second world cloud video production service se...,organised focused vibrant awesome passion cust...,"NZ, , Auckland",0,1,0,Full-time,Not Applicable,Marketing and Advertising,Customer Service,0
2,2,commissioning machinery assistant cma,see job description,implement pre commissioning commissioning proc...,valor service provides workforce solution meet...,client located houston actively seeking experi...,"US, IA, Wever",0,1,0,Full-time,Mid-Senior level,Information Technology and Services,Information Technology,0
3,3,account executive washington dc,culture anything corporate—we collaborative cr...,education bachelor ’ master ’ gi business admi...,passion improving quality life geography heart...,company esri – environmental system research i...,"US, DC, Washington",0,1,0,Full-time,Mid-Senior level,Computer Software,Sales,0
4,4,bill review manager,full benefit offered,qualification rn license state texasdiploma ba...,spotsource solution llc global human capital m...,job title itemization review managerlocation f...,"US, FL, Fort Worth",0,1,1,Full-time,Mid-Senior level,Hospital & Health Care,Health Care Provider,0


In [26]:
cols = ["c_title", "c_company_profile", "c_description", "c_requirements", "c_benefits"]
for c in cols:
    df[c] = df[c].fillna("") #replace nan

def extract_features(df):    
    for c in cols:
        #df[c+"_len"] = df[c].apply(lambda x : len(str(x))) #文檔個數
        df[c+"_wc"] = df[c].apply(lambda x : len(str(x.split()))) #詞數

    
extract_features(df)

In [27]:
df['combined_text'] = df['c_company_profile'] + " " + df['c_description'] + " " + df['c_requirements'] + " " + df['c_benefits']
#
n_features = {
    "c_title" : 100,
    "combined_text" : 500
}

for c, n in n_features.items(): #key, value
    #如果将 n 传递给 max_features，则意味着从文本文档中最常见的 n 个单词中创建一个特征矩阵。
    tfidf = TfidfVectorizer(max_features=n, norm='l2', stop_words = 'english') #max_features?
    tfidf.fit(df[c])
    tfidf_train = np.array(tfidf.transform(df[c]).toarray(), dtype=np.float16)

    for i in range(n_features[c]):#循環n次,每行有100+500個feature
        df[c + '_tfidf_' + str(i)] = tfidf_train[:, i]

In [28]:
cat_cols = ["employment_type", "required_experience","industry", "function"]
for c in cat_cols:
    encoded = pd.get_dummies(df[c]) #變成虛擬或者指標變量
    encoded.rename(columns=lambda x:x+c, inplace=True)
    df = pd.concat([df, encoded], axis=1) #新feature添加進去

In [29]:
drop_cols = ['c_title', 'location','c_company_profile', 'c_description', 'c_requirements', 'c_benefits', 'combined_text']
drop_cols += cat_cols
df = df.drop(drop_cols, axis = 1)
df.head()

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,fraudulent,c_title_wc,c_company_profile_wc,c_description_wc,c_requirements_wc,c_benefits_wc,...,Public Relationsfunction,Purchasingfunction,Quality Assurancefunction,Researchfunction,Salesfunction,Sciencefunction,Strategy/Planningfunction,Supply Chainfunction,Trainingfunction,Writing/Editingfunction
0,0,0,1,0,0,23,896,944,893,29,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,55,1192,2092,1424,1111,...,0,0,0,0,0,0,0,0,0,0
2,2,0,1,0,0,50,847,356,1471,29,...,0,0,0,0,0,0,0,0,0,0
3,3,0,1,0,0,44,637,2734,1563,909,...,0,0,0,0,1,0,0,0,0,0
4,4,0,1,1,0,29,1779,1604,850,30,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# normalization
cols = ["c_title_wc", "c_company_profile_wc", "c_description_wc", "c_requirements_wc", "c_benefits_wc"]
#dd = df
for c in cols:
    df[c] =  (df[c] - df[c].min()) / (df[c].max() - df[c].min())
#norm_duration = (df - data.duration.min()) / (data.duration.max() - data.duration.min())
df

Unnamed: 0.1,Unnamed: 0,telecommuting,has_company_logo,has_questions,fraudulent,c_title_wc,c_company_profile_wc,c_description_wc,c_requirements_wc,c_benefits_wc,...,Public Relationsfunction,Purchasingfunction,Quality Assurancefunction,Researchfunction,Salesfunction,Sciencefunction,Strategy/Planningfunction,Supply Chainfunction,Trainingfunction,Writing/Editingfunction
0,0,0,1,0,0,0.087912,0.140381,0.059421,0.081088,0.006138,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0.263736,0.187017,0.131836,0.129414,0.252103,...,0,0,0,0,0,0,0,0,0,0
2,2,0,1,0,0,0.236264,0.132661,0.022330,0.133691,0.006138,...,0,0,0,0,0,0,0,0,0,0
3,3,0,1,0,0,0.203297,0.099575,0.172333,0.142064,0.206183,...,0,0,0,0,1,0,0,0,0,0
4,4,0,1,1,0,0.120879,0.279502,0.101053,0.077175,0.006365,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17875,0,1,1,0,0.175824,0.266583,0.085851,0.117947,0.162310,...,0,0,0,0,1,0,0,0,0,0
17876,17876,0,1,1,0,0.098901,0.345833,0.084148,0.065071,0.138668,...,0,0,0,0,0,0,0,0,0,0
17877,17877,0,0,0,0,0.417582,0.038286,0.090015,0.105024,0.006138,...,0,0,0,0,0,0,0,0,0,0
17878,17878,0,0,1,0,0.087912,0.009926,0.030089,0.046414,0.048875,...,0,0,0,0,0,0,0,0,0,0


In [31]:
df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]#
#The isalnum() method returns True if all characters in the string are alphanumeric (either alphabets or numbers). If not, it returns False.

idd, target = "Unnamed__0", "fraudulent"
features = [f for f in df.columns if f not in [idd, target]]

X = df[features]
y = df[target]
X

Unnamed: 0,telecommuting,has_company_logo,has_questions,c_title_wc,c_company_profile_wc,c_description_wc,c_requirements_wc,c_benefits_wc,c_title_tfidf_0,c_title_tfidf_1,...,Public_Relationsfunction,Purchasingfunction,Quality_Assurancefunction,Researchfunction,Salesfunction,Sciencefunction,Strategy_Planningfunction,Supply_Chainfunction,Trainingfunction,Writing_Editingfunction
0,0,1,0,0.087912,0.140381,0.059421,0.081088,0.006138,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0.263736,0.187017,0.131836,0.129414,0.252103,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0.236264,0.132661,0.022330,0.133691,0.006138,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0.203297,0.099575,0.172333,0.142064,0.206183,0.0,0.700684,...,0,0,0,0,1,0,0,0,0,0
4,0,1,1,0.120879,0.279502,0.101053,0.077175,0.006365,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,0,1,1,0.175824,0.266583,0.085851,0.117947,0.162310,0.0,0.698242,...,0,0,0,0,1,0,0,0,0,0
17876,0,1,1,0.098901,0.345833,0.084148,0.065071,0.138668,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
17877,0,0,0,0.417582,0.038286,0.090015,0.105024,0.006138,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0
17878,0,0,1,0.087912,0.009926,0.030089,0.046414,0.048875,0.0,0.000000,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in df.columns]#
#The isalnum() method returns True if all characters in the string are alphanumeric (either alphabets or numbers). If not, it returns False.
for f in df.columns:
    print(f)
idd, target = "Unnamed__0", "fraudulent"
features = [f for f in df.columns if f not in [idd, target]]

X = df[features]
y = df[target]
'''
kf = RepeatedKFold(n_splits=3, n_repeats=1, random_state=0) #
auc_buf = []   
cnt = 0
for train_index, valid_index in kf.split(X):
    print('Fold {}'.format(cnt + 1))

    train_x,train_y = X.loc[train_index], y.loc[train_index]
    test_x, test_y = X.loc[valid_index], y.loc[valid_index]
    
    clf = LogisticRegression(max_iter = 5000).fit(train_x, train_y)
    preds = clf.predict(test_x)
    
    auc = roc_auc_score(test_y, preds)
    print('{} AUC: {}'.format(cnt, auc))
    auc_buf.append(auc)

    cnt += 1

auc_mean = np.mean(auc_buf)
auc_std = np.std(auc_buf)
print('AUC = {:.6f} +/- {:.6f}'.format(auc_mean, auc_std))
'''

Unnamed__0
telecommuting
has_company_logo
has_questions
fraudulent
c_title_wc
c_company_profile_wc
c_description_wc
c_requirements_wc
c_benefits_wc
c_title_tfidf_0
c_title_tfidf_1
c_title_tfidf_2
c_title_tfidf_3
c_title_tfidf_4
c_title_tfidf_5
c_title_tfidf_6
c_title_tfidf_7
c_title_tfidf_8
c_title_tfidf_9
c_title_tfidf_10
c_title_tfidf_11
c_title_tfidf_12
c_title_tfidf_13
c_title_tfidf_14
c_title_tfidf_15
c_title_tfidf_16
c_title_tfidf_17
c_title_tfidf_18
c_title_tfidf_19
c_title_tfidf_20
c_title_tfidf_21
c_title_tfidf_22
c_title_tfidf_23
c_title_tfidf_24
c_title_tfidf_25
c_title_tfidf_26
c_title_tfidf_27
c_title_tfidf_28
c_title_tfidf_29
c_title_tfidf_30
c_title_tfidf_31
c_title_tfidf_32
c_title_tfidf_33
c_title_tfidf_34
c_title_tfidf_35
c_title_tfidf_36
c_title_tfidf_37
c_title_tfidf_38
c_title_tfidf_39
c_title_tfidf_40
c_title_tfidf_41
c_title_tfidf_42
c_title_tfidf_43
c_title_tfidf_44
c_title_tfidf_45
c_title_tfidf_46
c_title_tfidf_47
c_title_tfidf_48
c_title_tfidf_49
c_title_tfid

0 AUC: 0.7587314439946019
Fold 2
1 AUC: 0.7622058018609743
Fold 3
2 AUC: 0.7407597628040082
AUC = 0.753899 +/- 0.009398


In [60]:
df_float = df
for col in df_float.columns:
    df_float[col] = df_float[col].astype(float)
#df['DataFrame Column'] = df['DataFrame Column'].astype(double)
df_float.dtypes

Unnamed__0                   float64
telecommuting                float64
has_company_logo             float64
has_questions                float64
fraudulent                   float64
                              ...   
Sciencefunction              float64
Strategy_Planningfunction    float64
Supply_Chainfunction         float64
Trainingfunction             float64
Writing_Editingfunction      float64
Length: 790, dtype: object

In [61]:
df_float.head()

Unnamed: 0,Unnamed__0,telecommuting,has_company_logo,has_questions,fraudulent,c_title_wc,c_company_profile_wc,c_description_wc,c_requirements_wc,c_benefits_wc,...,Public_Relationsfunction,Purchasingfunction,Quality_Assurancefunction,Researchfunction,Salesfunction,Sciencefunction,Strategy_Planningfunction,Supply_Chainfunction,Trainingfunction,Writing_Editingfunction
0,0.0,0.0,1.0,0.0,0.0,0.087912,0.140381,0.059421,0.081088,0.006138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.263736,0.187017,0.131836,0.129414,0.252103,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2.0,0.0,1.0,0.0,0.0,0.236264,0.132661,0.02233,0.133691,0.006138,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,1.0,0.0,0.0,0.203297,0.099575,0.172333,0.142064,0.206183,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,4.0,0.0,1.0,1.0,0.0,0.120879,0.279502,0.101053,0.077175,0.006365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[4]") \
    .appName("svm_1") \
    .getOrCreate()

sparkDF=spark.createDataFrame(df_float) 
sparkDF.printSchema()
#sparkDF.show()

root
 |-- Unnamed__0: double (nullable = true)
 |-- telecommuting: double (nullable = true)
 |-- has_company_logo: double (nullable = true)
 |-- has_questions: double (nullable = true)
 |-- fraudulent: double (nullable = true)
 |-- c_title_wc: double (nullable = true)
 |-- c_company_profile_wc: double (nullable = true)
 |-- c_description_wc: double (nullable = true)
 |-- c_requirements_wc: double (nullable = true)
 |-- c_benefits_wc: double (nullable = true)
 |-- c_title_tfidf_0: double (nullable = true)
 |-- c_title_tfidf_1: double (nullable = true)
 |-- c_title_tfidf_2: double (nullable = true)
 |-- c_title_tfidf_3: double (nullable = true)
 |-- c_title_tfidf_4: double (nullable = true)
 |-- c_title_tfidf_5: double (nullable = true)
 |-- c_title_tfidf_6: double (nullable = true)
 |-- c_title_tfidf_7: double (nullable = true)
 |-- c_title_tfidf_8: double (nullable = true)
 |-- c_title_tfidf_9: double (nullable = true)
 |-- c_title_tfidf_10: double (nullable = true)
 |-- c_title_tfidf_

In [66]:
#Library that contains the functions for building vectors
from pyspark.ml.linalg import Vectors  
from pyspark.ml.feature import VectorAssembler 

In [80]:
#Created the feature vector

idd, target = "Unnamed__0", "fraudulent"
features = [f for f in df.columns if f not in [idd, target]]

vector_assembler = VectorAssembler(inputCols=["telecommuting","has_company_logo","c_company_profile_wc"], outputCol="features")
df_temp = vector_assembler.transform(sparkDF)
df_temp.show(5)

+----------+-------------+----------------+-------------+----------+-------------------+--------------------+--------------------+-------------------+--------------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+------

In [81]:
df_temp.printSchema()

root
 |-- Unnamed__0: double (nullable = true)
 |-- telecommuting: double (nullable = true)
 |-- has_company_logo: double (nullable = true)
 |-- has_questions: double (nullable = true)
 |-- fraudulent: double (nullable = true)
 |-- c_title_wc: double (nullable = true)
 |-- c_company_profile_wc: double (nullable = true)
 |-- c_description_wc: double (nullable = true)
 |-- c_requirements_wc: double (nullable = true)
 |-- c_benefits_wc: double (nullable = true)
 |-- c_title_tfidf_0: double (nullable = true)
 |-- c_title_tfidf_1: double (nullable = true)
 |-- c_title_tfidf_2: double (nullable = true)
 |-- c_title_tfidf_3: double (nullable = true)
 |-- c_title_tfidf_4: double (nullable = true)
 |-- c_title_tfidf_5: double (nullable = true)
 |-- c_title_tfidf_6: double (nullable = true)
 |-- c_title_tfidf_7: double (nullable = true)
 |-- c_title_tfidf_8: double (nullable = true)
 |-- c_title_tfidf_9: double (nullable = true)
 |-- c_title_tfidf_10: double (nullable = true)
 |-- c_title_tfidf_

In [82]:
from pyspark.sql.functions import col
df_formatted = df_temp.select(col("fraudulent"),col("features"))
df_formatted.printSchema()
df_formatted.show(5)

root
 |-- fraudulent: double (nullable = true)
 |-- features: vector (nullable = true)

+----------+--------------------+
|fraudulent|            features|
+----------+--------------------+
|       0.0|[0.0,1.0,0.140381...|
|       0.0|[0.0,1.0,0.187017...|
|       0.0|[0.0,1.0,0.132661...|
|       0.0|[0.0,1.0,0.099574...|
|       0.0|[0.0,1.0,0.279502...|
+----------+--------------------+
only showing top 5 rows



In [72]:
df_formatted.show(5)

+----------+--------------------+
|fraudulent|            features|
+----------+--------------------+
|       0.0|(788,[1,3,4,5,6,7...|
|       0.0|(788,[1,3,4,5,6,7...|
|       0.0|(788,[1,3,4,5,6,7...|
|       0.0|(788,[1,3,4,5,6,7...|
|       0.0|(788,[1,2,3,4,5,6...|
+----------+--------------------+
only showing top 5 rows



In [83]:
df_SVM=df_formatted.selectExpr('features',"fraudulent as label")
df_SVM.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,1.0,0.140381...|  0.0|
|[0.0,1.0,0.187017...|  0.0|
|[0.0,1.0,0.132661...|  0.0|
|[0.0,1.0,0.099574...|  0.0|
|[0.0,1.0,0.279502...|  0.0|
+--------------------+-----+
only showing top 5 rows



In [84]:
#Splits between training and testing data
(train, test) = df_SVM.randomSplit([0.7, 0.3])
train.show(10)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.0,0.0,0.005356...|  0.0|
|[0.0,0.0,0.009925...|  0.0|
|[0.0,0.0,0.009925...|  0.0|
|[0.0,0.0,0.009925...|  0.0|
|[0.0,0.0,0.009925...|  0.0|
|[0.0,0.0,0.009925...|  0.0|
|[0.0,0.0,0.009925...|  0.0|
|[0.0,0.0,0.009925...|  0.0|
|[0.0,0.0,0.009925...|  0.0|
|[0.0,0.0,0.009925...|  0.0|
+--------------------+-----+
only showing top 10 rows



4) Defines the SVM Model

In [85]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel  #Library for SVM Model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator  #Used to find performance metrics
from pyspark.mllib.linalg import Vectors  #Dense vectors
from pyspark.mllib.util import MLUtils
df_train = MLUtils.convertVectorColumnsFromML(train, "features")
df_test = MLUtils.convertVectorColumnsFromML(test, "features")

In [86]:
df_train.show(5,False)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[0.0,0.0,0.005356861278414726]|0.0  |
|[0.0,0.0,0.00992594938725233] |0.0  |
|[0.0,0.0,0.00992594938725233] |0.0  |
|[0.0,0.0,0.00992594938725233] |0.0  |
|[0.0,0.0,0.00992594938725233] |0.0  |
+------------------------------+-----+
only showing top 5 rows



In [87]:
from pyspark.mllib.regression import LabeledPoint  #Creates the "line" (characteristics and label) to be used

trainingData = df_train.rdd.map(lambda row:LabeledPoint(row.label,row.features))  #Apply the label to the training
testingData = df_test.rdd.map(lambda row:LabeledPoint(row.label,row.features))  #Apply the label to the test

In [88]:
for xs in trainingData.take(10):
        print(xs)

(0.0,[0.0,0.0,0.005356861278414726])
(0.0,[0.0,0.0,0.00992594938725233])
(0.0,[0.0,0.0,0.00992594938725233])
(0.0,[0.0,0.0,0.00992594938725233])
(0.0,[0.0,0.0,0.00992594938725233])
(0.0,[0.0,0.0,0.00992594938725233])
(0.0,[0.0,0.0,0.00992594938725233])
(0.0,[0.0,0.0,0.00992594938725233])
(0.0,[0.0,0.0,0.00992594938725233])
(0.0,[0.0,0.0,0.00992594938725233])


In [None]:
#Model build
modelSVM = SVMWithSGD.train(trainingData, iterations=100)

In [None]:
#Performing the prediction
labelsAndPreds = testingData.map(lambda p: (p.label, modelSVM.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(testingData.count())
print("Error in prediction: ",trainErr)