In [26]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib
import sklearn
import seaborn as sns

%matplotlib inline  
import matplotlib.pyplot as plt  
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.formula.api as smf


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [20]:
df = pd.read_csv('./../../Data/text.csv')
df.fraudulent=df.fraudulent.replace('f',0)
df.fraudulent=df.fraudulent.replace('t',1)
df

Unnamed: 0,text,fraudulent
0,food52 created groundbreaking award winning co...,0
1,90 seconds worlds cloud video production servi...,0
2,valor services provides workforce solutions me...,0
3,passion improving quality life geography heart...,0
4,spotsource solutions llc global human capital ...,0
...,...,...
17875,vend looking awesome new talent come join us w...,0
17876,weblinc e commerce platform services provider ...,0
17877,provide full time permanent positions many med...,0
17878,nemsia studios looking experienced visual grap...,0


# Bag of Words using Top Words

In [21]:
# words_in_text function that would calculatethe frequnecy of the word
def words_in_texts(words, texts):
    indicator_array = 1 * np.array([texts.str.contains(word) for word in words]).T
    return indicator_array

In [23]:
# Create bag of word dictionary

eda_ham = df.loc[df['fraudulent']==0]
eda_spam = df.loc[df['fraudulent']==1]

num_ham, num_spam = {}, {}

ham_split = eda_ham['text'].str.replace(r'/<[^>]*>/g', ' ').str.split()
spam_split = eda_spam['text'].str.replace(r'/<[^>]*>/g', ' ').str.split()

#put word frequencies in dictionaries
for i in ham_split:
    for j in i:
        if num_ham.get(j) is None:
            num_ham[j] = 1
        num_ham[j] = num_ham[j] + 1
for i in spam_split:
    for j in i:
        if num_spam.get(j) is None:
            num_spam[j] = 1
        num_spam[j] = num_spam[j] + 1
        
#sorted_ham = sorted(num_ham, key = num_ham.get, reverse = True)
sorted_spam = sorted(num_spam, key = num_spam.get, reverse = True)
print(len(sorted_spam)) #95402 pairs in the dictionary

# appended the list of words by the number occurences in the spam set 
feature = []
for i in np.arange(1200):
    feature.append(sorted_spam[i])

  ham_split = eda_ham['text'].str.replace(r'/<[^>]*>/g', ' ').str.split()
  spam_split = eda_spam['text'].str.replace(r'/<[^>]*>/g', ' ').str.split()


9728


In [27]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['fraudulent'], test_size = 0.25, random_state = 42) 

num_words = [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]

accuracy_score_lst = []
precision_score_lst = []
recall_score_lst = []
f1_score_lst = []

for num in num_words:
    arr_feature = feature[:num]
    
    train_X = words_in_texts(arr_feature, X_train)
    train_Y = np.array(y_train)
    
    test_x = words_in_texts(arr_feature, X_test)
    test_y = np.array(y_test)
    
    model_eda = LogisticRegression(max_iter=10000)
    model_eda.fit(train_X, train_Y)

    pred = model_eda.predict(test_x)
    new_training_accuracy = model_eda.score(train_X, train_Y)

    # print("Accuracy: ", new_training_accuracy)
    print(num, 'words')
    print('Accuracy score:' , accuracy_score(pred, y_test))
    print('Precision score:', precision_score(pred, y_test))
    print ('Recall score:', recall_score(pred, y_test))
    print ('F1 score:', f1_score(pred, y_test))
    
    accuracy_score_lst.append(accuracy_score(pred, y_test))
    precision_score_lst.append(precision_score(pred, y_test))
    recall_score_lst.append(recall_score(pred, y_test))
    f1_score_lst.append(f1_score(pred, y_test))


50 words
Accuracy score: 0.9496644295302014
Precision score: 0.017937219730941704
Recall score: 0.4
F1 score: 0.034334763948497854
100 words
Accuracy score: 0.9579418344519016
Precision score: 0.242152466367713
Recall score: 0.7397260273972602
F1 score: 0.36486486486486486
200 words
Accuracy score: 0.9635346756152126
Precision score: 0.3901345291479821
Recall score: 0.7631578947368421
F1 score: 0.5163204747774481
300 words
Accuracy score: 0.9691275167785235
Precision score: 0.5112107623318386
Recall score: 0.7972027972027972
F1 score: 0.6229508196721312
400 words
Accuracy score: 0.9713646532438479
Precision score: 0.547085201793722
Recall score: 0.8187919463087249
F1 score: 0.6559139784946236
500 words
Accuracy score: 0.9727069351230425
Precision score: 0.6457399103139013
Recall score: 0.7700534759358288
F1 score: 0.7024390243902439
600 words
Accuracy score: 0.9751677852348993
Precision score: 0.6771300448430493
Recall score: 0.7947368421052632
F1 score: 0.7312348668280871
700 words
Ac

In [29]:
top_words_result = pd.DataFrame({'Accuracy':accuracy_score_lst,'Precision':precision_score_lst,
                                 'Recall':recall_score_lst,'F1':f1_score_lst,}, index=num_words)
print(top_words_result.round(3))

      Accuracy  Precision  Recall     F1
50       0.950      0.018   0.400  0.034
100      0.958      0.242   0.740  0.365
200      0.964      0.390   0.763  0.516
300      0.969      0.511   0.797  0.623
400      0.971      0.547   0.819  0.656
500      0.973      0.646   0.770  0.702
600      0.975      0.677   0.795  0.731
700      0.975      0.668   0.797  0.727
800      0.976      0.677   0.807  0.737
900      0.979      0.713   0.837  0.770
1000     0.978      0.713   0.824  0.764
1100     0.979      0.726   0.827  0.773
1200     0.978      0.717   0.825  0.767


# Bag of Words using CountVectorizer (all words in Training set)

In [271]:
df['fraudulent']=df['fraudulent'].replace('f', 0)
df['fraudulent']=df['fraudulent'].replace('t', 1)
df['fraudulent'].value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

In [273]:
# Split into Train and Testing Dataset
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['fraudulent'], test_size = 0.25, random_state = 42) 

In [274]:
# Function that would train and predict testing data.
# Takes in x, y for train and test set, and a tuple for ngram. 
def count_vect(x_train, x_test, Y_train, Y_test, ngram):
    count_vector = CountVectorizer(ngram_range= ngram, lowercase = True , stop_words =  'english')

    X_train = count_vector.fit_transform(x_train) 
    X_test = count_vector.transform(x_test)
    
    logitic_regression = LogisticRegression(random_state=42, max_iter=10000)
    logitic_regression.fit(X_train , Y_train)

    predictions_logistic_regression = logitic_regression.predict(X_test)
    
    print('Accuracy score:' , accuracy_score(predictions_logistic_regression, y_test))
    print('Precision score:', precision_score(predictions_logistic_regression, y_test))
    print ('Recall score:', recall_score(predictions_logistic_regression, y_test))
    print ('F1 score:', f1_score(predictions_logistic_regression, y_test))
    
    unique_elements, counts_elements = np.unique(predictions_logistic_regression, return_counts=True)
    print("Frequency of unique values of the said array:")
    print(np.asarray((unique_elements, counts_elements)))

In [275]:
# n-gram = 1,1
count_vect(X_train, X_test, y_train, y_test, (1,1))

Accuracy score: 0.9861297539149888
Precision score: 0.7623318385650224
Recall score: 0.9497206703910615
F1 score: 0.845771144278607
Frequency of unique values of the said array:
[[   0    1]
 [4291  179]]


In [276]:
# n-gram = 1,2
count_vect(X_train, X_test, y_train, y_test, (1,2))

Accuracy score: 0.9870246085011186
Precision score: 0.7488789237668162
Recall score: 0.9881656804733728
F1 score: 0.8520408163265306
Frequency of unique values of the said array:
[[   0    1]
 [4301  169]]


In [277]:
# n-gram = 2,2
count_vect(X_train, X_test, y_train, y_test, (2,2))

Accuracy score: 0.9841163310961969
Precision score: 0.6816143497757847
Recall score: 1.0
F1 score: 0.8106666666666666
Frequency of unique values of the said array:
[[   0    1]
 [4318  152]]


# Feature Engineering Approach

In [267]:
master_df = pd.read_csv('emscad_v1.csv')

In [268]:
#knn, forest, trees
#company_profile	description	requirements	benefits	

In [224]:
# Takes in the original dataset dataframe and conducts feature engineering on the text
# evaluates the number of characters, words, and occurences of specific symbols. 

def method_one_text_features(df2, train_or_test):
    df_new = df2.copy()
    df_new['company_profile'] = df_new['company_profile'].str.lower()
    df_new['company_profile_count_chr'] = df_new['company_profile'].str.len()
    df_new['company_profile_count_word'] = df_new['company_profile'].str.split(' ').str.len()
    
    df_new['description'] = df_new['description'].str.lower()
    df_new['description_profile_count_chr'] = df_new['description'].str.len()
    df_new['description_profile_count_word'] = df_new['company_profile'].str.split(' ').str.len()
    
    df_new['requirements'] = df_new['requirements'].str.lower()
    df_new['requirements_count_chr'] = df_new['requirements'].str.len()
    df_new['requirements_profile_count_word'] = df_new['company_profile'].str.split(' ').str.len()
    
    df_new['benefits'] = df_new['benefits'].str.lower()
    df_new['benefits_count_chr'] = df_new['benefits'].str.len()
    df_new['benefits_profile_count_word'] = df_new['company_profile'].str.split(' ').str.len()

    df_new['excl_count'] = df_new['company_profile'].str.count('!') + df_new['description'].str.count('!') + df_new['requirements'].str.count('!') + df_new['benefits'].str.count('!')
    df_new['q_count'] =  df_new['company_profile'].str.count('\?') + df_new['description'].str.count('\?') + df_new['requirements'].str.count('\?') + df_new['benefits'].str.count('\?') 
    df_new['hash_count'] = df_new['company_profile'].str.count('#') + df_new['description'].str.count('#') + df_new['requirements'].str.count('#') + df_new['benefits'].str.count('#')
    df_new['dollar_count'] = df_new['company_profile'].str.count('$') + df_new['description'].str.count('$') + df_new['requirements'].str.count('$') + df_new['benefits'].str.count('$')
    df_new['newline_count'] = df_new['company_profile'].str.count('\n') + df_new['description'].str.count('\n') + df_new['requirements'].str.count('\n') + df_new['benefits'].str.count('\n')
    df_new['bracket_count'] =  df_new['company_profile'].str.count(r'\<.*\>') + df_new['description'].str.count(r'\<.*\>') + df_new['requirements'].str.count(r'\<.*\>') + df_new['benefits'].str.count(r'\<.*\>')

    df_new = df_new.fillna(0)

    # dummy_df = pd.get_dummies(data=df_new, columns=['telecommuting','has_company_logo', 'has_questions','employment_type','required_experience','required_education','industry','function'])
    # df_new = pd.concat([df_new.drop(columns=['telecommuting','has_company_logo', 'has_questions','employment_type','required_experience', 'required_education','industry','function']), dummy_df], axis=1)
    # print(list(df_new.columns))
    
    if train_or_test == 'train':
        df_new = df_new.drop(['telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function','title', 'location', 'department', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits', 'in_balanced_dataset',
                             'company_profile_count_chr', 'description_profile_count_chr', 'requirements_count_chr', 'benefits_count_chr'], axis=1)
    else: 
        df_new = df_new.drop(['telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function','title', 'location', 'department', 'salary_range', 'company_profile', 'description', 'requirements', 'benefits', 'in_balanced_dataset',
                             'company_profile_count_chr', 'description_profile_count_chr', 'requirements_count_chr', 'benefits_count_chr'], axis=1)
    return df_new
    

In [225]:
master_df_no_y = master_df.drop('fraudulent' ,axis =1)
master_df['fraudulent']=master_df['fraudulent'].replace('f', 0)
master_df['fraudulent']=master_df['fraudulent'].replace('t', 1)

In [226]:
X_train, X_test, y_train, y_test = train_test_split(master_df_no_y, master_df['fraudulent'], test_size=0.25, random_state=42)

In [229]:
train_X_fet = method_one_text_features(X_train, 'train')
train_Y_fet = np.array(y_train)

test_x = method_one_text_features(X_test, 'test')
test_y = np.array(y_test)

model_eda = LogisticRegression(max_iter=10000)
model_eda.fit(train_X_fet, train_Y_fet)
new_training_accuracy = model_eda.score(train_X_fet, train_Y_fet)


print('Train Accruacy:', new_training_accuracy)
pred = model_eda.predict(test_x)

print('Test Stats')   
print('Accuracy score:' , accuracy_score(pred, test_y))
print('Precision score:', precision_score(pred, test_y))
print ('Recall score:', recall_score(pred, test_y))
print ('F1 score:', f1_score(pred, test_y))

Train Accruacy: 0.9515287099179717
Test Stats
Accuracy score: 0.9494407158836689
Precision score: 0.0
Recall score: 0.0
F1 score: 0.0


# Addtional Extra Code

In [None]:
# extra
arr_feature = ['work', 'experience', 'time', 'skills', 'amp', 'us', 'full', 'company', 'team', 'service', 'management', 
               'business', 'customer', 'ability', 'services', 'position', 'engineering', 'level', 'high', 'data', 'project',
               'entry', 'industry', 'required', 'environment', 'new', 'must', 'solutions', 'years', 'job', 'support', 'development', 
               'products', 'knowledge', 'working', 'systems', 'looking', 'information', 'provide', 'office', 'within', 'benefits',
               'candidates', 'people', 'product', 'requirements', 'sales', 'including', 'equipment', 'process', 'oil', 'communication', 
               'strong', 'technology', 'design', 'degree', 'customers', 'able', 'per', 'home', 'manager', 'training', 'quality', 
               'technical', 'false', 'good', 'professional', '1', 'opportunity', 'computer', 'school', '000', 'apply', 'develop',
               'well', 'responsibilities', 'administrative', 'ensure', 'excellent', 'part', 'help', '2', 'system', 'field', 'employees',
               'duties', 'perform', 'equivalent', 'get', 'please', 'client', 'world', 'responsible', 'gas', 'needed', 'test', 'operations', 
               'maintain', 'software', 'projects', 'production', 'preferred', 'ca', 'maintenance', 'related', 'positions', 'clients', 
               'offer', 'global', 'aker', 'contract', 'food', 'program', 'based', '3', 'start', 'bonus', 'paid']

train_X = words_in_texts(arr_feature, df['text'])
train_Y = np.array(df['fraudulent'])
model_eda = LogisticRegression()
model_eda.fit(train_X, train_Y)

pred = model_eda.predict(train_X)
new_training_accuracy = model_eda.score(train_X, train_Y)

# print("Accuracy: ", new_training_accuracy)
print('Accuracy score:' , accuracy_score(pred, train_Y))
print('Precision score:', precision_score(pred, train_Y))
print ('Recall score:', recall_score(pred, train_Y))
print ('F1 score:', f1_score(pred, train_Y))

In [None]:
num_words = [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]

accuracy_score_lst = []
precision_score_lst = []
recall_score_lst = []
f1_score_lst = []

for num in num_words:
    arr_feature = feature[:num]
    
    train_X = words_in_texts(arr_feature, X_train)
    train_Y = np.array(y_train)
    
    test_x = words_in_texts(arr_feature, X_test)
    test_y = np.array(y_test)
    
    model_eda = LogisticRegression(max_iter=10000)
    model_eda.fit(train_X, train_Y)

    pred = model_eda.predict(test_x)
    new_training_accuracy = model_eda.score(train_X, train_Y)

    # print("Accuracy: ", new_training_accuracy)
    print(num, 'words')
    print('Accuracy score:' , accuracy_score(pred, y_test))
    print('Precision score:', precision_score(pred, y_test))
    print ('Recall score:', recall_score(pred, y_test))
    print ('F1 score:', f1_score(pred, y_test))
    
    accuracy_score_lst.append(accuracy_score(pred, y_test))
    precision_score_lst.append(precision_score(pred, y_test))
    recall_score_lst.append(recall_score(pred, y_test))
    f1_score_lst.append(f1_score(pred, y_test))


In [None]:
# Select top n-number of words to test out.
num_words = [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]

accuracy_score_lst = []
precision_score_lst = []
recall_score_lst = []
f1_score_lst = []


# loops over the number of words choices and train and test the model.
for num in num_words:
    arr_feature = feature[:num]
    
    train_X = words_in_texts(arr_feature, X_train)
    train_Y = np.array(y_train)
    
    model_eda = LogisticRegression(max_iter=10000)
    model_eda.fit(train_X, train_Y)

    pred = model_eda.predict(X_test)
    new_training_accuracy = model_eda.score(train_X, train_Y)

    # print("Accuracy: ", new_training_accuracy)
    print(num, 'words')
    print('Accuracy score:' , accuracy_score(pred, train_Y))
    print('Precision score:', precision_score(pred, train_Y))
    print ('Recall score:', recall_score(pred, train_Y))
    print ('F1 score:', f1_score(pred, train_Y))
    
    accuracy_score_lst.append(accuracy_score(pred, train_Y))
    precision_score_lst.append(precision_score(pred, train_Y))
    recall_score_lst.append(recall_score(pred, train_Y))
    f1_score_lst.append(f1_score(pred, train_Y))