# Log model

In [19]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib
import sklearn
import seaborn as sns

%matplotlib inline  
import matplotlib.pyplot as plt  
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, r2_score ,confusion_matrix

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier ,ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier ,ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


In [21]:
df = pd.read_csv('./../../Data/text.csv')
df

Unnamed: 0,text,fraudulent
0,food52 created groundbreaking award winning co...,f
1,90 seconds worlds cloud video production servi...,f
2,valor services provides workforce solutions me...,f
3,passion improving quality life geography heart...,f
4,spotsource solutions llc global human capital ...,f
...,...,...
17875,vend looking awesome new talent come join us w...,f
17876,weblinc e commerce platform services provider ...,f
17877,provide full time permanent positions many med...,f
17878,nemsia studios looking experienced visual grap...,f


# Bag of Words using CountVectorizer

In [22]:
df['fraudulent']=df['fraudulent'].replace('f', 0)
df['fraudulent']=df['fraudulent'].replace('t', 1)
df['fraudulent'].value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

In [23]:
df1 = df[df['fraudulent']==0]
df1 = df1.sample(1000)
print(len(df1))
df2 = df[df['fraudulent']==1]

print(len(df2))

df_balanced = pd.concat([df1,df2],axis=0)

df_balanced = df_balanced.sample(frac=1) #shuffle all rows
df_balanced.value_counts()

1000
866


text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [17]:
X_train, X_test, y_train, y_test = train_test_split(df_balanced['text'], df_balanced['fraudulent'], test_size=0.25, random_state=42)


In [24]:
count_vector = CountVectorizer(ngram_range=(1, 1), lowercase = True , stop_words =  'english')

X_train = count_vector.fit_transform(X_train) 
X_test = count_vector.transform(X_test)

# Traditional Model

## Baseline

In [9]:

# Instantiate SMOTE
sm = SMOTE(random_state=42)

# Apply SMOTE to training data only
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
X_train_res[1]

<1x15626 sparse matrix of type '<class 'numpy.int64'>'
	with 171 stored elements in Compressed Sparse Row format>

In [196]:
list_of_models = [LogisticRegression() , KNeighborsClassifier() , SVC(kernel='rbf'), 
                  DecisionTreeClassifier() ,RandomForestClassifier(), MultinomialNB()]
classification_report = pd.DataFrame(columns=['Accuracy','F1_score','Precision','Recall'])

for model in list_of_models :
    model = model.fit(X_train , y_train)
    for i in range(2) :
        if i == 0 :
            to_pred = X_train
            pred = y_train
            title = 'Train'
            
        else :
            to_pred = X_test
            pred = y_test
            title = 'Test'
        y_pred = model.predict(to_pred)
        acc = round(accuracy_score(pred , y_pred)*100, 3)
        f1 = round(f1_score(pred , y_pred)*100, 3)
        prec = round(precision_score(pred , y_pred)*100, 3)
        recall = round(recall_score(pred , y_pred)*100, 3)
        d = pd.DataFrame(data=np.array([acc,f1,prec,recall]).reshape(1,4) 
                     , columns=['Accuracy' , 'F1_score' , 'Precision' , 'Recall'])  
        classification_report = pd.concat([classification_report , d])
        classification_report.rename( index= { 0 :'{} _ {} Details'.format(model , title) } ,inplace=True )
pd.options.display.max_rows = 15
classification_report

Unnamed: 0,Accuracy,F1_score,Precision,Recall
LogisticRegression() _ Train Details,100.0,100.0,100.0,100.0
LogisticRegression() _ Test Details,91.863,91.593,90.393,92.825
KNeighborsClassifier() _ Train Details,74.196,77.592,64.566,97.201
KNeighborsClassifier() _ Test Details,71.306,76.325,62.974,96.861
SVC() _ Train Details,97.212,97.002,95.897,98.134
SVC() _ Test Details,90.15,89.64,90.045,89.238
DecisionTreeClassifier() _ Train Details,100.0,100.0,100.0,100.0
DecisionTreeClassifier() _ Test Details,85.439,85.153,82.979,87.444
RandomForestClassifier() _ Train Details,100.0,100.0,100.0,100.0
RandomForestClassifier() _ Test Details,91.863,91.284,93.427,89.238


## Logistic Regression

### Best Estimator

In [25]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score
logitic_regression = LogisticRegression()
     
skf = StratifiedKFold(n_splits=10)
param_grid = {
    'penalty' : ['l1', 'l2'],
    'C': [0.001, 0.01],
    'solver' : ['liblinear']}
grid_search_lr = GridSearchCV(logitic_regression,param_grid = param_grid , cv=skf,return_train_score=True)
grid_search_lr.fit(X_train, y_train)

model_lr = grid_search_lr.best_estimator_
model_lr.fit(X_train,y_train)
y_hat_lr = model_lr.predict(X_train)

print('Best Parameters: ', grid_search_lr.best_params_)
print('Best Train Accuracy : ', round(accuracy_score(y_hat_lr,y_train), 3))
print('Best Cross Validation Accuracy : ', round(grid_search_lr.best_score_, 3))

Best Parameters:  {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
Best Train Accuracy :  0.974
Best Cross Validation Accuracy :  0.909


### Prediction Score

In [36]:
# Fit best estimator
clf_lr = grid_search_lr.best_estimator_ 
clf_lr.fit(X_train, y_train)

# Predict training and testing set
y_pred_lr_train = clf_lr.predict(X_train)
y_pred_lr_test = clf_lr.predict(X_test)

# Prediction score
print('Hold-out Train Set Accuracy Score: ', round(accuracy_score(y_train, y_pred_lr_train), 3))
print('Hold-out Test Set Accuracy Score: ', round(accuracy_score(y_test, y_pred_lr_test), 3))
print('Hold-out Train Set Precision Score: ', round(precision_score(y_train, y_pred_lr_train), 3))
print('Hold-out Test Set Precision Score: ', round(precision_score(y_test, y_pred_lr_test), 3))
print('Hold-out Train Set Recall Score: ', round(recall_score(y_train, y_pred_lr_train), 3))
print('Hold-out Test Set Recall Score: ', round(recall_score(y_test, y_pred_lr_test), 3))

Hold-out Train Set Accuracy Score:  0.981
Hold-out Test Set Accuracy Score:  0.921
Hold-out Train Set Precision Score:  0.981
Hold-out Test Set Precision Score:  0.916
Hold-out Train Set Recall Score:  0.977
Hold-out Test Set Recall Score:  0.912


In [37]:
y_scores_lr = clf_lr.predict_proba(X_test)[:, 1]

In [38]:
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score

fpr_lr, tpr_lr, auc_thresholds_lr = roc_curve(y_test, y_scores_lr)
optimal_idx_lr = np.argmax(tpr_lr - fpr_lr)
optimal_idx_lr

30

In [41]:
optimal_threshold_lr = auc_thresholds_lr[30]
optimal_threshold_lr

0.4753123530282561

In [39]:
print(auc_thresholds_lr[30], 'is our optimized auc threshold \n', 
      tpr_lr[30], 'is the recall score (TP) at there \n',
      fpr_lr[30], 'is the false positive rate at there') 

0.4753123530282561 is our optimized auc threshold 
 0.9398148148148148 is the recall score (TP) at there 
 0.07569721115537849 is the false positive rate at there


In [43]:
y_scores_train_lr = clf_lr.predict_proba(X_train)[:, 1]
y_scores_test_lr = clf_lr.predict_proba(X_test)[:, 1]
new_pred_train_lr = (y_scores_train_lr   >= optimal_threshold_lr).astype(int)
new_pred_test_lr = (y_scores_test_lr  >= optimal_threshold_lr).astype(int)
# Prediction score
print('Hold-out Train Set Accuracy Score: ', round(accuracy_score(y_train,new_pred_train_lr), 3))
print('Hold-out Test Set Accuracy Score: ', round(accuracy_score(y_test, new_pred_test_lr), 3))
print('Hold-out Train Set Precision Score: ', round(precision_score(y_train,new_pred_train_lr), 3))
print('Hold-out Test Set Precision Score: ', round(precision_score(y_test, new_pred_test_lr), 3))
print('Hold-out Train Set Recall Score: ', round(recall_score(y_train, new_pred_train_lr), 3))
print('Hold-out Test Set Recall Score: ', round(recall_score(y_test, new_pred_test_lr), 3))
print('Hold-out Train Set F1 Score: ', round(f1_score(y_train, new_pred_train_lr), 3))
print('Hold-out Test Set F1 Score: ', round(f1_score(y_test, new_pred_test_lr), 3))

Hold-out Train Set Accuracy Score:  0.981
Hold-out Test Set Accuracy Score:  0.931
Hold-out Train Set Precision Score:  0.976
Hold-out Test Set Precision Score:  0.914
Hold-out Train Set Recall Score:  0.983
Hold-out Test Set Recall Score:  0.94
Hold-out Train Set F1 Score:  0.979
Hold-out Test Set F1 Score:  0.927


## Random Forest

### Best Estimator

In [27]:
# GridSearchCV for Random Forest

grid_rf = {'n_estimators': [10, 30, 50],
    'max_depth': [10, 20, 30],
    'min_samples_split': [5, 10, 15]}

from sklearn.ensemble import RandomForestClassifier 
clf_rf = RandomForestClassifier(random_state=42) 
grid_search_rf = GridSearchCV(clf_rf, grid_rf, cv = 5)
grid_search_rf.fit(X_train, y_train)
model_rf = grid_search_rf.best_estimator_
model_rf.fit(X_train,y_train)
y_hat_rf = model_rf.predict(X_train)

print('Best Parameters: ', grid_search_rf.best_params_)
print('Best Train Accuracy : ', round(accuracy_score(y_hat_rf,y_train), 3))
print('Best Cross Validation Accuracy : ', round(grid_search_rf.best_score_, 3))


KeyboardInterrupt: 

### Prediction Score

In [31]:
# # Fit best estimator
clf_rfb = grid_search_rf.best_estimator_ 
clf_rfb.fit(X_train, y_train)
# Predict training and testing set
y_pred_rfb_train = clf_rfb.predict(X_train)
y_pred_rfb_test = clf_rfb.predict(X_test)

# Prediction score
print('Hold-out Train Set Accuracy Score: ', round(accuracy_score(y_train, y_pred_rfb_train), 3))
print('Hold-out Test Set Accuracy Score: ', round(accuracy_score(y_test, y_pred_rfb_test), 3))
print('Hold-out Train Set Precision Score: ', round(precision_score(y_train, y_pred_rfb_train), 3))
print('Hold-out Test Set Precision Score: ', round(precision_score(y_test, y_pred_rfb_test), 3))
print('Hold-out Train Set Recall Score: ', round(recall_score(y_train, y_pred_rfb_train), 3))
print('Hold-out Test Set Recall Score: ', round(recall_score(y_test, y_pred_rfb_test), 3))

Hold-out Train Set Accuracy Score:  0.988
Hold-out Test Set Accuracy Score:  0.916
Hold-out Train Set Precision Score:  0.975
Hold-out Test Set Precision Score:  0.897
Hold-out Train Set Recall Score:  1.0
Hold-out Test Set Recall Score:  0.91


In [69]:
y_scores = clf_rfb.predict_proba(X_test)[:, 1]

In [70]:
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score

fpr, tpr, auc_thresholds = roc_curve(y_test, y_scores)
optimal_idx = np.argmax(tpr - fpr)
optimal_idx 

34

In [71]:
optimal_threshold = auc_thresholds[34]
optimal_threshold

0.4929287677032131

In [72]:
print(auc_thresholds[34], 'is our optimized auc threshold \n', 
      tpr[34], 'is the recall score (TP) at there \n',
      fpr[34], 'is the false positive rate at there') 
     

0.4929287677032131 is our optimized auc threshold 
 0.9212962962962963 is the recall score (TP) at there 
 0.035856573705179286 is the false positive rate at there


In [75]:
y_scores_train = clf_rfb.predict_proba(X_train)[:, 1]
y_scores_test = clf_rfb.predict_proba(X_test)[:, 1]
new_pred_train = (y_scores_train   >= optimal_threshold).astype(int)
new_pred_test = (y_scores_test  >= optimal_threshold).astype(int)
# Prediction score
print('Hold-out Train Set Accuracy Score: ', round(accuracy_score(y_train,new_pred_train), 3))
print('Hold-out Test Set Accuracy Score: ', round(accuracy_score(y_test, new_pred_test), 3))
print('Hold-out Train Set Precision Score: ', round(precision_score(y_train,new_pred_train), 3))
print('Hold-out Test Set Precision Score: ', round(precision_score(y_test, new_pred_test), 3))
print('Hold-out Train Set Recall Score: ', round(recall_score(y_train, new_pred_train), 3))
print('Hold-out Test Set Recall Score: ', round(recall_score(y_test, new_pred_test), 3))
print('Hold-out Train Set f1 Score: ', round(f1_score(y_train, new_pred_train), 3))
print('Hold-out Test Set f1 Score: ', round(f1_score(y_test, new_pred_test), 3))

Hold-out Train Set Accuracy Score:  0.987
Hold-out Test Set Accuracy Score:  0.944
Hold-out Train Set Precision Score:  0.976
Hold-out Test Set Precision Score:  0.957
Hold-out Train Set Recall Score:  0.997
Hold-out Test Set Recall Score:  0.921
Hold-out Train Set f1 Score:  0.986
Hold-out Test Set f1 Score:  0.939


In [10]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
# set the model
nb_model = MultinomialNB()

# fit the dataset
nb_model.fit(X_train, y_train)
nb_predict = nb_model.predict(X_test)

NameError: name 'X_train' is not defined

In [33]:
#Accuracy score for tfidf features
print("Accuracy  {:.3} %".format(accuracy_score(y_test, nb_predict)*100))
print("Precision  {:.3} %".format(precision_score(y_test, nb_predict)*100))
print("Recall  {:.3} %".format(recall_score(y_test, nb_predict)*100))
print("f1_score  {:.3} %".format(f1_score(y_test, nb_predict)*100))

Accuracy  96.6 %
Precision  63.7 %
Recall  76.2 %
f1_score  69.4 %


In [36]:
from sklearn.svm import SVC
# fit data using SVC
svc = SVC(kernel='rbf')
svc.fit(X_train, y_train)

In [37]:
# predict output of the test data set
predicted = svc.predict(X_test)

In [38]:
#Accuracy score for tfidf features
print("Accuracy  {:.3} %".format(accuracy_score(y_test, predicted)*100))
print("Precision  {:.3} %".format(precision_score(y_test, predicted)*100))
print("Recall  {:.3} %".format(recall_score(y_test, predicted)*100))
print("f1_score  {:.3} %".format(f1_score(y_test, predicted)*100))

Accuracy  96.0 %
Precision  1e+02 %
Recall  19.3 %
f1_score  32.3 %


In [51]:
def words_in_texts(words, texts):
    indicator_array = 1 * np.array([texts.str.contains(word) for word in words]).T
    return indicator_array

In [86]:
eda_ham = df.loc[df['fraudulent']==0]
eda_spam = df.loc[df['fraudulent']==1]

num_ham, num_spam = {}, {}

ham_split = eda_ham['text'].str.replace(r'/<[^>]*>/g', ' ').str.split()
spam_split = eda_spam['text'].str.replace(r'/<[^>]*>/g', ' ').str.split()

#put word frequencies in dictionaries
for i in ham_split:
    for j in i:
        if num_ham.get(j) is None:
            num_ham[j] = 1
        num_ham[j] = num_ham[j] + 1
for i in spam_split:
    for j in i:
        if num_spam.get(j) is None:
            num_spam[j] = 1
        num_spam[j] = num_spam[j] + 1
        
#sorted_ham = sorted(num_ham, key = num_ham.get, reverse = True)
sorted_spam = sorted(num_spam, key = num_spam.get, reverse = True)
len(sorted_spam) #95402 pairs in the dictionary

feature = []
for i in np.arange(1200):
    feature.append(sorted_spam[i])

  ham_split = eda_ham['text'].str.replace(r'/<[^>]*>/g', ' ').str.split()
  spam_split = eda_spam['text'].str.replace(r'/<[^>]*>/g', ' ').str.split()


In [87]:
len(feature)

1200

In [88]:
arr_feature = ['work', 'experience', 'time', 'skills', 'amp', 'us', 'full', 'company', 'team', 'service', 'management', 
               'business', 'customer', 'ability', 'services', 'position', 'engineering', 'level', 'high', 'data', 'project',
               'entry', 'industry', 'required', 'environment', 'new', 'must', 'solutions', 'years', 'job', 'support', 'development', 
               'products', 'knowledge', 'working', 'systems', 'looking', 'information', 'provide', 'office', 'within', 'benefits',
               'candidates', 'people', 'product', 'requirements', 'sales', 'including', 'equipment', 'process', 'oil', 'communication', 
               'strong', 'technology', 'design', 'degree', 'customers', 'able', 'per', 'home', 'manager', 'training', 'quality', 
               'technical', 'false', 'good', 'professional', '1', 'opportunity', 'computer', 'school', '000', 'apply', 'develop',
               'well', 'responsibilities', 'administrative', 'ensure', 'excellent', 'part', 'help', '2', 'system', 'field', 'employees',
               'duties', 'perform', 'equivalent', 'get', 'please', 'client', 'world', 'responsible', 'gas', 'needed', 'test', 'operations', 
               'maintain', 'software', 'projects', 'production', 'preferred', 'ca', 'maintenance', 'related', 'positions', 'clients', 
               'offer', 'global', 'aker', 'contract', 'food', 'program', 'based', '3', 'start', 'bonus', 'paid']

train_X = words_in_texts(arr_feature, df['text'])
train_Y = np.array(df['fraudulent'])
model_eda = LogisticRegression()
model_eda.fit(train_X, train_Y)

pred = model_eda.predict(train_X)
new_training_accuracy = model_eda.score(train_X, train_Y)

# print("Accuracy: ", new_training_accuracy)
print('Accuracy score:' , accuracy_score(pred, train_Y))
print('Precision score:', precision_score(pred, train_Y))
print ('Recall score:', recall_score(pred, train_Y))
print ('F1 score:', f1_score(pred, train_Y))

Accuracy score: 0.9603467561521253
Precision score: 0.2690531177829099
Recall score: 0.7540453074433657
F1 score: 0.39659574468085107


In [89]:
num_words = [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]

accuracy_score_lst = []
precision_score_lst = []
recall_score_lst = []
f1_score_lst = []

for num in num_words:
    arr_feature = feature[:num]
    
    train_X = words_in_texts(arr_feature, train_X)
    train_Y = np.array(df['fraudulent'])
    
    model_eda = LogisticRegression(max_iter=10000)
    model_eda.fit(train_X, train_Y)

    pred = model_eda.predict(train_X)
    new_training_accuracy = model_eda.score(train_X, train_Y)

    # print("Accuracy: ", new_training_accuracy)
    print(num, 'words')
    print('Accuracy score:' , accuracy_score(pred, train_Y))
    print('Precision score:', precision_score(pred, train_Y))
    print ('Recall score:', recall_score(pred, train_Y))
    print ('F1 score:', f1_score(pred, train_Y))
    
    accuracy_score_lst.append(accuracy_score(pred, train_Y))
    precision_score_lst.append(precision_score(pred, train_Y))
    recall_score_lst.append(recall_score(pred, train_Y))
    f1_score_lst.append(f1_score(pred, train_Y))

50 words
Accuracy score: 0.9513982102908277
Precision score: 0.018475750577367205
Recall score: 0.45714285714285713
F1 score: 0.03551609322974473
100 words
Accuracy score: 0.9591163310961969
Precision score: 0.23672055427251731
Recall score: 0.7454545454545455
F1 score: 0.35933391761612615
200 words
Accuracy score: 0.9665548098434005
Precision score: 0.41454965357967666
Recall score: 0.7977777777777778
F1 score: 0.5455927051671733
300 words
Accuracy score: 0.9741610738255033
Precision score: 0.5623556581986143
Recall score: 0.8543859649122807
F1 score: 0.6782729805013928
400 words
Accuracy score: 0.9787472035794184
Precision score: 0.6593533487297921
Recall score: 0.8704268292682927
F1 score: 0.7503285151116951
500 words
Accuracy score: 0.9824384787472036
Precision score: 0.7193995381062356
Recall score: 0.8976945244956772
F1 score: 0.7987179487179488
600 words
Accuracy score: 0.9858501118568233
Precision score: 0.7609699769053118
Recall score: 0.9347517730496454
F1 score: 0.8389560789

In [90]:
pd.DataFrame({'Accuracy':accuracy_score_lst,
              'Precision':precision_score_lst,
              'Recall':recall_score_lst,
              'F1':f1_score_lst,
             }, index=num_words)

Unnamed: 0,Accuracy,Precision,Recall,F1
50,0.951398,0.018476,0.457143,0.035516
100,0.959116,0.236721,0.745455,0.359334
200,0.966555,0.41455,0.797778,0.545593
300,0.974161,0.562356,0.854386,0.678273
400,0.978747,0.659353,0.870427,0.750329
500,0.982438,0.7194,0.897695,0.798718
600,0.98585,0.76097,0.934752,0.838956
700,0.987752,0.790993,0.947441,0.862177
800,0.989653,0.819861,0.960758,0.884735
900,0.991611,0.849885,0.973545,0.907522


In [95]:
num_words = [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]

accuracy_score_lst = []
precision_score_lst = []
recall_score_lst = []
f1_score_lst = []

for num in num_words:
    arr_feature = feature[:num]
    
    train_X = words_in_texts(arr_feature, X_train)
    train_Y = np.array(y_train)
    
    test_x = words_in_texts(arr_feature, X_test)
    test_y = np.array(y_test)
    
    model_eda = LogisticRegression(max_iter=10000)
    model_eda.fit(train_X, train_Y)

    pred = model_eda.predict(test_x)
    new_training_accuracy = model_eda.score(train_X, train_Y)

    # print("Accuracy: ", new_training_accuracy)
    print(num, 'words')
    print('Accuracy score:' , accuracy_score(pred, y_test))
    print('Precision score:', precision_score(pred, y_test))
    print ('Recall score:', recall_score(pred, y_test))
    print ('F1 score:', f1_score(pred, y_test))
    
    accuracy_score_lst.append(accuracy_score(pred, y_test))
    precision_score_lst.append(precision_score(pred, y_test))
    recall_score_lst.append(recall_score(pred, y_test))
    f1_score_lst.append(f1_score(pred, y_test))


50 words
Accuracy score: 0.9496644295302014
Precision score: 0.017937219730941704
Recall score: 0.4
F1 score: 0.034334763948497854
100 words
Accuracy score: 0.9579418344519016
Precision score: 0.242152466367713
Recall score: 0.7397260273972602
F1 score: 0.36486486486486486
200 words
Accuracy score: 0.9635346756152126
Precision score: 0.3901345291479821
Recall score: 0.7631578947368421
F1 score: 0.5163204747774481
300 words
Accuracy score: 0.9691275167785235
Precision score: 0.5112107623318386
Recall score: 0.7972027972027972
F1 score: 0.6229508196721312
400 words
Accuracy score: 0.9713646532438479
Precision score: 0.547085201793722
Recall score: 0.8187919463087249
F1 score: 0.6559139784946236
500 words
Accuracy score: 0.9727069351230425
Precision score: 0.6457399103139013
Recall score: 0.7700534759358288
F1 score: 0.7024390243902439
600 words
Accuracy score: 0.9751677852348993
Precision score: 0.6771300448430493
Recall score: 0.7947368421052632
F1 score: 0.7312348668280871
700 words
Ac

In [96]:
pd.DataFrame({'Accuracy':accuracy_score_lst,
              'Precision':precision_score_lst,
              'Recall':recall_score_lst,
              'F1':f1_score_lst,
             }, index=num_words)

Unnamed: 0,Accuracy,Precision,Recall,F1
50,0.949664,0.017937,0.4,0.034335
100,0.957942,0.242152,0.739726,0.364865
200,0.963535,0.390135,0.763158,0.51632
300,0.969128,0.511211,0.797203,0.622951
400,0.971365,0.547085,0.818792,0.655914
500,0.972707,0.64574,0.770053,0.702439
600,0.975168,0.67713,0.794737,0.731235
700,0.974944,0.668161,0.796791,0.726829
800,0.975839,0.67713,0.807487,0.736585
900,0.978747,0.713004,0.836842,0.769976


In [99]:
df.head(22)

Unnamed: 0,text,fraudulent
0,food52 created groundbreaking award winning co...,0
1,90 seconds worlds cloud video production servi...,0
2,valor services provides workforce solutions me...,0
3,passion improving quality life geography heart...,0
4,spotsource solutions llc global human capital ...,0
5,job overview apex environmental consulting fir...,0
6,founded 2009 fonpit ag rose international web ...,0
7,airenvy mission provide lucrative yet hassle f...,0
8,solutions3 woman owned small business whose fo...,0
9,novitex enterprise solutions formerly pitney b...,0
