# Trainer for fakes jobs detection

## Initialisation

In [1]:
# !pip install pandas scikit-learn nltk
# !pip install xgboost

In [3]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn import svm

nltk.download('stopwords')
lemmatizer = WordNetLemmatizer()
stopwords = stopwords.words('english')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alain\AppData\Roaming\nltk_data...


True

## Preprocessing

In [4]:
def cleandataset(df):
    df["description"] = df["description"].str.replace("&amp;", "&", regex=False)
    df["description"] = df["description"].str.replace("\xa0", " ", regex=False)
    df["description"] = df['description'].str.replace("!", "! ", regex=False)
    df["description"] = df['description'].str.replace("?", "? ", regex=False)
    df["description"] = df['description'].str.replace(":", " ", regex=False)
    df["description"] = df['description'].str.replace("...", " ", regex=False)
    df["description"] = df['description'].str.replace("  +", " ", regex=True)
    df["description"] = df['description'].str.replace("([a-z]{2,})([A-Z])", "\g<1> \g<2>", regex=True)
    df["description"] = df['description'].str.replace("([a-z\.]{2,})([A-Z])", "\g<1> \g<2>", regex=True)
    df["description"] = df['description'].str.lower()
    
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    return df

def preprocess(text):
    if type(text) is float:
        return ""
    text = text.replace("&amp;", "&")
    text = text.replace("!", "! ")
    text = text.replace("\xa0", " ")
    text = text.replace("?", "? ")
    text = text.replace(":", " ")
    text = text.replace("...", " ")
    text = text.replace("  +", " ")
    text = re.sub( r"([a-z\.]{2,})([A-Z])", r"\g<1> \g<2>", text)
    text = re.sub(r"([a-z]{2,})([A-Z])", r"\g<1> \g<2>", text)
    text = text.lower()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
 
    clean_description = ' '.join(tokens)     
    return clean_description

def clean_data(text):
    if type(text) is float:
        return ""
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
 
    clean_description = ' '.join(tokens)     
    return clean_description

## Trainer

### Function reusable

In [5]:
def trainer(df, model, df_fraud, df_true, frac: float):
    """
    :var df: pd.DataFrame with all datas
    :var model: model of ML to use
    :var df_fraud and df_true: df splitted with true and false jobs separated
    :var frac: % of df_true to use for training
    """
    
    # Split the df_true to re balance the dataset
    print(f"\nfrac = {frac}")
    df_true_sample = df_true.sample(frac=frac, random_state=0)

    df_reshape = pd.concat([df_fraud, df_true_sample])
    # df_reshape = df_reshape.sample(frac=1)
    
    # check class distribution
    print("re shape")
    print(df_reshape['fraudulent'].value_counts(normalize = True))
    
    vectorizer = TfidfVectorizer(
        max_features = 50000, 
        lowercase=False , 
        ngram_range=(1,3))
    
    X = df_reshape.description
    y = df_reshape.fraudulent

    train_X , test_X , train_y , test_y = train_test_split(X , y , test_size = 0.2 , random_state = 0)
    
    # Vectorizer
    vec = vectorizer.fit(train_X)
    
    vec_train = vec.transform(train_X)
    vec_train = vec_train.toarray()

    vec_test = vectorizer.transform(test_X).toarray()

    train_data = pd.DataFrame(vec_train , columns=vectorizer.get_feature_names())
    test_data = pd.DataFrame(vec_test , columns= vectorizer.get_feature_names())

    # Training of the model selected in input
    model.fit(train_data, train_y)
    predictions  = model.predict(test_data)

    print(classification_report(test_y , predictions))
    # confusion matrix
    print(pd.crosstab(test_y, predictions), end="\n\n")
    
    # return a dict with informations and the models (ML + vectorizer)
    return {frac: {"model": model, "vectorizer": vec}}

## Selected model

In [122]:
# Modèle sélectionné
# frac = 12
# vect avec 50000 features
# SVM non fine-tuned - the defaults values are goods.

mymodel = models[4][0.07]
print(mymodel)

text = [preprocess("IC&amp;E Technician | Bakersfield, CA Mt. PosoPrincipal Duties and Responsibilities: Calibrates, tests, maintains, troubleshoots, and installs all power plant instrumentation, control systems and electrical equipment.Performs maintenance on motor control centers, motor operated valves, generators, excitation equipment and motors.Performs preventive, predictive and corrective maintenance on equipment, coordinating work with various team members.Designs and installs new equipment and/or system modifications.Troubleshoots and performs maintenance on DC backup power equipment, process controls, programmable logic controls (PLC), and emission monitoring equipment.Uses maintenance reporting system to record time and material use, problem identified and corrected, and further action required; provides complete history of maintenance on equipment.Schedule, coordinate, work with and monitor contractors on specific tasks, as required.Follows safe working practices at all times.Identifies safety hazards and recommends solutions.Follows environmental compliance work practices.Identifies environmental non-compliance problems and assist in implementing solutions.Assists other team members and works with all departments to support generating station in achieving their performance goals.Trains other team members in the areas of instrumentation, control, and electrical systems.Performs housekeeping assignments, as directed.Conduct equipment and system tagging according to company and plant rules and regulations.Perform equipment safety inspections, as required, and record results as appropriate. Participate in small construction projects.  Read and interpret drawings, sketches, prints, and specifications, as required.Orders parts as needed to affect maintenance and repair.Performs Operations tasks on an as-needed basis and other tasks as assigned.Available within a reasonable response time for emergency call-ins and overtime, plus provide acceptable off-hour contact by phone and company pager.          Excellent Verbal and Written Communications Skills:Ability to coordinate work activities with other team members on technical subjects across job families.Ability to work weekends, holidays, and rotating shifts, as required.")]
vectorizer = mymodel["vectorizer"]
vec_test = vectorizer.transform(text).toarray()
test_data = pd.DataFrame(vec_test, columns= vectorizer.get_feature_names())

print(mymodel["model"].predict(vec_test))
print(

"""
frac=0.07
re shape
0    0.611461
1    0.388539
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.77      0.99      0.87       189
           1       0.99      0.56      0.71       129

    accuracy                           0.82       318
   macro avg       0.88      0.78      0.79       318
weighted avg       0.86      0.82      0.80       318

col_0         0   1
fraudulent         
0           188   1
1            57  72""")

with open("model_svm.pkl", "wb") as f:
    pickle.dump(mymodel["model"], f)
    
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(mymodel["vectorizer"], f)

{'model': SVC(), 'vectorizer': TfidfVectorizer(lowercase=False, max_features=50000, ngram_range=(1, 3))}
[1]

frac=0.07
re shape
0    0.611461
1    0.388539
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.77      0.99      0.87       189
           1       0.99      0.56      0.71       129

    accuracy                           0.82       318
   macro avg       0.88      0.78      0.79       318
weighted avg       0.86      0.82      0.80       318

col_0         0   1
fraudulent         
0           188   1
1            57  72


## Others tested tuning

### With a sample of 12% of true

In [88]:

df = pd.read_csv("dataset.csv", sep = ";")

df['description'] = df['description'].apply(lambda x : clean_data(x))
df = cleandataset(df)

# check class distribution
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

# frac = [x * 0.01 for x in range(6, 14)]
model = trainer(df, svm.SVC(), df_fraud, df_true, 0.12)

print("end")

0    0.95731
1    0.04269
Name: fraudulent, dtype: float64

frac = 0.12
re shape
0    0.729103
1    0.270897
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.79      1.00      0.88       322
           1       0.98      0.39      0.55       135

    accuracy                           0.82       457
   macro avg       0.89      0.69      0.72       457
weighted avg       0.85      0.82      0.79       457

col_0         0   1
fraudulent         
0           321   1
1            83  52

end


### With a loop for testing a diversity of values of df_true.shape

In [107]:
df = pd.read_csv("dataset.csv", sep = ";")

df['description'] = df['description'].apply(lambda x : clean_data(x))
df = cleandataset(df)

# check class distribution
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

frac = [x * 0.01 for x in range(3, 14)]
models = [trainer(df, svm.SVC(), df_fraud, df_true, step) for step in frac]

print("end")

0    0.95741
1    0.04259
Name: fraudulent, dtype: float64

frac = 0.03
re shape
1    0.597289
0    0.402711
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.82      0.70      0.76        84
           1       0.81      0.89      0.85       123

    accuracy                           0.82       207
   macro avg       0.82      0.80      0.80       207
weighted avg       0.82      0.82      0.81       207

col_0        0    1
fraudulent         
0           59   25
1           13  110


frac = 0.04
re shape
1    0.526451
0    0.473549
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.82      0.78      0.80       114
           1       0.80      0.84      0.82       121

    accuracy                           0.81       235
   macro avg       0.81      0.81      0.81       235
weighted avg       0.81      0.81      0.81       235

col_0        0    1
fraudulent         


In [81]:
df = pd.read_csv("dataset.csv", sep = ";")

df['description'] = df['description'].apply(lambda x : clean_data(x))
df = cleandataset(df)

# check class distribution/
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

# By steps of 10%
frac = [x * 0.1 for x in range(1, 10)]
models = [trainer(df, svm.SVC(), df_fraud, df_true, step) for step in frac]

print("end")

0    0.95731
1    0.04269
Name: fraudulent, dtype: float64

frac = 0.1
re shape
0    0.691579
1    0.308421
Name: fraudulent, dtype: float64


TypeError: file must have a 'write' attribute

### An example of the behavior with no deduplication of datas

In [52]:
df = pd.read_csv("dataset.csv", sep = ";")

def cleandataset(df):
    df["description"] = df["description"].str.replace("&amp;", "&", regex=False)
    df["description"] = df["description"].str.replace("\xa0", " ", regex=False)
    df["description"] = df['description'].str.replace("!", "! ", regex=False)
    df["description"] = df['description'].str.replace("?", "? ", regex=False)
    df["description"] = df['description'].str.replace(":", " ", regex=False)
    df["description"] = df['description'].str.replace("...", " ", regex=False)
    df["description"] = df['description'].str.replace("  +", " ", regex=True)
    df["description"] = df['description'].str.replace("([a-z]{2,})([A-Z])", "\g<1> \g<2>", regex=True)
    df["description"] = df['description'].str.replace("([a-z\.]{2,})([A-Z])", "\g<1> \g<2>", regex=True)
    df["description"] = df['description'].str.lower()
    
    df.dropna(inplace=True)
#     df.drop_duplicates(inplace=True)
    return df


def clean_data(text):
#     text = re.sub('[^a-zA-Z]' , ' ' , text)
    if type(text) is float:
        return ""
    tokens = text.split() 
    tokens = [lemmatizer.lemmatize(word) for word in tokens if not word in stopwords]  
    clean_description = ' '.join(tokens)     
    return clean_description

df['description'] = df['description'].apply(lambda x : clean_data(x))


print(df.shape)
df = cleandataset(df)
print(df.shape)


# check class distribution
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

print(df_fraud.shape)

df_true_sample = df_true.sample(frac=0.1, random_state=0)
print(df_true.shape)
print(df_true_sample.shape)

df_reshape = pd.concat([df_fraud, df_true_sample])
df_reshape = df_reshape.sample(frac=1)
# check class distribution
print(df_reshape['fraudulent'].value_counts(normalize = True))

vectorizer = TfidfVectorizer(
    max_features = 50000 , 
    lowercase=False , 
    ngram_range=(1,3))

X = df_reshape.description
y = df_reshape.fraudulent

from sklearn.model_selection import train_test_split

train_X , test_X , train_y , test_y = train_test_split(X , y , test_size = 0.2 ,random_state = 0)

vec_train = vectorizer.fit_transform(train_X)
vec_train = vec_train.toarray()

vec_test = vectorizer.transform(test_X).toarray()

train_data = pd.DataFrame(vec_train , columns=vectorizer.get_feature_names())
test_data = pd.DataFrame(vec_test , columns= vectorizer.get_feature_names())

from sklearn import svm
model_svm = svm.SVC()

model_svm.fit(train_data, train_y)
predictions  = model_svm.predict(test_data)

print(classification_report(test_y , predictions))
# confusion matrix
print(pd.crosstab(test_y, predictions))

model_svm2 = model_svm


(17880, 2)
(17880, 2)
0    0.951566
1    0.048434
Name: fraudulent, dtype: float64
(866, 2)
(17014, 2)
(1701, 2)
0    0.662641
1    0.337359
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.87      0.99      0.93       344
           1       0.98      0.70      0.82       170

    accuracy                           0.89       514
   macro avg       0.92      0.85      0.87       514
weighted avg       0.90      0.89      0.89       514

col_0         0    1
fraudulent          
0           341    3
1            51  119


# The diversity of models

I had try a lot of models before to chose one.

## SVM
The tuning by class_weight doesn't change the game. (Maybe others parameters ?)

In [78]:
from sklearn import svm

df = pd.read_csv("dataset.csv", sep = ";")

df['description'] = df['description'].apply(lambda x : clean_data(x))
df = cleandataset(df)

# check class distribution/
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

frac = [x * 0.1 for x in range(0, 10)]
model = trainer(df, svm.SVC(class_weight={1: 22}), df_fraud, df_true, 1)

print("end")

# with open('model_svm.pkl', "wb") as f:
#     pickle.dump(model, f)
print(pd.crosstab(test_y, predictions))

0    0.95731
1    0.04269
Name: fraudulent, dtype: float64

frac = 1
re shape
0    0.95731
1    0.04269
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2764
           1       0.93      0.40      0.56       136

    accuracy                           0.97      2900
   macro avg       0.95      0.70      0.77      2900
weighted avg       0.97      0.97      0.97      2900

col_0          0   1
fraudulent          
0           2760   4
1             81  55

end


TypeError: file must have a 'write' attribute

In [83]:
with open('model_svm.pkl', "wb") as f:
    pickle.dump(model, f)
print(pd.crosstab(test_y, predictions))

col_0         0   1
fraudulent         
0           233   7
1            49  57


## Naive Bayes - MultinomailNB
Could be usefull if we want to never have a true classed like fake, but with less trues falses.

In [72]:
from sklearn.naive_bayes import MultinomialNB

df = pd.read_csv("dataset.csv", sep = ";")

df['description'] = df['description'].apply(lambda x : clean_data(x))
df = cleandataset(df)

# check class distribution/
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

frac = [x * 0.01 for x in range(6, 15)]
models = [trainer(df, MultinomialNB(), df_fraud, df_true, step) for step in frac]

print("end")

0    0.95731
1    0.04269
Name: fraudulent, dtype: float64

frac = 0.06
re shape
0    0.573691
1    0.426309
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.68      0.99      0.80       163
           1       0.98      0.39      0.56       128

    accuracy                           0.73       291
   macro avg       0.83      0.69      0.68       291
weighted avg       0.81      0.73      0.70       291

col_0         0   1
fraudulent         
0           162   1
1            78  50


frac = 0.07
re shape
0    0.610937
1    0.389063
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.69      0.99      0.81       195
           1       0.97      0.29      0.45       124

    accuracy                           0.72       319
   macro avg       0.83      0.64      0.63       319
weighted avg       0.80      0.72      0.67       319

col_0         0   1
fraudulent         


## RandomForest
The result was not good.

In [73]:
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv("dataset.csv", sep = ";")

df['description'] = df['description'].apply(lambda x : clean_data(x))
df = cleandataset(df)

# check class distribution/
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

frac = [x * 0.01 for x in range(6, 15)]
models = [trainer(df, RandomForestClassifier(n_estimators=20), df_fraud, df_true, step) for step in frac]

print("end")

0    0.95731
1    0.04269
Name: fraudulent, dtype: float64

frac = 0.06
re shape
0    0.573691
1    0.426309
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.79      0.91      0.85       163
           1       0.86      0.69      0.77       128

    accuracy                           0.81       291
   macro avg       0.83      0.80      0.81       291
weighted avg       0.82      0.81      0.81       291

col_0         0   1
fraudulent         
0           149  14
1            40  88


frac = 0.07
re shape
0    0.610937
1    0.389063
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.79      0.95      0.86       195
           1       0.89      0.60      0.71       124

    accuracy                           0.82       319
   macro avg       0.84      0.78      0.79       319
weighted avg       0.83      0.82      0.81       319

col_0         0   1
fraudulent         


## ExtraTreesClassifier
Similare results than svm, but less good.

In [74]:
from sklearn.ensemble import ExtraTreesClassifier
df = pd.read_csv("dataset.csv", sep = ";")

df['description'] = df['description'].apply(lambda x : clean_data(x))
df = cleandataset(df)

# check class distribution/
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

frac = [x * 0.01 for x in range(6, 15)]
models = [trainer(df, ExtraTreesClassifier(n_estimators=20), df_fraud, df_true, step) for step in frac]

print("end")

0    0.95731
1    0.04269
Name: fraudulent, dtype: float64

frac = 0.06
re shape
0    0.573691
1    0.426309
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.76      0.95      0.84       163
           1       0.91      0.62      0.73       128

    accuracy                           0.80       291
   macro avg       0.83      0.78      0.79       291
weighted avg       0.83      0.80      0.80       291

col_0         0   1
fraudulent         
0           155   8
1            49  79


frac = 0.07
re shape
0    0.610937
1    0.389063
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.79      0.96      0.87       195
           1       0.90      0.60      0.72       124

    accuracy                           0.82       319
   macro avg       0.85      0.78      0.79       319
weighted avg       0.83      0.82      0.81       319

col_0         0   1
fraudulent         


## IsolationForest

In [76]:
from sklearn.ensemble import IsolationForest

df = pd.read_csv("dataset.csv", sep = ";")

df['description'] = df['description'].apply(lambda x : clean_data(x))
df = cleandataset(df)

# check class distribution/
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

frac = [x * 0.1 for x in range(1, 10)]
models = [trainer(df, IsolationForest(contamination=float(0.01),random_state=42), df_fraud, df_true, step) for step in frac]

print("end")

0    0.95731
1    0.04269
Name: fraudulent, dtype: float64

frac = 0.1
re shape
0    0.691579
1    0.308421
Name: fraudulent, dtype: float64


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.00      0.00      0.00       264
           1       0.34      0.99      0.51       138

    accuracy                           0.34       402
   macro avg       0.11      0.33      0.17       402
weighted avg       0.12      0.34      0.18       402

col_0       -1    1
fraudulent         
0            3  261
1            1  137


frac = 0.2
re shape
0    0.817673
1    0.182327
Name: fraudulent, dtype: float64


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.00      0.00      0.00       549
           1       0.19      1.00      0.33       130

    accuracy                           0.19       679
   macro avg       0.06      0.33      0.11       679
weighted avg       0.04      0.19      0.06       679

col_0       -1    1
fraudulent         
0            9  540
1            0  130


frac = 0.30000000000000004
re shape
0    0.870583
1    0.129417
Name: fraudulent, dtype: float64


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.00      0.00      0.00       824
           1       0.14      1.00      0.24       133

    accuracy                           0.14       957
   macro avg       0.05      0.33      0.08       957
weighted avg       0.02      0.14      0.03       957

col_0       -1    1
fraudulent         
0            4  820
1            0  133


frac = 0.4
re shape
0    0.899692
1    0.100308
Name: fraudulent, dtype: float64


KeyboardInterrupt: 

## KNeighborsClassifier
Well balanced results.

In [77]:
from sklearn.neighbors import KNeighborsClassifier

df = pd.read_csv("dataset.csv", sep = ";")

df['description'] = df['description'].apply(lambda x : clean_data(x))
df = cleandataset(df)

# check class distribution/
print(df['fraudulent'].value_counts(normalize = True))

df_fraud = df[df.fraudulent == 1]
df_true = df[df.fraudulent == 0]

frac = [x * 0.01 for x in range(6, 15)]
models = [trainer(df, KNeighborsClassifier(n_neighbors=2), df_fraud, df_true, step) for step in frac]

print("end")

0    0.95731
1    0.04269
Name: fraudulent, dtype: float64

frac = 0.06
re shape
0    0.573691
1    0.426309
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       163
           1       0.85      0.83      0.84       128

    accuracy                           0.86       291
   macro avg       0.86      0.86      0.86       291
weighted avg       0.86      0.86      0.86       291

col_0         0    1
fraudulent          
0           145   18
1            22  106


frac = 0.07
re shape
0    0.610937
1    0.389063
Name: fraudulent, dtype: float64
              precision    recall  f1-score   support

           0       0.86      0.87      0.87       195
           1       0.79      0.77      0.78       124

    accuracy                           0.83       319
   macro avg       0.83      0.82      0.82       319
weighted avg       0.83      0.83      0.83       319

col_0         0   1
fraudulent      

## Ensemble classifier

Too long to train, then to test tuning. Maybe later.

In [20]:
# Bagged Decision Trees for Classification - necessary dependencies

from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [22]:
kfold = model_selection.KFold(n_splits=10, random_state=7, shuffle=True)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=7)
results = model_selection.cross_val_score(model, train_data, train_y, cv=kfold)
print(results.mean())

0.833946895165556


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [26]:

print("score train")
print(model.score(train_data, train_y))
print("score test")
print(model.score(test_data, test_y))

print("MSE : ", np.sqrt(((test_y - model.predict(test_data)) ** 2).mean()))

NotFittedError: This BaggingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# AdaBoost Classification

from sklearn.ensemble import AdaBoostClassifier
seed = 7
num_trees = 70
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

print(classification_report(test_y , predictions))
# confusion matrix
pd.crosstab(test_y, predictions)

In [None]:
# Voting Ensemble for Classification

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())


print(classification_report(test_y , predictions))
# confusion matrix
pd.crosstab(test_y, predictions)

## xgboost

Too long to train. Must write a f1-score + recall score + confusion matrix... First results are very bad.

In [18]:
import xgboost as xgb
import numpy as np


clf = xgb.XGBRegressor(random_state=0)

# clf = xgb.XGBRegressor(random_state=0, n_jobs=6, max_depth=8, grow_policy='lossguide', max_leaves=100,
#                              max_bin=64, reg_alpha=0, reg_lambda=0, n_estimators=100, learning_rate=0.1,
#                              tree_method='auto')

clf.fit(train_data, train_y)

print("score train")
print(clf.score(train_data, train_y))
print("score test")
print(clf.score(test_data, test_y))

print("MSE : ", np.sqrt(((test_y - clf.predict(test_data)) ** 2).mean()))

score train
0.9791990093389004
score test
0.4548039952330938
MSE :  0.3679619445889928


In [None]:
# from sklearn.metrics import classification_report, confusion_matrix
# cm = confusion_matrix(test_y, predictions)
# cm