In [71]:
# Import pandas for data handling
import pandas as pd

# NLTK is our Natural-Language-Took-Kit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import our classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Import some ML helper function
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

# Import our metrics to evaluate our model
from sklearn import metrics

# Library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stopwords = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alexv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alexv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [72]:
#importing our data 
df=pd.read_csv('../Data/fake_job_postings.csv')
print(df.shape)
df.head()

(17880, 18)


Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [73]:
print(df.isnull().sum())

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2695
benefits                7210
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64


In [74]:
print(df.duplicated().sum())

0


In [75]:
df.industry.value_counts()

Information Technology and Services    1734
Computer Software                      1376
Internet                               1062
Marketing and Advertising               828
Education Management                    822
                                       ... 
Shipbuilding                              1
Sporting Goods                            1
Museums and Institutions                  1
Wine and Spirits                          1
Ranching                                  1
Name: industry, Length: 131, dtype: int64

In [76]:
#Lowercase all words

def make_lower(a_string):
    return a_string.lower()
a_sentence='This was A SENTENCE with lower and UPPER CASE.'

make_lower(a_sentence)

'this was a sentence with lower and upper case.'

In [77]:
#remove all punctuation

def remove_punctuation(a_string):
    a_string = re.sub(r'[^\w\s]','',a_string)
    
    return a_string

a_sentence = 'This is a sentence! 50 With lots of punctuation??? & other #things.'
remove_punctuation(a_sentence)

'This is a sentence 50 With lots of punctuation  other things'

In [78]:
# Remove all stopwords

def remove_stopwords(a_string):
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)
    
    # Make a list to append valid words into
    valid_words = []
    
    # Loop through all the words
    for word in words:
        
        # Check if word is not in stopwords
        if word not in stopwords:
            
            # If word not in stopwords, append to our valid_words
            valid_words.append(word)

    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string
            
a_sentence = 'This is a sentence! With some different stopwords i have added in here.'
remove_stopwords(a_sentence)

'This sentence ! With different stopwords added .'

In [79]:
# Break words into their stem words

def stem_words(a_string):
    # Initalize our Stemmer
    porter = PorterStemmer()
    
    # Break the sentence down into a list of words
    words = word_tokenize(a_string)
    
    # Make a list to append valid words into
    valid_words = []

    # Loop through all the words
    for word in words:
        # Stem the word
        stemmed_word = porter.stem(word)
        
        # Append stemmed word to our valid_words
        valid_words.append(stemmed_word)
        
    # Join the list of words together into a string
    a_string = ' '.join(valid_words)

    return a_string 


a_sentence = 'I played and started playing with players and we all love to play with plays'
stem_words(a_sentence)

'i play and start play with player and we all love to play with play'

In [80]:
# Lemmatize words with pos-tags

# def convert_pos(pos):
#     if pos.startswith('V'):
#         return wordnet.VERB
#     elif pos.startswith('J'):
#         return wordnet.ADJ
#     elif pos.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN


# def lem_with_pos_tag(a_string):
#     # Initalize our Lemmer
#     lemmatizer = WordNetLemmatizer()
    
#     # Break the sentence down into a list of words
#     words = word_tokenize(a_string)
    
#     # Get the word and pos_tag for each of the words. 
#     tagged_words = nltk.pos_tag(words)
    
#     # Make a list to append valid words into
#     valid_words = []

#     # Loop through all the words
#     for word in tagged_words:
        
#         # The word is the first element in the tuple
#         the_word = word[0]
        
#         # The pos_tag is the second element in the tuple
#         the_pos_tag = word[1]
        
#         # Convert the pos_tag into the format the lemmatizer accepts
#         the_pos_tag = convert_pos(the_pos_tag)
        
#         # Lemmatize the word with the pos_tag
#         lemmed_word = lemmatizer.lemmatize(the_word, the_pos_tag)
        
#         # Append stemmed word to our valid_words
#         valid_words.append(lemmed_word)
        
#     # Join the list of words together into a string
#     a_string = ' '.join(valid_words)

#     return a_string 

# a_sentence = 'I played and started playing with players and we all love to play with plays'
# another_sentence = 'This is because she wanted to go outside with her friends and play basketball.'
# lem_with_pos_tag(another_sentence)

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle[0m

  Searched in:
    - 'C:\\Users\\Alexv/nltk_data'
    - 'c:\\Users\\Alexv\\anaconda3\\nltk_data'
    - 'c:\\Users\\Alexv\\anaconda3\\share\\nltk_data'
    - 'c:\\Users\\Alexv\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Alexv\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************
