## Import packages and load data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno 
import matplotlib

from google.colab import files
import io

from google.colab import drive
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

import re
import gensim
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Record the library versions this notebook was run with.
for label, module in (
    ('pandas', pd),
    ('numpy', np),
    ('seaborn', sns),
    ('plt', matplotlib),
    ('nltk', nltk),
    ('gensim', gensim),
):
    print(label + ': ' + module.__version__)

pandas: 1.3.5
numpy: 1.21.6
seaborn: 0.11.2
plt: 3.2.2
nltk: 3.7
gensim: 3.6.0


## Loading Files

#### GDrive

In [None]:
## Option A (Colab only): mount Google Drive and read the CSV from it.
## Left disabled; the "Local" cell below is the loader that actually runs.
# drive.mount('/content/gdrive')
# jobs_df = pd.read_csv('gdrive/My Drive/BT4012/fake_job_postings.csv')

## Option B (Colab only): upload the CSV through the browser instead.
#ul = files.upload()
#jobs_df = pd.read_csv(io.BytesIO(ul['fake_job_postings.csv']))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### Local

In [None]:
# Load the raw job postings; the path is relative to the notebook's working directory.
jobs_df = pd.read_csv('../datasets/fake_job_postings.csv')

## Cleaning Data



#### Cleaning Data Types

In [None]:
# Target dtype per column. Only the non-null rows are cast; assigning the
# result back aligns on the index, so originally-missing rows stay missing
# instead of turning into the literal string 'nan'.
column_dtypes = {
    'job_id': 'str',
    'salary_range': 'str',  # classify as string as there are 875 unique values, too many categories
    'telecommuting': 'boolean',
    'has_company_logo': 'boolean',
    'has_questions': 'boolean',
    'employment_type': 'category',
    'location': 'category',
    'required_education': 'category',
    'required_experience': 'category',
    'function': 'category',
    'fraudulent': 'category',
    'industry': 'str',  # 132 cats, too many categories
    'company_profile': 'str',
    'benefits': 'str',
    'description': 'str',
    'requirements': 'str',
}

for column, dtype in column_dtypes.items():
    present = jobs_df[column].notna()
    jobs_df[column] = jobs_df.loc[present, column].astype(dtype)

#### Categorising Presence of Salary

In [None]:
# 1 when a salary range was given, 0 when it is missing.
# Uses notna() rather than `x is not np.nan`: the identity test only works
# when every missing value happens to be the very same np.nan object, and it
# silently reports "present" for any other NaN/NA representation.
jobs_df['salary_indicated'] = jobs_df['salary_range'].notna().astype('int64')

In [None]:
# The raw range is no longer needed once the salary_indicated flag exists.
jobs_df = jobs_df.drop(columns=['salary_range'])

#### Cleaning Location

- standardise to country code

In [None]:
# Keep only the first two characters of location
# (presumably the country code at the start of the string - TODO confirm against the raw data).
jobs_df['location']= jobs_df['location'].str[:2]

#### Cleaning Industry

In [None]:
# Filling in nulls by imputing industry based on company_profile:
# a posting with no industry receives the first known industry whose name
# appears (case-insensitively) inside its company_profile text.
list_of_available_industries = [x for x in jobs_df['industry'].unique() if pd.notnull(x)]
new_industry_list = []

for _, posting in jobs_df.iterrows():
    resolved = posting['industry']
    profile = posting['company_profile']
    # Only search when the industry is missing and there is a profile to search in.
    if str(resolved) == 'nan' and str(profile) != 'nan':
        profile_lower = profile.lower()
        # Falls back to the original (missing) value when nothing matches.
        resolved = next(
            (name for name in list_of_available_industries if name.lower() in profile_lower),
            resolved,
        )
    new_industry_list.append(resolved)

In [None]:
# Write the imputed values back; new_industry_list holds one entry per row, in row order.
jobs_df['industry'] = new_industry_list # Cleaned industry nulls reduced from 4903 to 3604

#### Reducing Cardinality

In [None]:
#Education
# Collapse the detailed education levels into four coarse groups.
education_groups = {
    'Vocational': 'Vocational',
    'Vocational - Degree': 'Vocational',
    'Vocational - HS Diploma': 'Vocational',
    'Some High School Coursework': 'Normal education',
    'Some College Coursework Completed': 'Normal education',
    'High School or equivalent': 'Normal education',
    'Associate Degree': 'Higher Education',
    "Bachelor's Degree": 'Higher Education',
    "Master's Degree": 'Highest Education',
    'Professional': 'Highest Education',
    'Doctorate': 'Highest Education',
}
jobs_df['required_education'] = jobs_df['required_education'].replace(education_groups)

In [None]:
#Industry
# Collapse the fine-grained industry labels into a small set of broad sectors.
# Labels that do not appear as keys below are left unchanged by replace().
# NOTE(review): a few assignments look unusual and are worth confirming, e.g.
# 'Insurance' -> 'Healthcare', 'Staffing and Recruiting' -> 'Hospitality',
# 'Alternative Dispute Resolution' -> 'Hospitality',
# 'Consumer Electronics' / 'Facilities Services' -> 'Entertainment'.
jobs_df['industry'] = jobs_df['industry'].replace({'Public Policy' : 'Government and Legislation',
                                                  'Sports' : 'Entertainment',
                                                  'Motion Pictures and Film' : 'Entertainment',
                                                  'Transportation/Trucking/Railroad' : 'Transportation',
                                                  'Maritime' : 'Transportation',
                                                  'Warehousing' : 'Manufacturing',
                                                  'Cosmetics' : 'Fashion',
                                                  'Medical Devices' : 'Healthcare',
                                                  'Computer Networking' : 'Computer and Technology',
                                                  'Commercial Real Estate' : 'Construction',
                                                  'Information Services' : 'Computer and Technology',
                                                  'Online Media' : 'Media and News',
                                                  'Individual & Family Services' : 'Non-Profit Organisation',
                                                  'Utilities' : 'Energy',
                                                  'Plastics' : 'Manufacturing',
                                                  'Farming' : 'Agriculture',
                                                  'Research' : 'Education',
                                                  'Biotechnology' : 'Computer and Technology',
                                                  'Logistics and Supply Chain' : 'Manufacturing',
                                                  'Graphic Design' : 'Advertising and marketing',
                                                  'Airlines/Aviation' : 'Aerospace',
                                                  'Computer Games' : 'Computer and Technology',
                                                  'Real Estate' : 'Construction',
                                                  'Human Resources' : 'HR and Consulting',
                                                  'Public Safety' : 'Government and Legislation',
                                                  'Retail' : 'Fashion',
                                                  'Animation' : 'Entertainment',
                                                  'Shipbuilding' : 'Construction',
                                                  'Fishery' : 'Agriculture',
                                                  'Outsourcing/Offshoring' : 'Manufacturing',
                                                  'Medical Practice' : 'Healthcare',
                                                  'Hospital & Health Care' : 'Healthcare',
                                                  'Apparel & Fashion' : 'Fashion',
                                                  'Military' : 'Government and Legislation',
                                                  'Education Management' : 'Education',
                                                  'Performing Arts' : 'Entertainment',
                                                  'Computer Hardware' : 'Computer and Technology',
                                                  'Industrial Automation' : 'Computer and Technology',
                                                  'Civil Engineering' : 'Construction',
                                                  'Government Administration' : 'Government and Legislation',
                                                  'Hospitality' : 'Hospitality',
                                                  'Broadcast Media' : 'Media and News',
                                                  'Telecommunications' : 'Telecommunication',
                                                  'Professional Training & Coaching' : 'Education',
                                                  'Luxury Goods & Jewelry' : 'Fashion',
                                                  'Security and Investigations' : 'Government and Legislation',
                                                  'Computer & Network Security' : 'Computer and Technology',
                                                  'Packaging and Containers' : 'Manufacturing',
                                                  'Entertainment' : 'Entertainment',
                                                  'Civic & Social Organization' : 'Non-Profit Organisation',
                                                  'Business Supplies and Equipment' : 'Manufacturing',
                                                  'Mental Health Care' : 'Healthcare',
                                                  'Import and Export' : 'Manufacturing',
                                                  'Legal Services' : 'Government and Legislation',
                                                  'Oil & Energy' : 'Energy',
                                                  'Fund-Raising' : 'Non-Profit Organisation',
                                                  'Computer Software' : 'Computer and Technology',
                                                  'Leisure, Travel & Tourism' : 'Hospitality',
                                                  'Information Technology and Services' : 'Computer and Technology',
                                                  'Banking' : 'Finance and Economic',
                                                  'Mechanical or Industrial Engineering' : 'Manufacturing',
                                                  'Executive Office' : 'HR and Consulting',
                                                  'Furniture' : 'Manufacturing',
                                                  'Sporting Goods' : 'Fashion',
                                                  'Translation and Localization' : 'Media and News',
                                                  'Media Production' : 'Media and News',
                                                  'Capital Markets' : 'Finance and Economic',
                                                  'Food & Beverages' : 'Food and Beverage',
                                                  'Internet' : 'Computer and Technology',
                                                  'Primary/Secondary Education' : 'Education',
                                                  'Photography' : 'Entertainment',
                                                  'Gambling & Casinos' : 'Entertainment',
                                                  'Consumer Goods' : 'Manufacturing',
                                                  'Design' : 'Fashion',
                                                  'Electrical/Electronic Manufacturing' : 'Manufacturing',
                                                  'Wholesale' : 'Manufacturing',
                                                  'Venture Capital & Private Equity' : 'Finance and Economic',
                                                  'Financial Services' : 'Finance and Economic',
                                                  'Music' : 'Entertainment',
                                                  'Events Services' : 'Entertainment',
                                                  'Mining & Metals' : 'Manufacturing',
                                                  'Machinery' : 'Manufacturing',
                                                  'Package/Freight Delivery' : 'Transportation',
                                                  'Architecture & Planning' : 'Construction',
                                                  'Nanotechnology' : 'Computer and Technology',
                                                  'Consumer Services' : 'Hospitality',
                                                  'Program Development' : 'HR and Consulting',
                                                  'Management Consulting' : 'HR and Consulting',
                                                  'Consumer Electronics' : 'Entertainment',
                                                  'Publishing' : 'Media and News',
                                                  'Building Materials' : 'Construction',
                                                  'Chemicals' : 'Manufacturing',
                                                  'Law Practice' : 'Government and Legislation',
                                                  'Construction' : 'Construction',
                                                  'Philanthropy' : 'Non-Profit Organisation',
                                                  'Accounting' : 'Finance and Economic',
                                                  'Investment Management' : 'Finance and Economic',
                                                  'Environmental Services' : 'Energy',
                                                  'Investment Banking' : 'Finance and Economic',
                                                  'Automotive' : 'Transportation',
                                                  'Renewables & Environment' : 'Energy',
                                                  'Wireless' : 'Computer and Technology',
                                                  'Writing and Editing' : 'Media and News',
                                                  'Government Relations' : 'Government and Legislation',
                                                  'Libraries' : 'Education',
                                                  'Textiles' : 'Manufacturing',
                                                  'Higher Education' : 'Education',
                                                  'Ranching' : 'Agriculture',
                                                  'Nonprofit Organization Management' : 'Non-Profit Organisation',
                                                  'Semiconductors' : 'Manufacturing',
                                                  'Facilities Services' : 'Entertainment',
                                                  'Religious Institutions' : 'Non-Profit Organisation',
                                                  'Pharmaceuticals' : 'Healthcare',
                                                  'Insurance' : 'Healthcare',
                                                  'International Trade and Development' : 'Finance and Economic',
                                                  'Printing' : 'Manufacturing',
                                                  'Health, Wellness and Fitness' : 'Healthcare',
                                                  'Wine and Spirits' : 'Food and Beverage',
                                                  'Restaurants' : 'Food and Beverage',
                                                  'Market Research' : 'HR and Consulting',
                                                  'Aviation & Aerospace' : 'Aerospace',
                                                  'Defense & Space' : 'Aerospace',
                                                  'Veterinary' : 'Healthcare',
                                                  'Marketing and Advertising' : 'Advertising and marketing',
                                                  'E-Learning' : 'Education',
                                                  'Law Enforcement' : 'Government and Legislation',
                                                  'Staffing and Recruiting' : 'Hospitality',
                                                  'Public Relations and Communications' : 'Media and News',
                                                  'Food Production' : 'Food and Beverage',
                                                  'Museums and Institutions' : 'Education',
                                                  'Alternative Dispute Resolution' : 'Hospitality'})

#### Cleaning text: Concatenate and lemmatise relevant text

In [None]:
# (rows, columns) before the combined text column is built.
jobs_df.shape

(17880, 18)

In [None]:
# Free-text fields merged into a single document per posting.
text_columns = ["company_profile", "description", "requirements", "benefits"]


def _join_present_text(row):
    """Join the non-missing text fields of one posting with single spaces."""
    return ' '.join(row.dropna().astype(str))


jobs_df['concat_text'] = jobs_df[text_columns].apply(_join_present_text, axis=1)

In [None]:
# Trim surrounding whitespace and mark postings with no text at all as missing,
# so a later dropna on concat_text can remove them.
jobs_df['concat_text'] = jobs_df['concat_text'].str.strip().replace("", np.nan)

In [None]:
# Remove postings whose combined text is missing.
jobs_df = jobs_df.dropna(subset = ['concat_text'])

In [None]:
# Shape check after removing text-less postings; concat_text adds one column.
jobs_df.shape

(17879, 19)

In [None]:
def remove_link_punc(string):
    """Normalise raw posting text to lower-case, space-separated words.

    Steps, applied in order:
      1. remove http/https URLs,
      2. remove every whitespace-delimited token that contains a digit,
      3. remove `&...` runs (e.g. HTML entities such as `&amp;`),
      4. replace every non-letter character with a space,
      5. split camelCase joins ("TeamWork" -> "Team Work"),
      6. collapse runs of whitespace, lower-case and trim.

    Args:
        string: the raw text of one posting.

    Returns:
        The cleaned text; an empty string if nothing survives.
    """
    # Raw strings throughout: "\S", "\d" and "\s" inside ordinary string
    # literals are invalid escape sequences that newer Python versions warn about.
    substitutions = (
        (r'http[s]?://[^ ]+', ' '),
        (r'\S*\d\S*', ' '),
        (r'&[^\s]+', ' '),
        (r'[^a-zA-Z]', ' '),
        (r'([a-z])([A-Z])', r'\1 \2'),
        (r'\s+', ' '),
    )
    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower().strip()

In [None]:
# Clean every posting's combined text with remove_link_punc.
jobs_df['clean_text'] = jobs_df['concat_text'].apply(remove_link_punc)

In [None]:
def tokenize_lemmatize(string):
    """Tokenise `string` with NLTK and return the list of lemmatised tokens."""
    lemmatize = WordNetLemmatizer().lemmatize
    tokens = word_tokenize(string)
    return list(map(lemmatize, tokens))

In [None]:
# One list of lemmatised tokens per posting, built from the cleaned text.
jobs_df['lst_tokens'] = jobs_df['clean_text'].apply(tokenize_lemmatize)

In [None]:
# clean_text was only an intermediate for tokenisation.
jobs_df = jobs_df.drop(columns=['clean_text'])

#### Cleaning text: Remove non-English text

- The textual data may contain languages other than English; some job postings mix English and a foreign language. However, this does not appear in any of the fraud cases seen.

- Since language is therefore not a useful feature for telling fraudulent and non-fraudulent postings apart, we filter the non-English postings out.

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): consider pinning a langdetect version for reproducibility.
%pip install langdetect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import langdetect

# langdetect is randomised by default, so the same text can be labelled
# differently between runs; fixing the seed makes the filter reproducible.
langdetect.DetectorFactory.seed = 0

# Not used in the cells visible here; kept in case a later cell relies on it.
text_df = pd.DataFrame()

# Language is detected on company_profile (chosen as it has the least null values).
# NOTE(review): dropna() below also discards every posting WITHOUT a company
# profile, and jobs_df is rebuilt from copy_df in the next cell - confirm that
# losing those rows (not only the non-English ones) is intended.
copy_df = jobs_df.copy().dropna(subset = ['company_profile'])
txt = copy_df.loc[:,'company_profile']
# Whitespace-only profiles get an empty language code instead of being passed to the detector.
copy_df['lang'] = txt.apply(lambda x: langdetect.detect(x) if x.strip() != "" else "")

In [None]:
# Keep only postings detected as English. Exact comparison instead of
# str.contains('en'), which would also keep any other code containing "en".
jobs_df = copy_df[copy_df['lang'] == 'en']

In [None]:
# Shape after keeping only English postings that have a company profile.
jobs_df.shape

(14396, 21)

In [None]:
# The language code was only needed for filtering.
jobs_df = jobs_df.drop(columns=['lang'])

#### Remove unneeded columns

In [None]:
# title and department are not used as features.
jobs_df = jobs_df.drop(columns=['title', 'department'])

## Feature Engineering

In [None]:
def _word_count(words):
    """Return the number of whitespace-separated words; 0 for a missing (float NaN) profile."""
    if type(words) == float:
        return 0
    return len(words)


jobs_df['num_words_company_profile'] = jobs_df['company_profile'].str.split().map(_word_count)

## Train test split

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Every column except the label is a candidate feature.
cols_remove = ['fraudulent']
X_cols = [column for column in jobs_df.columns if column not in cols_remove]

In [None]:
# Train test split
X, y = jobs_df[X_cols], jobs_df['fraudulent']

# 80-20 split, shuffled with a fixed seed and stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2022,
    shuffle=True,
    stratify=y,
)

In [None]:
# Renumber rows 0..n-1; later cells index X_train['lst_tokens'] by position label.
X_train = X_train.reset_index(drop = True)

In [None]:
# Renumber rows 0..n-1; later cells index X_test['lst_tokens'] by position label.
X_test = X_test.reset_index(drop = True)

## Removing unneeded columns

In [None]:
# Identifier and raw text fields; the text is already captured in concat_text / lst_tokens.
raw_columns = ['job_id', 'company_profile', 'description', 'requirements', 'benefits']
X_train = X_train.drop(columns=raw_columns)
X_test = X_test.drop(columns=raw_columns)

## Word Embeddings

In [None]:
# Load the GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is "<word> <v1> <v2> ...".
# NOTE(review): this path points at a mounted Google Drive while the dataset
# is read from '../datasets/' - confirm the file location for local runs.
embeddings_dct = {}
# Explicit UTF-8: the platform default encoding may not be able to decode the file.
with open('gdrive/My Drive/BT4012/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], 'float32')
        embeddings_dct[word] = vector
# No explicit f.close(): the with-block has already closed the file.

In [None]:
# Vocabulary of the loaded embeddings.
glove_words = set(embeddings_dct)
print(f'{len(glove_words)} word vectors')

400000 word vectors


In [None]:
def convert_to_vector(sentence):
    """Average the GloVe vectors of the tokens in `sentence`.

    Tokens that are not in `glove_words` are skipped. Returns a length-100
    vector; it is all zeros when no token has an embedding.
    """
    total = np.zeros(100)
    hits = 0
    for token in sentence:
        if token not in glove_words:
            continue
        total += embeddings_dct[token]
        hits += 1
    return total / hits if hits else total

In [None]:
# One averaged embedding per training posting, in row order.
X_train_vect_avg = [
    convert_to_vector(X_train['lst_tokens'][row])
    for row in range(X_train.shape[0])
]

In [None]:
# Row count here should match the number of training vectors built above.
X_train.shape

(11516, 13)

In [None]:
# Sanity check: number of training vectors and the shape of one vector.
print(f'X_train_vect_avg length: {len(X_train_vect_avg)}\nVector shape: {X_train_vect_avg[0].shape}')

X_train_vect_avg length: 11516
Vector shape: (100,)


In [None]:
# One averaged embedding per test posting, in row order.
X_test_vect_avg = [
    convert_to_vector(X_test['lst_tokens'][row])
    for row in range(X_test.shape[0])
]

In [None]:
# Sanity check: number of test vectors and the shape of one vector.
print(f'X_test_vect_avg length: {len(X_test_vect_avg)}\nVector shape: {X_test_vect_avg[0].shape}')

X_test_vect_avg length: 2880
Vector shape: (100,)


In [None]:
# The text is now represented by the averaged embeddings.
text_feature_columns = ['concat_text', 'lst_tokens']
X_train = X_train.drop(columns=text_feature_columns)
X_test = X_test.drop(columns=text_feature_columns)

## One Hot Encoding

#### Dropping Columns that do not need to be OHE

In [None]:
# The numeric word count is kept out of the one-hot encoding.
numeric_columns = ['num_words_company_profile']
X_train_OHE = X_train.drop(columns=numeric_columns)
X_test_OHE = X_test.drop(columns=numeric_columns)

#### OHE

In [None]:
# standardise boolean to be True/ False
X_train_OHE['salary_indicated'] = X_train_OHE['salary_indicated'].apply(lambda x: x == 1)
# Fix: the test flag must come from the TEST frame. It was previously built
# from X_train_OHE, which overwrote the test rows with (index-aligned) train values.
X_test_OHE['salary_indicated'] = X_test_OHE['salary_indicated'].apply(lambda x: x == 1)
# add new row to account for values in test dataset unseen in train dataset
# (it makes get_dummies create a '<feature>_Unseen' column for the categorical features)
new_row = pd.Series({'location': 'Unseen', 'telecommuting': False, 'has_company_logo' : False, 'has_questions': False,
       'employment_type': 'Unseen', 'required_experience': 'Unseen', 'required_education': 'Unseen',
       'industry': 'Unseen', 'function': 'Unseen', 'salary_indicated': False})
X_train_OHE = pd.concat([X_train_OHE, new_row.to_frame().T], ignore_index=True)

In [None]:
# check number of unique values for each categorical feature in train set
X_train_uniq = pd.DataFrame({
    'attribute': list(X_train_OHE.columns),
    'uniq_val': [X_train_OHE[name].nunique() for name in X_train_OHE.columns],
})

In [None]:
# check number of unique values for each categorical feature in test set
X_test_uniq = pd.DataFrame({
    'attribute': list(X_test_OHE.columns),
    'uniq_val': [X_test_OHE[name].nunique() for name in X_test_OHE.columns],
})

In [None]:
# OHE for train set then remove artificially created row for unseen
# dummy_na=True also adds a '<feature>_nan' indicator column for every encoded feature.
X_train_transformed = pd.get_dummies(data=X_train_OHE, columns=X_train_OHE.columns, dummy_na=True)[:-1]
# Drop the same artificial last row from the un-encoded frame.
X_train_OHE = X_train_OHE[:-1]

In [None]:
# Helper function to one hot encode test set
def OHE_test(df, train_cols):
  """One-hot encode `df` using the dummy columns produced for the training set.

  Every cell is mapped to the column named '<feature>_<value>'. When that
  column does not exist in `train_cols`, the cell is mapped to
  '<feature>_Unseen' instead. Missing values (NaN / None / pd.NA) are mapped
  to '<feature>_nan'.

  Args:
    df: un-encoded test frame; its column names are the feature names.
    train_cols: column labels of the encoded training frame.

  Returns:
    DataFrame of 0/1 integers with one row per row of `df`, columns equal to
    `train_cols`, and a fresh 0..n-1 index.

  Raises:
    ValueError: if a value has neither its own column nor a
      '<feature>_Unseen' column in `train_cols`.
  """
  col_names = list(train_cols)
  # First position of every training column; replaces a list.index() scan per cell.
  col_position = {}
  for pos, name in enumerate(col_names):
    col_position.setdefault(name, pos)

  values = np.array(df)
  # Fill one matrix and build the frame once, instead of concatenating a
  # one-row DataFrame per test row (quadratic in the number of rows).
  encoded = np.zeros((len(df), len(col_names)), dtype=int)

  for row_idx, row in enumerate(values):
    for feature, col_val in zip(df.columns, row):
      if not isinstance(col_val, str):
        if pd.api.types.is_scalar(col_val) and pd.isna(col_val):
          # Every flavour of missing value shares the training '_nan' column.
          col_val = 'nan'
        else:
          col_val = str(col_val)

      target = col_position.get(f'{feature}_{col_val}')
      if target is None:
        target = col_position.get(f'{feature}_Unseen')
      if target is None:
        raise ValueError(
            f"value {col_val!r} of feature {feature!r} has no dummy column "
            f"and there is no '{feature}_Unseen' column in train_cols")
      encoded[row_idx, target] = 1

  return pd.DataFrame(encoded, columns=train_cols)

In [None]:
# Encode the test set against the training dummy columns so both frames share the same columns.
X_test_transformed = OHE_test(X_test_OHE, X_train_transformed.columns)

#### Cleaning final df to be imputed

In [None]:
# Give the encoded test rows a clean 0..n-1 index.
X_test_transformed = X_test_transformed.reset_index(drop = True)

In [None]:
## Replace the dummies of a missing feature with NaN so they can be imputed
def replace_nan_columns(df):
  """Blank out the one-hot group of every feature flagged as missing.

  For each indicator column named '<feature>_nan', every row where the
  indicator equals 1 gets NaN in all '<feature>_*' columns (the indicator
  included). Indicator values are read before anything is written.

  The frame is modified in place and also returned, as before.

  Args:
    df: one-hot encoded frame containing '<feature>_nan' indicator columns.

  Returns:
    The same DataFrame object, with NaN written into the affected cells.
  """
  suffix = '_nan'
  nan_cols = [c for c in df.columns if isinstance(c, str) and c.endswith(suffix)]

  # Snapshot which rows are flagged before any column is overwritten.
  flagged_rows = {c: (df[c] == 1) for c in nan_cols}

  for nan_col, mask in flagged_rows.items():
    if not mask.any():
      continue
    # Match on the '<feature>_' prefix; a plain substring test could also hit
    # columns of other features whose names merely contain the feature name.
    prefix = nan_col[:-len(suffix)] + '_'
    group = [c for c in df.columns if isinstance(c, str) and c.startswith(prefix)]
    # Integer/bool dummies cannot hold NaN, so widen the group to float first.
    df[group] = df[group].astype(float)
    df.loc[mask, group] = np.nan

  return df


In [None]:
# replace_nan_columns returns the frame it was given, so X_train_final /
# X_test_final refer to the same objects as X_train_transformed / X_test_transformed.
X_train_final = replace_nan_columns(X_train_transformed)
X_test_final = replace_nan_columns(X_test_transformed)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582


## KNN Imputer

In [None]:
## optimal k value: square root of n (119)
## NOTE(review): `k` is not used by the imputer cell in this notebook, which is
## built with n_neighbors=1. Also, with the 11516 training rows shown in the
## outputs this evaluates to 107, not 119 - confirm which figure is intended.
import math

k = int(math.sqrt(len(X_train_final)))

In [None]:
from sklearn import preprocessing
from sklearn.impute import KNNImputer
import numpy as np

In [None]:
# With a single neighbour each missing value is copied from the nearest row.
# NOTE(review): presumably chosen so imputed dummies stay exactly 0/1 - confirm.
imputer = KNNImputer(n_neighbors=1)
# Fit on the training set only.
X_train_imputed = imputer.fit_transform(X_train_final)

In [None]:
# fit_transform returned a bare array; wrap it in a DataFrame with the original column names.
X_train_imputed = pd.DataFrame(X_train_imputed, columns = X_train_final.columns)

In [None]:
# Apply the imputer fitted on the training set to the test set (no refit).
X_test_imputed = imputer.transform(X_test_final)
# The column names are taken from the training frame.
X_test_imputed = pd.DataFrame(X_test_imputed, columns = X_train_final.columns)

## Adding back variables excluded from imputation

In [None]:
# Assignment aligns on the index; X_train was reset to 0..n-1 earlier.
X_train_imputed['num_words_company_profile'] = X_train['num_words_company_profile']

In [None]:
# Assignment aligns on the index; X_test was reset to 0..n-1 earlier.
X_test_imputed['num_words_company_profile'] = X_test['num_words_company_profile']

In [None]:
# Append the averaged embeddings; join() aligns on the row index.
# The embedding columns carry no names of their own (they appear as 0..99 in the output below).
X_train_imputed = X_train_imputed.join(pd.DataFrame(X_train_vect_avg))
X_test_imputed = X_test_imputed.join(pd.DataFrame(X_test_vect_avg))

In [None]:
# Preview the combined feature frame (dummies, word count, embeddings).
X_train_imputed.head()

Unnamed: 0,location_AE,location_AM,location_AR,location_AT,location_AU,location_BD,location_BE,location_BG,location_BH,location_BR,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.030737,-0.012534,-0.06404,0.065706,-0.349803,0.033919,-0.09198,-0.321193,0.615262,0.263603
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.052558,-0.086892,-0.148021,0.07955,-0.355762,0.014088,-0.189971,-0.369666,0.591911,0.176678
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.053322,-0.013034,-0.110459,0.056845,-0.270515,-0.003584,-0.133993,-0.313392,0.569932,0.148196
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.043114,0.014371,-0.007252,-0.01004,-0.178285,0.080394,-0.03284,-0.250786,0.582996,0.198129
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.005971,-0.05107,-0.119494,0.055244,-0.290366,0.020901,-0.173409,-0.269942,0.668699,0.136355


## Removing columns with column name that includes nan

In [None]:
# Drop only the '<feature>_nan' indicator columns.
# The previous filter(regex='nan') matched "nan" ANYWHERE in a column name and
# therefore also removed real dummies such as 'industry_Finance and Economic'
# ("Finance" contains "nan"). Column counts will be higher than in the
# previously recorded outputs.
nan_indicator_train = [c for c in X_train_imputed.columns if str(c).endswith('_nan')]
nan_indicator_test = [c for c in X_test_imputed.columns if str(c).endswith('_nan')]
X_train_imputed_final = X_train_imputed.drop(columns=nan_indicator_train)
X_test_imputed_final = X_test_imputed.drop(columns=nan_indicator_test)

In [None]:
# Display the final training feature frame.
X_train_imputed_final

Unnamed: 0,location_AE,location_AM,location_AR,location_AT,location_AU,location_BD,location_BE,location_BG,location_BH,location_BR,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.030737,-0.012534,-0.064040,0.065706,-0.349803,0.033919,-0.091980,-0.321193,0.615262,0.263603
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.052558,-0.086892,-0.148021,0.079550,-0.355762,0.014088,-0.189971,-0.369666,0.591911,0.176678
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.053322,-0.013034,-0.110459,0.056845,-0.270515,-0.003584,-0.133993,-0.313392,0.569932,0.148196
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.043114,0.014371,-0.007252,-0.010040,-0.178285,0.080394,-0.032840,-0.250786,0.582996,0.198129
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.005971,-0.051070,-0.119494,0.055244,-0.290366,0.020901,-0.173409,-0.269942,0.668699,0.136355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036034,-0.006941,-0.062127,0.000251,-0.388816,0.067166,-0.067062,-0.322177,0.560329,0.285481
11512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.036083,-0.124164,0.001444,0.064195,-0.468939,0.036838,-0.105761,-0.326639,0.624928,0.176993
11513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.011014,-0.014900,-0.113544,0.111955,-0.312688,0.018104,-0.243176,-0.329410,0.605044,0.191440
11514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.069799,-0.006289,-0.027754,-0.037539,-0.362873,-0.017429,-0.167281,-0.357576,0.564600,0.282700


In [None]:
# Train and test feature frames should have the same number of columns.
X_train_imputed_final.shape

(11516, 263)

In [None]:
# Column count should equal that of the training frame.
X_test_imputed_final.shape

(2880, 263)

In [None]:
# Number of training labels; should equal the number of training rows.
y_train.shape

(11516,)

In [None]:
# Number of test labels; should equal the number of test rows.
y_test.shape

(2880,)

In [None]:
# Spot check: list the distinct values of one imputed dummy column.
set(X_train_imputed['function_Supply Chain'])

{0.0, 1.0}

## Save to Csv

In [None]:
# Write the training features without the row index.
X_train_imputed_final.to_csv('../datasets/glove/train_data_imputed_FINAL.csv', index=False)
# Colab-only browser download, left disabled:
# files.download('train_data_imputed_FINAL.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write the test features without the row index.
X_test_imputed_final.to_csv('../datasets/glove/test_data_imputed_FINAL.csv', index=False)
# Colab-only browser download, left disabled:
# files.download('test_data_imputed_FINAL.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write the test labels without the index; rows stay in the same order as the test features.
y_test.to_csv('../datasets/glove/y_test_FINAL.csv', index=False)
# Colab-only browser download, left disabled:
# files.download('y_test_FINAL.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Write the training labels without the index; rows stay in the same order as the training features.
y_train.to_csv('../datasets/glove/y_train_FINAL.csv', index=False)
# Colab-only browser download, left disabled:
# files.download('y_train_FINAL.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>