
Date: Sunday, 13 May 2018 



# Task1. Auditing and Cleansing the Job dataset

## 1. Import Libraries

In [None]:
import pandas as pd

## 2. Read a file

In [None]:
# Load the raw job postings into a DataFrame and preview the first rows
data1 = pd.read_csv("dataset1_with_error.csv")
data1.head()

###### Describe the data

In [None]:
# Summary statistics restricted to the object-dtype ('O') columns
data1.describe(include = ['O'])

###### Length of original data

In [None]:
# Remember the row count before cleansing so it can be compared at the end
original_length = data1.shape[0]

## 3. Cleansing the data

### Locations

In [None]:
# Frequency of each location, listed alphabetically, to make misspellings easy to spot
(
    data1['Location']
    .value_counts()
    .sort_index()
    .head()
)

###### Cleaning the Locations by correcting their spellings

In [None]:
# Map each misspelt location onto its correct spelling; other values are untouched
data1['Location'] = data1['Location'].replace({
    "Surey": "Surrey",
    "Oxfords": "Oxford",
    "Nottinham": "Nottingham",
    "Reeding": "Reading",
    "Leads": "Leeds",
})

###### - Lexical Errors removed

### Salary per annum

In [None]:
# List the distinct raw salary strings to see which formats occur
data1['Salary per annum'].unique()

###### Changing Salary values from '10K' format to '10000' format

In [None]:
# Expand the thousands suffix: every 'K' in a salary string becomes '000'
data1['Salary per annum'] = data1['Salary per annum'].str.replace('K', '000')

###### Changing each Salary range to its mean to maintain consistency throughout the column

In [None]:
# Replace every salary (single value or 'low-high' range) by one number.
# Split on the first '-' into bounds held in a separate frame rather than a
# temporary column; this also works when no salary is a range, where the split
# yields a single column.
salary_bounds = data1['Salary per annum'].str.split('-', n=1, expand=True)
# Convert each bound column to numbers (raises if a bound is not numeric)
salary_bounds = salary_bounds.apply(pd.to_numeric)
# The row-wise mean skips missing bounds: a range becomes its midpoint,
# a single value stays as it is
data1['Salary per annum'] = salary_bounds.mean(axis=1)
#data1.head()

###### - Inconsistency and Irregularity removed by making corrections required

### Contract Type

###### Changing 'not available' Contract Type to 'non-specified' as per the description

In [None]:
# Labels present before the correction
print(data1['ContractType'].unique())
# Rename the placeholder 'not available' to 'non-specified'
data1['ContractType'] = data1['ContractType'].replace('not available', 'non-specified')
# Labels present after the correction
print(data1['ContractType'].unique())

### Contract Time

###### Changing 'not available' Contract Time to 'non-specified' as per the description

In [None]:
# Labels present before the correction
print(data1['ContractTime'].unique())
# Rename the placeholder 'not available' to 'non-specified'
data1['ContractTime'] = data1['ContractTime'].replace('not available', 'non-specified')
# Labels present after the correction
print(data1['ContractTime'].unique())

### Open Date

###### Splitting Open Date into multiple columns for checking its validity (integrity)

In [None]:
# Cut the leading eight characters of OpenDate into year / month / day pieces
data1['OpenYear'] = data1['OpenDate'].str[0:4]
data1['OpenMonth'] = data1['OpenDate'].str[4:6]
data1['OpenDay'] = data1['OpenDate'].str[6:8]

###### Checking if Month of Open Date is greater than 12

In [None]:
# Rows whose open month is not a valid month number (greater than 12)
data1.loc[pd.to_numeric(data1['OpenMonth']).gt(12)]

###### Swapping Month and Day if Month is greater than 12

In [None]:
# Rows whose month exceeds 12 have month and day transposed: swap the two fields
month_too_large = pd.to_numeric(data1['OpenMonth']) > 12
swapped_fields = data1.loc[month_too_large, ['OpenDay', 'OpenMonth']].values
data1.loc[month_too_large, ['OpenMonth', 'OpenDay']] = swapped_fields
# Re-run the check; any row still listed has an invalid month after the swap
data1[pd.to_numeric(data1['OpenMonth']) > 12]

###### - Violations of the Integrity constraint checked and removed by making corrections

###### Merging columns back to its original form of Open Date

In [None]:
# Rebuild OpenDate from the (possibly corrected) pieces plus everything after the eighth character
open_date_tail = data1['OpenDate'].str[8:]
data1['OpenDate'] = data1['OpenYear'] + data1['OpenMonth'] + data1['OpenDay'] + open_date_tail

### Close Date

###### Splitting Close Date into multiple columns for checking its validity (integrity)

In [None]:
# Cut the leading eight characters of CloseDate into year / month / day pieces
data1['CloseYear'] = data1['CloseDate'].str[0:4]
data1['CloseMonth'] = data1['CloseDate'].str[4:6]
data1['CloseDay'] = data1['CloseDate'].str[6:8]

###### Checking if Month of Close Date is greater than 12

In [None]:
# Rows whose close month is not a valid month number (greater than 12)
data1.loc[pd.to_numeric(data1['CloseMonth']).gt(12)]

###### Merging columns back to its original form of Close Date

In [None]:
# Rebuild CloseDate from its pieces plus everything after the eighth character
close_date_tail = data1['CloseDate'].str[8:]
data1['CloseDate'] = data1['CloseYear'] + data1['CloseMonth'] + data1['CloseDay'] + close_date_tail

In [None]:
# List postings that open and close on the same calendar day; only for those
# rows would the time-of-day part decide which timestamp comes first.
# The day pieces must be compared with each other ('OpenDay' vs 'CloseDay');
# comparing 'OpenDay' with the full 'CloseDate' string can never match.
data1[
    (data1['OpenYear'] == data1['CloseYear'])
    & (data1['OpenMonth'] == data1['CloseMonth'])
    & (data1['OpenDay'] == data1['CloseDay'])
]
# An empty result means no posting opens and closes on the same date, so the time need not be checked

###### Dropping temporary columns

In [None]:
# The year / month / day helper columns are no longer needed once the dates are rebuilt
data1 = data1.drop(
    columns=[
        'OpenYear', 'OpenMonth', 'OpenDay',
        'CloseYear', 'CloseMonth', 'CloseDay',
    ]
)

In [None]:
#data1.head()

### Open and Close Date

###### Checking if open date is greater than close date and swapping if the condition is true

In [None]:
# A posting cannot open after it closes: where OpenDate > CloseDate, exchange the two.
# The dates are compared as strings; NOTE(review): this assumes every value uses the
# same fixed-width year-month-day layout implied by the slicing above - confirm.
# Vectorised swap instead of a row loop with DataFrame.set_value (deprecated and
# later removed from pandas).
opens_after_close = data1['OpenDate'] > data1['CloseDate']
data1.loc[opens_after_close, ['OpenDate', 'CloseDate']] = (
    data1.loc[opens_after_close, ['CloseDate', 'OpenDate']].values
)

# Verification: no row should remain with OpenDate later than CloseDate
data1[data1['OpenDate'] > data1['CloseDate']]

###### - Violations of the Integrity constraint checked and removed by making corrections

### Source Name

In [None]:
# Most frequent source names
(
    data1['SourceName']
    .value_counts()
    .head()
)

###### - Assuming a Source Name cannot be an email address

###### SourceName = 'admin@caterer.com' 

In [None]:
# Show the record(s) whose source name is the e-mail address 'admin@caterer.com'
data1.loc[data1['SourceName'].eq('admin@caterer.com')]

###### Observation:
The row with source name 'admin@caterer.com' contains 'Blu Digital' as its company name

###### Checking other records with the same company name

In [None]:
# All postings from the company 'Blu Digital'
data1.loc[data1['Company'].eq('Blu Digital')]

###### Replacing source name 'admin@caterer.com' with the source name in other records, consisting of 'Blu Digital' as a company name

In [None]:
# Repair the invalid source name 'admin@caterer.com' on 'Blu Digital' postings by
# borrowing the source name of another posting from the same company.
# Vectorised: no DataFrame.set_value (removed from pandas) and no dependence on a
# valid row having been visited before the invalid one.
from_company = data1['Company'] == 'Blu Digital'
is_invalid = from_company & (data1['SourceName'] == 'admin@caterer.com')
is_donor = from_company & (data1['SourceName'] != 'admin@caterer.com')
# Keep donor values only, then carry them forward (nearest earlier donor wins)
# and backward (covers an invalid row located before every donor)
replacement = data1['SourceName'].where(is_donor).ffill().bfill()
# If the company has no donor at all the invalid value is left unchanged
data1['SourceName'] = data1['SourceName'].mask(is_invalid & replacement.notna(), replacement)
data1[data1['Company'] == 'Blu Digital']
        

###### - Assuming Source Name should have .com at the end

###### SourceName = 'jobcareer'

In [None]:
# Show the record(s) whose source name is 'jobcareer'
data1.loc[data1['SourceName'].eq('jobcareer')]

###### Observation:
The row with source name 'jobcareer' contains 'Brightwater Group' as its company name

###### Checking other records with the same company name

In [None]:
# All postings from the company 'Brightwater Group'
data1.loc[data1['Company'].eq('Brightwater Group')]

###### Replacing source name 'jobcareer' with the source name in other records, consisting of 'Brightwater Group' as a company name

In [None]:
# Repair the invalid source name 'jobcareer' on 'Brightwater Group' postings by
# borrowing the source name of another posting from the same company.
# Vectorised: no DataFrame.set_value (removed from pandas) and no dependence on a
# valid row having been visited before the invalid one.
from_company = data1['Company'] == 'Brightwater Group'
is_invalid = from_company & (data1['SourceName'] == 'jobcareer')
is_donor = from_company & (data1['SourceName'] != 'jobcareer')
# Keep donor values only, then carry them forward (nearest earlier donor wins)
# and backward (covers an invalid row located before every donor)
replacement = data1['SourceName'].where(is_donor).ffill().bfill()
# If the company has no donor at all the invalid value is left unchanged
data1['SourceName'] = data1['SourceName'].mask(is_invalid & replacement.notna(), replacement)
data1[data1['Company'] == 'Brightwater Group']
        

###### SourceName = 'monashstudent'

In [None]:
# Show the record(s) whose source name is 'monashstudent'
data1.loc[data1['SourceName'].eq('monashstudent')]

###### Observation:
The row with source name 'monashstudent' contains 'The A24 Group' as its company name

###### Checking other records with the same company name

In [None]:
# All postings from the company 'The A24 Group'
data1.loc[data1['Company'].eq('The A24 Group')]

###### Replacing source name 'monashstudent' with the source name in other records, consisting of 'The A24 Group' as a company name

In [None]:
# Repair the invalid source name 'monashstudent' on 'The A24 Group' postings by
# borrowing the source name of another posting from the same company.
# Vectorised: no DataFrame.set_value (removed from pandas) and no dependence on a
# valid row having been visited before the invalid one.
from_company = data1['Company'] == 'The A24 Group'
is_invalid = from_company & (data1['SourceName'] == 'monashstudent')
is_donor = from_company & (data1['SourceName'] != 'monashstudent')
# Keep donor values only, then carry them forward (nearest earlier donor wins)
# and backward (covers an invalid row located before every donor)
replacement = data1['SourceName'].where(is_donor).ffill().bfill()
# If the company has no donor at all the invalid value is left unchanged
fixable = is_invalid & replacement.notna()
# Report the source name written into each repaired row
for borrowed_source in replacement[fixable]:
    print(borrowed_source)
data1['SourceName'] = data1['SourceName'].mask(fixable, replacement)
data1[data1['Company'] == 'The A24 Group']
        

###### - Inconsistency is removed

### Company

In [None]:
# Count how often each company name occurs
company1 = data1['Company'].value_counts()
# Show the first names in index (alphabetical) order
company1.sort_index().head()

In [None]:
# Show the record(s) whose company is just the placeholder '.'
data1.loc[data1['Company'].eq('.')]

###### Observation:
The row with Company '.' contains 'PR, Advertising & Marketing Jobs' as its Category, 'UK' as its Location and 'jobstoday.co.uk' as its Source Name

###### Checking other records with the same specifications

In [None]:
# Postings that share the category, location and source name of the '.' record
data1.loc[
    data1['Category'].eq('PR, Advertising & Marketing Jobs')
    & data1['Location'].eq('UK')
    & data1['SourceName'].eq('jobstoday.co.uk')
]

###### Replacing company '.' with the company in other records, consisting of 'PR, Advertising & Marketing Jobs' as a Category, 'UK' as a location 
###### and 'jobstoday.co.uk' as a Source Name

In [None]:
# Replace the placeholder company '.' with the company of a posting that has the
# same category, location and source name.
# Vectorised: no DataFrame.set_value (removed from pandas) and no dependence on a
# matching row having been visited before the placeholder row.
is_placeholder = data1['Company'] == '.'
is_donor = (
    (data1['Company'] != '.')
    & (data1['Category'] == 'PR, Advertising & Marketing Jobs')
    & (data1['Location'] == 'UK')
    & (data1['SourceName'] == 'jobstoday.co.uk')
)
# Keep donor companies only, then carry them forward (nearest earlier donor wins)
# and backward (covers a placeholder located before every donor)
replacement = data1['Company'].where(is_donor).ffill().bfill()
# Without any donor the placeholder is left unchanged
data1['Company'] = data1['Company'].mask(is_placeholder & replacement.notna(), replacement)
#data1[data1['Company'] == '.']

###### Making basic changes in the whole column

In [None]:
# Normalise company names so that spelling variants of one company collapse
# into a single value.
data1['Company'] = (
    data1['Company']
    # Compare names case-insensitively by upper-casing everything
    .str.upper()
    # Collapse any run of two or more spaces into a single space
    .str.replace(r' {2,}', ' ', regex=True)
    # Trim surrounding whitespace, then stray punctuation at either end,
    # then whitespace exposed by removing that punctuation
    .str.strip()
    .str.lstrip('.,;-:')
    .str.rstrip(',;-:')
    .str.strip()
    # Unify the legal suffix: LIMITED -> LTD (so 'LIMITED.' becomes 'LTD.')
    .str.replace('LIMITED', 'LTD', regex=False)
    # ... and 'LTD.' -> 'LTD'. The match is literal on purpose: as a regex the
    # '.' would match any character and swallow e.g. the space after 'LTD'.
    .str.replace('LTD.', 'LTD', regex=False)
)
# Frequency of each cleaned company name, in alphabetical order
company1 = data1['Company'].value_counts().sort_index()
#company1

In [None]:
# First cleaned company names in alphabetical order, with their frequencies
(
    data1['Company']
    .value_counts()
    .sort_index()
    .head()
)

### Title

In [None]:
#data1['Title'].value_counts().sort_index().head()

###### Making basic changes in the whole column

In [None]:
# Tidy job titles: trim whitespace, drop stray punctuation at either end,
# then trim whitespace exposed by removing that punctuation
data1['Title'] = (
    data1['Title']
    .str.strip()
    .str.lstrip('.,;-:')
    .str.rstrip(',;-:')
    .str.strip()
)

###### Length of New Data

In [None]:
# Row count after cleansing, reported next to the count taken before it
new_length = data1.shape[0]
print("Original Length:", original_length, "\t", "New Length:", new_length)

## 4. Converting dataframe into CSV file format

In [None]:
# Write the cleansed data to a UTF-8 encoded CSV file in the working directory.
# NOTE(review): `index` is left at its default, so the DataFrame index is written
# as an extra unnamed first column - confirm this is wanted in the solution file.
data1.to_csv('./dataset1_solution.csv',encoding='utf-8')

## Summary
* Lexical errors : Location
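* Irregularities : Salary per annum
* Integrity constraint violations : Open Date, Close Date
* Inconsistencies : Source Name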

## References
* Tutorial Week 4