In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('Train_rev1.csv')
train.head()

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,"Dorking, Surrey, Surrey",Dorking,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,"Glasgow, Scotland, Scotland",Glasgow,,permanent,Gregory Martin International,Engineering Jobs,25000 - 35000/annum 25-35K,30000,cv-library.co.uk
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,"Hampshire, South East, South East",Hampshire,,permanent,Gregory Martin International,Engineering Jobs,20000 - 40000/annum 20-40K,30000,cv-library.co.uk
3,12613049,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,"Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,25000 - 30000/annum 25K-30K negotiable,27500,cv-library.co.uk
4,12613647,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...","Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk


By roughly analyzing this dataset, we cansee that job title and job description is too detail for us, also id, contract type and source name do not seems important in predicting salary, therefore we'll take LocationNormalized, Company, Category and SalaryNormalized for our model.

In [3]:
train = train[['LocationNormalized', 'Company', 'Category', 'SalaryNormalized']]
train.head()

Unnamed: 0,LocationNormalized,Company,Category,SalaryNormalized
0,Dorking,Gregory Martin International,Engineering Jobs,25000
1,Glasgow,Gregory Martin International,Engineering Jobs,30000
2,Hampshire,Gregory Martin International,Engineering Jobs,30000
3,Surrey,Gregory Martin International,Engineering Jobs,27500
4,Surrey,Gregory Martin International,Engineering Jobs,25000


Find whether there are null values in the dataset and drop these rows

In [4]:
train.isnull().sum()

LocationNormalized        0
Company               32430
Category                  0
SalaryNormalized          0
dtype: int64

In [5]:
train = train[pd.notnull(train['Company'])]
train.isnull().sum()

LocationNormalized    0
Company               0
Category              0
SalaryNormalized      0
dtype: int64

Do some exploratory analysis, see whether category of job will affect the salary or not

In [6]:
train['SalaryNormalized'].groupby(train['Category']).mean().sort_values(ascending= False)

Category
IT Jobs                             43231.369021
Legal Jobs                          43005.212452
Energy, Oil & Gas Jobs              41325.419586
Accounting & Finance Jobs           38805.478691
Consultancy Jobs                    37317.240734
Trade & Construction Jobs           37235.669006
Engineering Jobs                    35832.096816
Other/General Jobs                  35571.808676
Scientific & QA Jobs                35198.368372
PR, Advertising & Marketing Jobs    34341.789547
Creative & Design Jobs              33243.266355
Healthcare & Nursing Jobs           33035.927875
Retail Jobs                         32855.222168
Social work Jobs                    32394.637298
Property Jobs                       31948.148526
HR & Recruitment Jobs               31926.410417
Sales Jobs                          30510.096705
Charity & Voluntary Jobs            28693.309235
Graduate Jobs                       28506.341423
Teaching Jobs                       27881.538565
Manufacturi

Count the data of each company and seperate them into 3 categories: Large, Medium and Small.

In [7]:
train['Company'].groupby(train['Company']).count().sort_values(ascending=False)

Company
UKStaffsearch                         4997
CVbrowser                             2977
London4Jobs                           2345
Hays                                  1784
JAM Recruitment Ltd                   1122
Office Angels                          961
Jobsite Jobs                           932
Perfect Placement                      865
ARRAY                                  847
JOBG8                                  841
Matchtech Group plc.                   834
Penguin Recruitment                    752
Randstad                               748
Adecco                                 701
Michael Page Finance                   634
Adecco Group                           592
BMS Sales Specialists LLP              559
COREcruitment International            551
Page Personnel Finance                 536
Capita Resourcing                      495
Michael Page Sales                     486
Matchtech                              474
Rise Technical Recruitment Ltd         460
Exp

Split the location tree, replace the orginal location with the '2' column in the location dataframe.

In [8]:
location = pd.read_csv('Location_Tree.csv', header = None, names = ['location'])
location.head()

Unnamed: 0,location
0,UK~London~East London~Mile End
1,UK~London~East London~Shadwell
2,UK~London~East London~Spitalfields
3,UK~London~East London~Stepney
4,UK~London~East London~Wapping


In [9]:
location['1'], location['2'], location['3'], location['4'] = location['location'].str.split('~', 3).str
location

Unnamed: 0,location,1,2,3,4
0,UK~London~East London~Mile End,UK,London,East London,Mile End
1,UK~London~East London~Shadwell,UK,London,East London,Shadwell
2,UK~London~East London~Spitalfields,UK,London,East London,Spitalfields
3,UK~London~East London~Stepney,UK,London,East London,Stepney
4,UK~London~East London~Wapping,UK,London,East London,Wapping
5,UK~London~East London~Whitechapel,UK,London,East London,Whitechapel
6,UK~London~East London~Bethnal Green,UK,London,East London,Bethnal Green
7,UK~London~East London~Cambridge Heath,UK,London,East London,Cambridge Heath
8,UK~London~East London~Haggerston,UK,London,East London,Haggerston
9,UK~London~East London~Shoreditch,UK,London,East London,Shoreditch


In [10]:
location2 = location['2'].groupby(location['2']).count().sort_values(ascending= False)
len(location2) 

30

In [11]:
for loc in train["LocationNormalized"].unique():
    query_index = location[location["4"] == loc].index
    if len(query_index) == 0:
        query_index = location[location["3"] == loc].index
    if query_index.shape[0] > 0:
        train["LocationNormalized"] = train["LocationNormalized"].replace([loc], location.iloc[query_index[0]]["2"])

In [21]:
train["LocationNormalized"]

0               South East England
1                         Scotland
2               South East England
3               South East England
4               South East England
5               South East England
6                               UK
7               North West England
8         Yorkshire And The Humber
9                         Scotland
10                   East Midlands
11              South East England
12                            Avon
13                            Avon
14                   East Midlands
15              North East England
16              South East England
17                              UK
18                            Avon
19                   West Midlands
20                              UK
21                              UK
22              South East England
23              South East England
24              South East England
25              North East England
26              South East England
27              North East England
28              Nort

In [12]:
train['Company'].groupby(train['Company']).count().sort_values(ascending=False)

Company
UKStaffsearch                         4997
CVbrowser                             2977
London4Jobs                           2345
Hays                                  1784
JAM Recruitment Ltd                   1122
Office Angels                          961
Jobsite Jobs                           932
Perfect Placement                      865
ARRAY                                  847
JOBG8                                  841
Matchtech Group plc.                   834
Penguin Recruitment                    752
Randstad                               748
Adecco                                 701
Michael Page Finance                   634
Adecco Group                           592
BMS Sales Specialists LLP              559
COREcruitment International            551
Page Personnel Finance                 536
Capita Resourcing                      495
Michael Page Sales                     486
Matchtech                              474
Rise Technical Recruitment Ltd         460
Exp

In [13]:
companies_counts = train["Company"].value_counts()
companies_counts_dict = {}
for k,v in companies_counts.items():
    if v >0 and v < 500:
        companies_counts_dict[k] = "Small"
    elif v >=500 and v<1000:
        companies_counts_dict[k] = "Medium"
    else:
        companies_counts_dict[k] = "Large"
train.dropna(inplace = True)

train["Company"] = train["Company"].apply(lambda k: companies_counts_dict[k])
print(train["Company"].head())

0    Small
1    Small
2    Small
3    Small
4    Small
Name: Company, dtype: object


In [14]:
train_copy = train.copy()

In [15]:
train_copy = pd.get_dummies(train_copy, columns=["Company"])
train_copy.head()

Unnamed: 0,LocationNormalized,Category,SalaryNormalized,Company_Large,Company_Medium,Company_Small
0,South East England,Engineering Jobs,25000,0,0,1
1,Scotland,Engineering Jobs,30000,0,0,1
2,South East England,Engineering Jobs,30000,0,0,1
3,South East England,Engineering Jobs,27500,0,0,1
4,South East England,Engineering Jobs,25000,0,0,1


Turn Company and Category into dummy variables

In [16]:
train_copy = pd.get_dummies(train_copy, columns=["Category"])
train_copy.head()

Unnamed: 0,LocationNormalized,SalaryNormalized,Company_Large,Company_Medium,Company_Small,Category_Accounting & Finance Jobs,Category_Admin Jobs,Category_Charity & Voluntary Jobs,Category_Consultancy Jobs,Category_Creative & Design Jobs,...,"Category_PR, Advertising & Marketing Jobs",Category_Part time Jobs,Category_Property Jobs,Category_Retail Jobs,Category_Sales Jobs,Category_Scientific & QA Jobs,Category_Social work Jobs,Category_Teaching Jobs,Category_Trade & Construction Jobs,Category_Travel Jobs
0,South East England,25000,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Scotland,30000,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,South East England,30000,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,South East England,27500,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,South East England,25000,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Fit the data into a multiple linear regression model and use 10 fold cross-validation to score it.

In [17]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
X = train_copy.iloc[:, 2:]
Y = train_copy.iloc[:, 1]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)



In [18]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_std, Y_train)
prepro = model.predict(X_test_std)
model.score(X_test_std,Y_test)

  linalg.lstsq(X, y)


0.13649190737834727

In [19]:
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(model, X_train, Y_train, cv=10)
print('accuracy of each fold is: ')
print(scores)
print('cv accuracy is:', scores.mean())

accuracy of each fold is: 
[ 1.35223447e-01 -1.06645344e+15  1.33190906e-01  1.32650066e-01
  1.47178721e-01  1.45155874e-01  1.32477120e-01  1.30504346e-01
  1.37716491e-01  1.34001001e-01]
cv accuracy is: -106645343927035.84
