In [1]:
import pandas as pd 
import numpy as np

# combining the 2 datasets in preparation of ft engineering

In [2]:
def get_combined_data():
    # reading train data
    train = pd.read_csv("train_v2.csv", sep=';',encoding='latin')
    
    # reading test data
    test = pd.read_csv("test_v2.csv", sep=';',encoding='latin')

    # extracting and then removing the targets from the training data 
    targets = train.SalaryNormalized
    train.drop('SalaryNormalized',1,inplace=True)
    

    # merging train data and test data for future feature engineering
    combined = train.append(test)
    combined.reset_index(inplace=True)
    combined.drop('index',inplace=True,axis=1)
    
    return combined

In [3]:
combined = get_combined_data()

In [4]:
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 50)

In [5]:
combined

Unnamed: 0,Id,Title,FullDescription,LocationNormalized,ContractType,ContractTime,Company,Category,SourceName
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,Dorking,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,Glasgow,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,Hampshire,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
3,12613049,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,Surrey,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
4,12613647,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...",Surrey,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
5,13179816,Engineering Systems Analyst Water Industry,Engineering Systems Analyst Water Industry Loc...,Dorking,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
6,14131336,Senior Subsea Pipeline Integrity Engineer,A globally renowned engineering and training c...,UK,,permanent,Indigo 21 Ltd,Engineering Jobs,cv-library.co.uk
7,14663196,RECRUITMENT CONSULTANT INDUSTRIAL / COMMERCIA...,THIS IS A LIVE VACANCY NOT A GENERIC ADVERTISE...,Manchester,,permanent,Code Blue Recruitment,HR & Recruitment Jobs,cv-library.co.uk
8,14663197,RECRUITMENT CONSULTANT CONSTRUCTION / TECHNIC...,This is an exceptional opportunity to join a c...,Leeds,,permanent,Code Blue Recruitment,HR & Recruitment Jobs,cv-library.co.uk
9,15395797,Subsea Cables Engineer,A subsea engineering company is looking for an...,Aberdeen,,permanent,Indigo 21 Ltd,Engineering Jobs,cv-library.co.uk


In [6]:
#we indeed have the 10000 obs from train + 5000 from test. 
#We now have 9 coloumns because salarynormalized has been dropped
combined.shape

(15000, 9)

### exploring the data and finding the adapted ft engineering to do

In [7]:
#looking at the modes of the variable Contract Timme. There is a lot of missing values
pd.Series.value_counts(combined['ContractTime'], dropna=False) 

permanent    7642
NaN          6075
contract     1283
Name: ContractTime, dtype: int64

In [8]:
#looking at the modes of the variable Title
pd.Series.value_counts(combined['Title'], dropna=False)

Staff Nurse                                                 78
Home Manager                                                64
Project Manager                                             46
Business Development Manager                                43
Dental Nurse                                                39
Deputy Manager                                              38
Sales Manager                                               25
Mechanical Design Engineer                                  24
Design Engineer                                             24
Sales Executive                                             23
                                                            ..
Supervising Social Worker Powys / Ceredigion                 1
Procurement Brokerage Buyer                                  1
Contract SharePoint Developer                                1
HGV Mechanic / LGV Commercial Vehicle Technician             1
Senior / Lead Mobile Applications Developer Egham Londo

In [9]:
#looking at the modes of the variable Company
pd.Series.value_counts(combined['Company'], dropna=False)

NaN                                     4340
JOBG8                                    477
Jobsite Jobs                             282
Fresh Partnership                        262
ARRAY                                    222
UKStaffsearch                            197
Chef Results                             146
Clear Selection                          132
Triumph Consultants                       99
JHR                                       75
                                        ... 
5Plus Recruitment Ltd                      1
Change Scotland                            1
Bvocal                                     1
Macmillan Davies Hodes.                    1
East Cambridgeshire District Council       1
Huntswood CTC Ltd                          1
Vertu Motors Plc                           1
HR GO plc                                  1
Mofilm                                     1
Russell Taylor  Ltd                        1
Name: Company, dtype: int64

In [10]:
#finding the null value of the variable Title in prepartion for tfidf.
combined.loc[combined.Title.isnull(),]

Unnamed: 0,Id,Title,FullDescription,LocationNormalized,ContractType,ContractTime,Company,Category,SourceName
1588,48271669,,Quality Improvement Manager North West England...,Liverpool,full_time,,,Healthcare & Nursing Jobs,careworx.co.uk


In [11]:
#replacing the only missing value in Title with the 3 first words in FullDescription
combined.loc[combined.Title.isnull(),"Title"]="quality improvement manager"

In [12]:
combined

Unnamed: 0,Id,Title,FullDescription,LocationNormalized,ContractType,ContractTime,Company,Category,SourceName
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,Dorking,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,Glasgow,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,Hampshire,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
3,12613049,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,Surrey,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
4,12613647,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...",Surrey,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
5,13179816,Engineering Systems Analyst Water Industry,Engineering Systems Analyst Water Industry Loc...,Dorking,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk
6,14131336,Senior Subsea Pipeline Integrity Engineer,A globally renowned engineering and training c...,UK,,permanent,Indigo 21 Ltd,Engineering Jobs,cv-library.co.uk
7,14663196,RECRUITMENT CONSULTANT INDUSTRIAL / COMMERCIA...,THIS IS A LIVE VACANCY NOT A GENERIC ADVERTISE...,Manchester,,permanent,Code Blue Recruitment,HR & Recruitment Jobs,cv-library.co.uk
8,14663197,RECRUITMENT CONSULTANT CONSTRUCTION / TECHNIC...,This is an exceptional opportunity to join a c...,Leeds,,permanent,Code Blue Recruitment,HR & Recruitment Jobs,cv-library.co.uk
9,15395797,Subsea Cables Engineer,A subsea engineering company is looking for an...,Aberdeen,,permanent,Indigo 21 Ltd,Engineering Jobs,cv-library.co.uk


In [13]:
#looking at the modes of the variable contract type. There is a lot of missing values
pd.Series.value_counts(combined['ContractType'], dropna=False) 

NaN          10636
full_time     3736
part_time      628
Name: ContractType, dtype: int64

In [14]:
pd.Series.value_counts(combined['LocationNormalized'], dropna=False) 
#looking at the frequencies of each town

UK                   2515
London               1730
Manchester            363
The City              335
Leeds                 298
Belfast               240
South East London     215
Birmingham            202
Surrey                182
Bristol               171
                     ... 
Ellesmere               1
St. Austell             1
Wallsend                1
Watton                  1
Wrekin                  1
Saltash                 1
Carterton               1
Gorton                  1
Lanark                  1
Moor Row                1
Name: LocationNormalized, dtype: int64

# Beginning of actual Feature Engineering

In [15]:
ContractType_dummies = pd.get_dummies(combined['ContractType'],dummy_na=True, prefix='contract_type') 
Category_dummies = pd.get_dummies(combined['Category'], prefix='category')
ContractTime_dummies = pd.get_dummies(combined['ContractTime'],dummy_na=True, prefix='contract_time')
#créer les variables muettes pour contract time, contract type et category prenant en compte les valeurs nulles

In [16]:
#adding them to the dataframe combined
combined = pd.concat([combined, ContractType_dummies], axis=1)
combined = pd.concat([combined, ContractTime_dummies], axis=1)
combined = pd.concat([combined, Category_dummies], axis=1)

In [17]:
#checking we got the new columns
combined.shape

(15000, 44)

In [18]:
#creating dummies for each town with more than 1% of the obs, and less than 10% of all obs (to exclude UK) 
counts_city_prop = pd.Series.value_counts(combined['LocationNormalized'], dropna=False) / len(combined['LocationNormalized'])
mask = combined['LocationNormalized'].isin(counts_city_prop[counts_city_prop > 0.01].index & counts_city_prop[counts_city_prop < 0.15].index)
LocationNormalized_Copy = combined['LocationNormalized'].copy()
LocationNormalized_Copy[~mask] = "Other_town"
Town_dummies = pd.get_dummies(LocationNormalized_Copy,prefix='dm')

In [19]:
combined = pd.concat([combined, Town_dummies], axis=1)
#adding the dummies from the previously selected towns

In [20]:
combined.head(3)


Unnamed: 0,Id,Title,FullDescription,LocationNormalized,ContractType,ContractTime,Company,Category,SourceName,contract_type_full_time,contract_type_part_time,contract_type_nan,contract_time_contract,contract_time_permanent,contract_time_nan,category_Accounting & Finance Jobs,category_Admin Jobs,category_Charity & Voluntary Jobs,category_Consultancy Jobs,category_Creative & Design Jobs,category_Customer Services Jobs,category_Domestic help & Cleaning Jobs,"category_Energy, Oil & Gas Jobs",category_Engineering Jobs,category_Graduate Jobs,...,category_Legal Jobs,category_Logistics & Warehouse Jobs,category_Maintenance Jobs,category_Manufacturing Jobs,category_Other/General Jobs,"category_PR, Advertising & Marketing Jobs",category_Part time Jobs,category_Property Jobs,category_Retail Jobs,category_Sales Jobs,category_Scientific & QA Jobs,category_Social work Jobs,category_Teaching Jobs,category_Trade & Construction Jobs,category_Travel Jobs,dm_Belfast,dm_Birmingham,dm_Bristol,dm_Leeds,dm_London,dm_Manchester,dm_Other_town,dm_South East London,dm_Surrey,dm_The City
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,Dorking,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,Glasgow,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,Hampshire,,permanent,Gregory Martin International,Engineering Jobs,cv-library.co.uk,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [21]:
# let us drop the features we dont need anymore : 
combined = combined.drop(['FullDescription', 'LocationNormalized', 'ContractType', 'ContractTime', 'Company', 'Category', 'SourceName'],axis=1)

In [22]:
combined.shape

(15000, 47)

In [23]:
combined.head(3)

Unnamed: 0,Id,Title,contract_type_full_time,contract_type_part_time,contract_type_nan,contract_time_contract,contract_time_permanent,contract_time_nan,category_Accounting & Finance Jobs,category_Admin Jobs,category_Charity & Voluntary Jobs,category_Consultancy Jobs,category_Creative & Design Jobs,category_Customer Services Jobs,category_Domestic help & Cleaning Jobs,"category_Energy, Oil & Gas Jobs",category_Engineering Jobs,category_Graduate Jobs,category_HR & Recruitment Jobs,category_Healthcare & Nursing Jobs,category_Hospitality & Catering Jobs,category_IT Jobs,category_Legal Jobs,category_Logistics & Warehouse Jobs,category_Maintenance Jobs,category_Manufacturing Jobs,category_Other/General Jobs,"category_PR, Advertising & Marketing Jobs",category_Part time Jobs,category_Property Jobs,category_Retail Jobs,category_Sales Jobs,category_Scientific & QA Jobs,category_Social work Jobs,category_Teaching Jobs,category_Trade & Construction Jobs,category_Travel Jobs,dm_Belfast,dm_Birmingham,dm_Bristol,dm_Leeds,dm_London,dm_Manchester,dm_Other_town,dm_South East London,dm_Surrey,dm_The City
0,12612628,Engineering Systems Analyst,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,12612830,Stress Engineer Glasgow,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,12612844,Modelling and simulation analyst,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [24]:
#using tftdf to get a matrix of words used. 
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1,2),min_df=30)
x=tfidf_vectorizer.fit_transform(combined['Title'])

In [25]:
#matrix with the 457 words or 2-words selected. 
x

<15000x457 sparse matrix of type '<class 'numpy.float64'>'
	with 50258 stored elements in Compressed Sparse Row format>

In [26]:
#we don't need this column anymore
combined = combined.drop('Title',axis=1)


In [27]:
combined.head(3)

Unnamed: 0,Id,contract_type_full_time,contract_type_part_time,contract_type_nan,contract_time_contract,contract_time_permanent,contract_time_nan,category_Accounting & Finance Jobs,category_Admin Jobs,category_Charity & Voluntary Jobs,category_Consultancy Jobs,category_Creative & Design Jobs,category_Customer Services Jobs,category_Domestic help & Cleaning Jobs,"category_Energy, Oil & Gas Jobs",category_Engineering Jobs,category_Graduate Jobs,category_HR & Recruitment Jobs,category_Healthcare & Nursing Jobs,category_Hospitality & Catering Jobs,category_IT Jobs,category_Legal Jobs,category_Logistics & Warehouse Jobs,category_Maintenance Jobs,category_Manufacturing Jobs,category_Other/General Jobs,"category_PR, Advertising & Marketing Jobs",category_Part time Jobs,category_Property Jobs,category_Retail Jobs,category_Sales Jobs,category_Scientific & QA Jobs,category_Social work Jobs,category_Teaching Jobs,category_Trade & Construction Jobs,category_Travel Jobs,dm_Belfast,dm_Birmingham,dm_Bristol,dm_Leeds,dm_London,dm_Manchester,dm_Other_town,dm_South East London,dm_Surrey,dm_The City
0,12612628,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,12612830,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,12612844,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [28]:
combined.shape

(15000, 46)

In [29]:
#making a dataframe from what we obtained
description_matrix = pd.DataFrame(x.toarray(), columns=tfidf_vectorizer.get_feature_names())

In [30]:
description_matrix

Unnamed: 0,2nd,aberdeen,account,account executive,account manager,accountant,accounts,accounts assistant,administrator,adult,adults,advisor,advisors,agency,agent,agile,analyst,application,architect,area,asbestos,aspnet,assessor,assistant,assistant job,...,training,travel,travel consultant,turner,uk,unit,unit manager,urgent,vehicle,venue,wanted,water,web,web developer,west,west london,windows,winning,work,worker,worker job,worker jobs,workers,year,yorkshire
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.463291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.463291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.463291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.373401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.591948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
description_matrix.sum().sort_values

<bound method Series.sort_values of 2nd                    17.601521
aberdeen               18.057897
account               140.852512
account executive      23.443130
account manager       100.586090
accountant             85.042246
accounts               54.193251
accounts assistant     20.162357
administrator         179.242933
adult                  28.072622
                         ...    
west london            15.873690
windows                22.403263
winning                15.002031
work                   33.036466
worker                205.137009
worker job             64.125982
worker jobs            31.944116
workers                72.630530
year                   35.049066
yorkshire              27.332231
dtype: float64>

In [32]:
#adding our matrix obtained with tfidf to the combined dataframe
combined = pd.concat([combined, description_matrix], axis=1)

In [33]:
combined.shape



(15000, 503)

In [34]:
#getting rid of the id because this variable wont help to predict the test dataset. 
combined = combined.drop('Id',axis=1)

In [35]:
combined.head(3)

Unnamed: 0,contract_type_full_time,contract_type_part_time,contract_type_nan,contract_time_contract,contract_time_permanent,contract_time_nan,category_Accounting & Finance Jobs,category_Admin Jobs,category_Charity & Voluntary Jobs,category_Consultancy Jobs,category_Creative & Design Jobs,category_Customer Services Jobs,category_Domestic help & Cleaning Jobs,"category_Energy, Oil & Gas Jobs",category_Engineering Jobs,category_Graduate Jobs,category_HR & Recruitment Jobs,category_Healthcare & Nursing Jobs,category_Hospitality & Catering Jobs,category_IT Jobs,category_Legal Jobs,category_Logistics & Warehouse Jobs,category_Maintenance Jobs,category_Manufacturing Jobs,category_Other/General Jobs,...,training,travel,travel consultant,turner,uk,unit,unit manager,urgent,vehicle,venue,wanted,water,web,web developer,west,west london,windows,winning,work,worker,worker job,worker jobs,workers,year,yorkshire
0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
#Now separating the train set and the test set
def recover_train_test_target():
    global combined
    
    train0 = pd.read_csv("train_v2.csv", sep=';',encoding='latin')
    
    targets = train0.SalaryNormalized
    train = combined.ix[0:9999]
    test = combined.ix[10000:14999]
    
    return train,test,targets

In [37]:
#applying the function
train,test,targets = recover_train_test_target()

In [38]:
train.shape #we indeed find again the 10 000 obs

(10000, 502)

In [39]:
test.shape

(5000, 502)

In [40]:
targets

0       25000
1       30000
2       30000
3       27500
4       25000
5       25000
6       75000
7       22000
8       23000
9       85000
        ...  
9990    60000
9991    60000
9992    47000
9993    55000
9994    55000
9995    52000
9996    60000
9997    50000
9998    47500
9999    29000
Name: SalaryNormalized, dtype: int64

In [41]:
targets.shape

(10000,)

In [42]:
test0 = pd.read_csv("test_v2.csv", sep=';',encoding='latin') #will be used later to get the Id and export it to our csv file. 

# Predctions

In [43]:
# importing all the functions we might need in our predictions

from sklearn.linear_model import Lasso 
from sklearn.linear_model import Ridge
from sklearn.cross_validation import StratifiedKFold
from sklearn import grid_search
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.ensemble import GradientBoostingRegressor


In [44]:
X_train=train.values

In [45]:
X_test=test.values

## Cross Validation

In [46]:
cross_validation = KFold(n=1000,n_folds=10,shuffle=True)

## The models with grid_search

### Lasso

In [47]:
#in order to train our model, we will use gridsearch to get the best hyperparameters. 

parameters = {'alpha':[0.1, 1.0, 10.0, 20.0]}

model = Lasso()

gs_lasso = grid_search.GridSearchCV(
    model, 
    parameters,
    cv=cross_validation,
    verbose=1,
    scoring='mean_absolute_error')


gs_lasso.fit(X_train, targets)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    6.0s finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=1000, n_folds=10, shuffle=True, random_state=None),
       error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 1.0, 10.0, 20.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_absolute_error',
       verbose=1)

In [48]:
#looking at the mae score
gs_lasso.best_score_

-5689.9682722467142

In [49]:
#looking at the best hyperparameters found by gridsearch
gs_lasso.best_estimator_

Lasso(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [50]:
#applying our model to predict the Salary of the table test
predictions_lasso = gs_lasso.predict(X_test)

In [51]:
#preparing the table which will be exported for submission to the Kaggle challenge. 
predictions_lasso
result_lasso=pd.DataFrame()
result_lasso["Id"]=test0["Id"]
result_lasso["PredictedSalary"]=predictions_lasso
result_lasso.head()

Unnamed: 0,Id,PredictedSalary
0,72629919,29965.85426
1,72629930,38460.55286
2,72629937,50273.514907
3,72629938,33964.031061
4,72629944,29063.533443


In [52]:
result_lasso.to_csv("result_lasso.csv",index=False)

### Ridge

In [53]:
#same processus than the Lasso. 
parameters = {'alpha':[0.1, 1.0, 5.0, 10.0,]}

model = Ridge()

gs_ridge = grid_search.GridSearchCV(
    model, 
    parameters,
    cv=cross_validation,
    verbose=1,
    scoring='mean_absolute_error')


gs_ridge.fit(X_train, targets)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    3.8s finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=1000, n_folds=10, shuffle=True, random_state=None),
       error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [0.1, 1.0, 5.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_absolute_error',
       verbose=1)

In [54]:
gs_ridge.best_score_

-5632.9614681163703

In [55]:
gs_ridge.best_estimator_

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [56]:
predictions_ridge = gs_ridge.predict(X_test)

In [57]:
predictions_ridge

array([ 28450.88649303,  39242.47218598,  53606.12920647, ...,
        37116.66821255,  30232.2719354 ,  29167.03787904])

In [60]:
result_ridge=pd.DataFrame()
result_ridge["Id"]=test0["Id"]
result_ridge["PredictedSalary"]=predictions_ridge
result_ridge.head()

Unnamed: 0,Id,PredictedSalary
0,72629919,28450.886493
1,72629930,39242.472186
2,72629937,53606.129206
3,72629938,30011.927297
4,72629944,30169.495053


In [61]:
result_ridge.to_csv("result_ridge.csv",index=False)

In [74]:
result_ridge.shape

(5000, 2)

In [None]:
#9918.16917 Kaggle avec Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
                           #normalize=False, random_state=None, solver='auto', tol=0.001)

### Decision Tree

In [108]:
model = DecisionTreeRegressor(max_features='auto')


parameters = {
    'max_depth':[2, 10, None],
    'min_samples_split': [5, 10, 25]
    }

gs_tree = grid_search.GridSearchCV(
    model, 
    parameters,
    cv=cross_validation,
    verbose=1,
    scoring='mean_absolute_error' 
)
gs_tree.fit(X_train, targets)

Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    7.7s
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   16.1s finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=1000, n_folds=10, shuffle=True, random_state=None),
       error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [5, 10, 25], 'max_depth': [2, 10, None]},
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_absolute_error',
       verbose=1)

In [109]:
gs_tree.best_score_

-5350.4319000580426

In [110]:
gs_tree.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [111]:
predictions_tree = gs_tree.predict(X_test)

In [112]:
predictions_tree

array([ 26500.        ,  35000.        ,  60416.66666667, ...,
        34023.5       ,  30725.        ,  31510.        ])

In [115]:
result_tree=pd.DataFrame()
result_tree["Id"]=test0["Id"]
result_tree["PredictedSalary"]=predictions_tree
result_tree.head()

Unnamed: 0,Id,PredictedSalary
0,72629919,26500.0
1,72629930,35000.0
2,72629937,60416.666667
3,72629938,80000.0
4,72629944,29500.0


In [116]:
result_tree.to_csv("result_tree.csv",index=False)

### RadomForest

In [63]:
model = RandomForestRegressor(n_jobs=2,verbose=1)

cross_validation_rf = KFold(n=2000,n_folds=5,shuffle=True)

parameter_grid = {  
                 'max_depth' : [8,15,None],
                 'n_estimators': [10,100,200],
                 'min_samples_split':[2,3]
                 
                 }


gs_rf = grid_search.GridSearchCV(model,
                           param_grid=parameter_grid,
                           cv=cross_validation_rf
                           )

gs_rf.fit(X_train, targets) 

[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]

GridSearchCV(cv=sklearn.cross_validation.KFold(n=1000, n_folds=10, shuffle=True, random_state=None),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=2, oob_score=False, random_state=None,
           verbose=1, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 100, 200], 'max_depth': [8, 15, None], 'min_samples_split': [2, 3]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [65]:
gs_rf.best_estimator_

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=2, oob_score=False, random_state=None,
           verbose=1, warm_start=False)

In [66]:
gs_rf.best_score_

0.62121724198483241

In [68]:
predictions_rf = gs_rf.predict(X_test)

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished


In [69]:
predictions_rf

array([ 34606.40756877,  35202.08333333,  57255.        , ...,
        35459.87083333,  31178.        ,  32632.28333333])

In [70]:
result_rf=pd.DataFrame()
result_rf["Id"]=test0["Id"]
result_rf["PredictedSalary"]=predictions_rf
result_rf.head()

Unnamed: 0,Id,PredictedSalary
0,72629919,34606.407569
1,72629930,35202.083333
2,72629937,57255.0
3,72629938,31562.57
4,72629944,26611.445064


In [71]:
result_rf.to_csv("result_rf3.csv",index=False)

#Kaggle result = 11173.16157  with  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
                   # max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
                 # min_samples_split=2, min_weight_fraction_leaf=0.0,
                 # n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
                  #verbose=1, warm_start=False)
    
    
#Kaggle result = 9913.22709  with RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                              # max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
                              # min_samples_split=2, min_weight_fraction_leaf=0.0,
                              # n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
                               #verbose=1, warm_start=False)

### Gradient boosting

In [62]:
model = GradientBoostingRegressor(verbose=1)

cross_validation = KFold(n=1000,n_folds=10,shuffle=True)


parameters = {
     'max_depth':[3,5,10],
     'learning_rate': [0.1,0.1,1],
     'n_estimators': [10,100,200]
     }

gs_grad = grid_search.GridSearchCV(
      model, 
      parameters,
      cv=cross_validation,
      scoring='mean_absolute_error' 
 )
 
   
gs_grad.fit(X_train, targets)

      Iter       Train Loss   Remaining Time 
         1        8885.2167            0.05s
         2        8519.8507            0.05s
         3        8207.3732            0.04s
         4        7965.8997            0.03s
         5        7763.1948            0.03s
         6        7561.1833            0.02s
         7        7386.1030            0.02s
         8        7208.3661            0.01s
         9        7062.2629            0.01s
        10        6941.0609            0.00s
      Iter       Train Loss   Remaining Time 
         1        9174.2904            0.05s
         2        8804.2925            0.04s
         3        8542.2294            0.04s
         4        8255.0060            0.03s
         5        8053.2097            0.03s
         6        7857.4111            0.02s
         7        7697.1934            0.02s
         8        7560.3009            0.01s
         9        7385.6165            0.01s
        10        7231.9438            0.00s
      It

ValueError: learning_rate must be greater than 0 but was 0

In [96]:
gs_grad.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [95]:
gs_grad.best_score_

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [91]:
predictions_grad = gs_grad.predict(X_test)

In [92]:
result_grad=pd.DataFrame()
result_grad["Id"]=test0["Id"]
result_grad["PredictedSalary"]=predictions_grad
result_grad.head()

Unnamed: 0,Id,PredictedSalary
0,72629919,28877.068533
1,72629930,35602.96049
2,72629937,50509.783515
3,72629938,32996.096884
4,72629944,28121.649414


In [93]:
result_grad.to_csv("result_grad2.csv",index=False)

### Nearest Neighbors

In [73]:
from sklearn.neighbors import KNeighborsRegressor

In [77]:
model = KNeighborsRegressor()


parameters = {
     'n_neighbors':[3,5,10,20],
     }

gs_knn = grid_search.GridSearchCV(
      model, 
      parameters,
      cv=cross_validation,
      scoring='mean_absolute_error', 
      verbose=1
 )
 
   
gs_knn.fit(X_train, targets)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    3.1s finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=1000, n_folds=10, shuffle=True, random_state=None),
       error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [3, 5, 10, 20]}, pre_dispatch='2*n_jobs',
       refit=True, scoring='mean_absolute_error', verbose=1)

In [79]:
gs_knn.best_score_

-6179.8456999999999

In [81]:
gs_knn.best_estimator_

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=10, p=2,
          weights='uniform')

In [84]:
predictions_knn = gs_knn.predict(X_test)

In [85]:
result_knn=pd.DataFrame()
result_knn["Id"]=test0["Id"]
result_knn["PredictedSalary"]=predictions_knn
result_knn.head()

Unnamed: 0,Id,PredictedSalary
0,72629919,32782.0
1,72629930,35600.0
2,72629937,51500.0
3,72629938,36150.0
4,72629944,27933.6


In [86]:
result_knn.to_csv("result_knn.csv",index=False)