In [1]:
import numpy as np 
import pandas as pd 
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.feature_extraction import DictVectorizer


# Read the training set and the small sample we will predict salaries for
# (both CSV files are expected in the notebook's working directory).
df_orig, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('salary-train.csv', 'salary-test-mini.csv')
)
# Peek at the first rows of the training data
df_orig.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


### Preprocessing

In [2]:
# All further operations run on a real copy of the original data frame.
# A plain `df = df_orig` would only create a second name for the same object,
# so any in-place preprocessing would also rewrite `df_orig`.
df = df_orig.copy()
# Normalise text columns: lower-case, strip punctuation, fill missing categories
def preproc(df, col_ind, fill_cols=('LocationNormalized', 'ContractTime')):
    """
    Preprocessing stage.

    Returns a cleaned copy of ``df``; the frame passed in is left untouched.

    df -- DataFrame
    col_ind -- positional indices of the (string) columns one wants to transform:
               each is lower-cased and every character outside [a-zA-Z0-9]
               is replaced by a single space
    fill_cols -- names of the columns whose missing values are replaced by the
                 literal string 'nan' (defaults to the two categorical columns
                 used by this notebook); a missing column raises KeyError

    Returns the preprocessed DataFrame.
    """
    # Work on a copy so that calling the function (or re-running the cell)
    # never changes the caller's data behind its back.
    out = df.copy()

    # ----- Preprocessing -----
    for ii in col_ind:
        col = out.columns[ii]
        out[col] = out[col].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)

    # Replace NaN/null with the string 'nan'.
    # Assign the result back instead of `out[col].fillna(..., inplace=True)`:
    # an in-place call on a column selection is chained assignment and does
    # not update the frame under pandas Copy-on-Write.
    for col in fill_cols:
        out[col] = out[col].fillna('nan')

    return out

# Positional indices of the columns whose text preproc should normalise;
# the same selection is applied to the training and the test frame.
text_col_idx = [0, 1, 2]
df_train, df_test = preproc(df, text_col_idx), preproc(df_test, text_col_idx)

In [4]:
# ----- Turn the text features into numeric feature matrices -----
# TF-IDF weights for the free-text description; the vocabulary is learned on
# the training set only and then re-used for the test set
# (min_df=5: see the TfidfVectorizer docs for the exact cut-off semantics).
vectorizer = TfidfVectorizer(min_df=5)
X_1_train = vectorizer.fit_transform(df_train['FullDescription'])
X_1_test = vectorizer.transform(df_test['FullDescription'])

# One-hot encoding for location and contract time
categorical_cols = ['LocationNormalized', 'ContractTime']
train_records = df_train[categorical_cols].to_dict('records')
test_records = df_test[categorical_cols].to_dict('records')
dict_vec = DictVectorizer()
X_2_train = dict_vec.fit_transform(train_records)
X_2_test = dict_vec.transform(test_records)

# Put the two sparse blocks side by side: [tf-idf | one-hot]
X_train = hstack([X_1_train, X_2_train])
X_test = hstack([X_1_test, X_2_test])


In [7]:
# Train the model.
# The target is read from df_train -- the same frame the feature matrix
# X_train was built from -- so the rows of X_train and y are guaranteed to
# line up even if `df` and `df_train` stop being the same object.
y = df_train['SalaryNormalized']
regr = Ridge(alpha=1, random_state=241)
regr.fit(X_train, y)
# Predict salaries for the test sample (y_test holds predictions, not labels)
y_test = regr.predict(X_test)
print(y_test)

[56576.85843852 37136.30172263]


In [25]:
# Save the predictions to a file (disabled -- uncomment the lines below to write salary_ans.txt)
# txt_file = open('salary_ans.txt', 'w')
# for sal in y_test:
#     txt_file.write('%.2f ' % sal)
# txt_file.close()