In [73]:
import os

import pandas as pd
from scipy.sparse import hstack, coo_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [74]:
# Directory holding the salary CSV files. Set the SALARY_DATA_DIR environment
# variable to point somewhere else; the original location stays the default.
# os.path.join(..., "") guarantees a trailing separator, which is required
# because file names are appended to this string with `+`.
path_to_data = os.path.join(os.environ.get("SALARY_DATA_DIR", "C:/study/data/"), "")

In [75]:
# Read the training set and the small test set from the data directory.
df_train, df_test = (
    pd.read_csv(path_to_data + file_name)
    for file_name in ("salary-train.csv", "salary-test-mini.csv")
)

In [76]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [77]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


#### Preprocessing the text field

In [78]:
# Normalise the description text identically in BOTH frames: lower-case it and
# replace every character that is not a Latin letter or a digit with a space.
# The vectorizer is fitted on the cleaned training text, so the test text must
# go through the same cleaning or its tokens will not line up with the fitted
# vocabulary.
# The vectorised `.str.lower()` replaces the per-row lambda and passes missing
# values through instead of raising on them.
for frame in (df_train, df_test):
    frame['FullDescription'] = (
        frame['FullDescription']
        .str.lower()
        .replace('[^a-zA-Z0-9]', ' ', regex=True)
    )
df_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355


#### Filling in missing values

In [79]:
# Replace missing categorical values with the literal string 'nan' so that a
# missing value becomes its own category when the columns are encoded.
# The result is assigned back instead of calling fillna(inplace=True) on a
# selected column: that chained in-place form is deprecated in recent pandas
# and silently leaves the frame unchanged under Copy-on-Write.
# The test frame gets the same treatment so both frames are encoded alike.
categorical_columns = ['LocationNormalized', 'ContractTime']
for frame in (df_train, df_test):
    frame[categorical_columns] = frame[categorical_columns].fillna('nan')

In [80]:
# Show the training frame again after the fill step on the categorical columns.
df_train.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355


#### Encoding the categorical features

In [81]:
# Turn the two categorical columns into per-row dicts and vectorise them.
# The encoder is fitted on the training rows only and then reused for the
# test rows, so both matrices share the same columns.
categorical_fields = ['LocationNormalized', 'ContractTime']
train_records = df_train[categorical_fields].to_dict('records')
test_records = df_test[categorical_fields].to_dict('records')

enc = DictVectorizer()
X_train_categ = enc.fit_transform(train_records)
X_test_categ = enc.transform(test_records)

In [82]:
# (rows, columns) of the encoded categorical matrix for the training set.
X_train_categ.shape

(60000, 1766)

In [83]:
# (rows, columns) of the encoded categorical matrix for the test set.
X_test_categ.shape

(2, 1766)

In [84]:
# Build TF-IDF features from the job descriptions. The vectorizer is fitted on
# the training text only (min_df=5) and the fitted vocabulary is reused for the
# test text, so both matrices share the same columns.
vectorizer = TfidfVectorizer(min_df=5)
X = vectorizer.fit_transform(df_train['FullDescription'])
X_test = vectorizer.transform(df_test['FullDescription'])
# Kept as the cell's last expression so the training shape is actually shown;
# in the middle of the cell it was evaluated and discarded.
X.shape

In [85]:
# (rows, columns) of the TF-IDF matrix for the test descriptions.
X_test.shape

(2, 22861)

#### Object-feature matrix F

In [86]:
# Join the TF-IDF text features and the encoded categorical features
# column-wise into a single sparse training matrix (text columns first).
F = hstack([X, X_train_categ])

In [87]:
# Regression target: the normalised salary of every training row.
# Read from df_train, the frame the features were built from. The bare name
# `df` is never defined in this notebook, so it fails on a fresh kernel.
y = df_train['SalaryNormalized']

#### Fitting ridge regression

In [88]:
# Ridge regression model with regularisation strength alpha=1.0 and a fixed
# random_state so repeated runs use the same seed.
clf = Ridge(alpha=1.0, random_state=241)

In [89]:
# Train the ridge model on the combined feature matrix F and the targets y.
clf.fit(F, y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=241, solver='auto', tol=0.001)

#### Building predictions

In [90]:
# Build the test feature matrix with the same column order as the training
# matrix: TF-IDF text features first, then the encoded categorical features.
F_test = hstack([X_test, X_test_categ])

In [93]:
# Predict a value for every row of the test feature matrix.
a = clf.predict(F_test)

In [95]:
# Round every prediction to two decimal places, then display the list.
a = list(map(lambda prediction: round(prediction, 2), a))
a

[56567.06, 37140.51]