In [319]:
import string
import re
import csv
import numpy as np
import pandas as pd
import sklearn
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.linear_model import Ridge
from sklearn import pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [320]:
class LinFitTransformer(BaseEstimator, RegressorMixin): 
    
    def __init__(self, fieldname):
        self.fieldname = fieldname

    def fit(self, X, y):   
        self.X = X[self.fieldname].values.reshape(-1,1)
        self.y = y
        self.model = LinearRegression()
        self.model.fit(self.X, self.y)
        return self
        
    def predict(self, X):
        self.X = X[self.fieldname].values.reshape(-1,1)
        self.prediction = self.model.predict(self.X)
        return self.prediction

In [321]:
class CategTransformer(BaseEstimator, RegressorMixin):
    
    def __init__(self, fieldname):
        self.fieldname = fieldname

    def fit(self, X, y):   
        self.df = X
        self.X = X[self.fieldname].values.reshape(-1,1)
        self.y = y    
        self.means = self.df.groupby([self.fieldname])['target'].mean()
        self.new_df = self.df.join(self.means, on=self.fieldname,rsuffix='_mean')
        return self
        
    def predict(self, X):
        self.df = X
        self.X = X[self.fieldname].values.reshape(-1,1)
        self.new_df = self.df.join(self.means, on=self.fieldname,rsuffix='_mean')
        try:
            self.prediction = self.new_df['target_mean'].values.reshape(-1,1)
        except:
            self.prediction = self.new_df['target'].values.reshape(-1,1)
        return self.prediction

In [322]:
class FullModelTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, X):
        self.est = X
       
    def fit(self, X, y):
        self.est.fit(X, y)
        return self

    def transform(self, X):
        return self.est.predict(X)


In [323]:
transform_dict = {'job type':1, 'years exp': 1, 'location': 1, 
                  'degree': 1, 'major':1, 'industry':1}

all_features = pipeline.FeatureUnion([
  ('job type', FullModelTransformer(CategTransformer('jobtype'))),
  ('years exp', FullModelTransformer(LinFitTransformer('yearsexperience'))),
  ('location', FullModelTransformer(LinFitTransformer('milesfrommetropolis'))),
  ('degree', FullModelTransformer(CategTransformer('degree'))),
  ('major', FullModelTransformer(CategTransformer('major'))), 
  ('industry', FullModelTransformer(CategTransformer('industry')))
        ],
  transformer_weights=transform_dict)

k_union = pipeline.Pipeline([
 ("features", all_features),
 ('modelfit', KNeighborsRegressor(n_neighbors=3))
 #("linreg", LinearRegression(fit_intercept=True))
    ])

In [324]:
train_file = 'train_data.csv'
blind_file = 'test_features_2013-03-07.csv'

train_df = pd.read_csv(train_file)
blind_df  = pd.read_csv(blind_file)

In [325]:
print(train_df.shape)
print(blind_df.shape)

(1000000, 9)
(1000000, 8)


In [326]:
train_df.head(5)

Unnamed: 0,jobid,companyid,jobtype,degree,major,industry,yearsexperience,milesfrommetropolis,salary
0,JOB1362684407695,COMP20,JANITOR,HIGH_SCHOOL,NONE,EDUCATION,1,54,31
1,JOB1362684407911,COMP61,SENIOR,MASTERS,COMPSCI,FINANCE,14,38,165
2,JOB1362684407919,COMP50,JUNIOR,MASTERS,MATH,WEB,15,96,105
3,JOB1362684408027,COMP33,SENIOR,HIGH_SCHOOL,NONE,FINANCE,14,3,158
4,JOB1362684408130,COMP33,MANAGER,HIGH_SCHOOL,NONE,HEALTH,7,5,95


In [327]:
for col in blind_df.columns:
    blind_df.rename(columns={col: col.lower()}, inplace=True)

In [328]:
blind_df.head(5)

Unnamed: 0,jobid,companyid,jobtype,degree,major,industry,yearsexperience,milesfrommetropolis
0,JOB1362685407687,COMP33,MANAGER,HIGH_SCHOOL,NONE,HEALTH,22,73
1,JOB1362685407688,COMP13,JUNIOR,NONE,NONE,AUTO,20,47
2,JOB1362685407689,COMP10,CTO,MASTERS,BIOLOGY,HEALTH,17,9
3,JOB1362685407690,COMP21,MANAGER,HIGH_SCHOOL,NONE,OIL,14,96
4,JOB1362685407691,COMP36,JUNIOR,DOCTORAL,BIOLOGY,OIL,10,44


In [329]:
X_total = train_df[train_df['target'] > 0]
y_total = X_total['target']
print(X_total.shape)
print(y_total.shape)

(999995, 9)
(999995,)


In [330]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_total, y_total, test_size=0.2)

In [331]:
k_union.fit(X_train, y_train.values.reshape(-1,1))

Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('job type', FullModelTransformer(X=None)), ('years exp', FullModelTransformer(X=None)), ('location', FullModelTransformer(X=None)), ('degree', FullModelTransformer(X=None)), ('major', FullModelTransformer(X=None)), ('industry', Ful...nkowski',
          metric_params=None, n_jobs=1, n_neighbors=3, p=2,
          weights='uniform'))])

In [332]:
print k_union.score(X_train, y_train.values.reshape(-1,1))

0.839002519415


In [333]:
k_union.fit(X_test, y_test.values.reshape(-1,1))

Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('job type', FullModelTransformer(X=None)), ('years exp', FullModelTransformer(X=None)), ('location', FullModelTransformer(X=None)), ('degree', FullModelTransformer(X=None)), ('major', FullModelTransformer(X=None)), ('industry', Ful...nkowski',
          metric_params=None, n_jobs=1, n_neighbors=3, p=2,
          weights='uniform'))])

In [334]:
print k_union.score(X_test, y_test.values.reshape(-1,1))

0.838818761908


In [347]:
result = k_union.predict(blind_df)

In [336]:
blind_df.head(5)

Unnamed: 0,jobid,companyid,jobtype,degree,major,industry,yearsexperience,milesfrommetropolis
0,JOB1362685407687,COMP33,MANAGER,HIGH_SCHOOL,NONE,HEALTH,22,73
1,JOB1362685407688,COMP13,JUNIOR,NONE,NONE,AUTO,20,47
2,JOB1362685407689,COMP10,CTO,MASTERS,BIOLOGY,HEALTH,17,9
3,JOB1362685407690,COMP21,MANAGER,HIGH_SCHOOL,NONE,OIL,14,96
4,JOB1362685407691,COMP36,JUNIOR,DOCTORAL,BIOLOGY,OIL,10,44


In [348]:
blind_df['target'] = result
header = ["jobid", "target"]
blind_df.to_csv('test_target.csv', columns = header, index=False)

In [349]:
result = k_union.predict(X_total)

In [339]:
X_total['target_pred'] = result
header = ["jobid", "target_pred"]
X_total.to_csv('train_target_pred.csv', columns = header, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [340]:
import matplotlib
import matplotlib.pyplot as plt

In [341]:
x1 = X_total['target'].values.reshape(-1,1)
y1 = X_total['target_pred'].values.reshape(-1,1)
y2 = blind_df['target'].values.reshape(-1,1)
xline = [0,100,200,300]
yline = xline

In [342]:
plt.cla()
plt.clf()
plt.plot(x1[0], y1[0], 'ko', label='train target')
plt.plot(x1, y1, 'ko')
plt.plot(xline[0], yline[0], linewidth=5.0, color='b', label='line of perfect fit')
plt.plot(xline, yline, color='b',linewidth=5.0)
plt.xlabel('actual target')
plt.ylabel('predicted target')
plt.xlim([0,350])
plt.ylim([0,350])
plt.legend()
plt.savefig('target.png')

In [343]:
plt.cla()
plt.clf()
plt.hist(x1, normed=1, alpha=0.2, label='train actual target')
plt.hist(y1, normed=1, alpha=0.2, label='test predicted target')
plt.xlabel('target')
plt.ylabel('% of job listings')
plt.legend()
plt.savefig('target_hist.png')