In [151]:
import pandas as pd
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
import numpy as np

In [37]:
dataset = pd.read_csv("https://s3-us-west-2.amazonaws.com/jobseekersregression/steves_labels.tsv", sep='\t')

In [153]:
valuable_columns = [
    "Duration numeric",
    "Level numeric",
    "Size numeric",
    "Revenue numeric",
    "OrgSize numeric",
    "Quality of Network"
]

label_columns = [
    "Potential_success",
    "Increase_revenue"
]

for column in valuable_columns + label_columns:
    dataset[column] = pd.to_numeric(dataset[column])

In [154]:
features = dataset[valuable_columns]
labels = dataset[label_columns]
validation_size = 0.33
seed = 7

features_train, features_validation, labels_train, labels_validation =\
    cross_validation.train_test_split(features, labels, test_size=validation_size, random_state=seed)

In [155]:
linear_regression = LinearRegression()
linear_regression.fit(features_train, labels_train)
linear_regression.score(features, labels)

0.71957307345281873

In [156]:
weights = {
            "Duration numeric":
                {
                    "Less than 1 year": 1,
                    "1-2 years": 2,
                    "2+ years": 3
                },
            "Level numeric":
                {
                    "Title: Less than Director": 1,
                    "Title: Sr Director": 2,
                    "Title: VP or above": 3
                },
            "Size numeric":
                {
                    "Less than 400": 1,
                    "400-500": 2,
                    "500+": 3
                },
            "Revenue numeric":
                {
                    "Less than 10%": 1,
                    "Less than 20%": 2,
                    "21%+": 3
                },
            "OrgSize numeric":
                {
                    "100-500": 1,
                    "10-100": 2,
                    "500+": 3
                },
            "Quality of Network":
                {
                    1: 1,
                    3: 3,
                    5: 5
                }
        }

def quantify_features(candidates):
    for column in candidates:
        candidates[column] = candidates[column].apply(lambda x: weights[column][x])

In [157]:
bad_candidate = pd.DataFrame({
    "Duration numeric": ["Less than 1 year"],
    "Level numeric": ["Title: Less than Director"],
    "Size numeric": ["Less than 400"],
    "Revenue numeric": ["Less than 10%"],
    "OrgSize numeric": ["100-500"],
    "Quality of Network": 1
})

quantify_features(bad_candidate)
predictions = linear_regression.predict(bad_candidate)

print("Potential of success in role %% %f" % predictions[0][0])
print("Increase revenue in 12 months %% %f" % predictions[0][1])

Potential of success in role % 21.290186
Increase revenue in 12 months % -5.874767


In [158]:
medium_candidate = pd.DataFrame({
    "Duration numeric": ["1-2 years"],
    "Level numeric": ["Title: Sr Director"],
    "Size numeric": ["400-500"],
    "Revenue numeric": ["Less than 20%"],
    "OrgSize numeric": ["10-100"],
    "Quality of Network": 3
})

quantify_features(medium_candidate)
predictions = linear_regression.predict(medium_candidate)

print("Potential of success in role %% %f" % predictions[0][0])
print("Increase revenue in 12 months %% %f" % predictions[0][1])

Potential of success in role % 65.859530
Increase revenue in 12 months % 57.635717


In [159]:
good_candidate = pd.DataFrame({
    "Duration numeric": ["2+ years"],
    "Level numeric": ["Title: VP or above"],
    "Size numeric": ["500+"],
    "Revenue numeric": ["21%+"],
    "OrgSize numeric": ["500+"],
    "Quality of Network": 5
})

quantify_features(good_candidate)
predictions = linear_regression.predict(good_candidate)

print("Potential of success in role %% %f" % predictions[0][0])
print("Increase revenue in 12 months %% %f" % predictions[0][1])

Potential of success in role % 110.428874
Increase revenue in 12 months % 121.146200
