In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the data

In [1]:
# Folder holding the three CSV files; edit this one line to run the notebook
# against a different copy of the data.
DATA_DIR = Path(r'C:\Users\16129\Desktop\data')

# Training features, training salaries (the target) and test features.
train_feature_df = pd.read_csv(DATA_DIR / 'train_features.csv')
train_target_df = pd.read_csv(DATA_DIR / 'train_salaries.csv')
test_feature_df = pd.read_csv(DATA_DIR / 'test_features.csv')

# Examine the data

In [2]:
# Preview the first 10 rows of each raw table. `display` is used so that all
# three previews render; a cell only shows its last bare expression.
display(train_feature_df.head(10))
display(test_feature_df.head(10))
display(train_target_df.head(10))

# Use .info() to see length and dtypes

In [3]:
# Print the .info() summary (length, columns, dtypes) of each table in turn.
for frame in (train_feature_df, train_target_df, test_feature_df):
    frame.info()

# Check for duplicates

In [4]:
# Count fully duplicated rows in each table and show the three counts together;
# separate bare expressions would only render the last one.
pd.Series(
    {
        'train_features': train_feature_df.duplicated().sum(),
        'train_salaries': train_target_df.duplicated().sum(),
        'test_features': test_feature_df.duplicated().sum(),
    },
    name='duplicate_rows',
)

# Identify numerical and categorical variables

In [6]:
# List the feature column names so they can be split into numeric/categorical.
train_feature_df.columns

In [7]:
# Columns treated as numerical features.
numeric_cols = ['yearsExperience', 'milesFromMetropolis']

In [8]:
# Columns treated as categorical features (jobId is also the merge key used later).
categorical_cols = ['jobId', 'companyId', 'jobType', 'degree', 'major', 'industry']

# Summarize numerical and categorical variables separately

In [9]:
# Summary statistics restricted to numeric-dtype columns.
train_feature_df.describe(include = [np.number])

In [10]:
# Summary statistics restricted to object-dtype columns ('O').
train_feature_df.describe(include = ['O'])

# Merge features and targets into a single df (optional: delete original dfs)

In [11]:
# Join each feature row to its salary row using the shared jobId key.
train_df = train_feature_df.merge(train_target_df, on='jobId')

In [None]:
# Drop the two source frames now that train_df holds the merged data.
# Cells above that reference them need the load cell re-run first.
del train_feature_df, train_target_df

In [None]:
# Check length and dtypes of the merged training frame.
train_df.info()

In [12]:
# Preview the first rows of the merged training frame.
train_df.head()

# Visualize target variable (salary)

In [None]:
# Salary distribution: box plot on the left, histogram with density on the right.
fig, (ax_box, ax_dist) = plt.subplots(1, 2, figsize=(14, 6))

# Pass the data as `x=` explicitly: how seaborn interprets a bare positional
# argument here differs between seaborn releases.
sns.boxplot(x=train_df.salary, ax=ax_box)
ax_box.set_title('Salary box plot')

# NOTE(review): distplot is deprecated in newer seaborn releases; move to
# histplot/displot once the environment's seaborn version is confirmed.
sns.distplot(train_df.salary, bins=20, ax=ax_dist)
ax_dist.set_title('Salary distribution')

plt.show()

# Use IQR rule to identify potential outliers

In [None]:
# Summary statistics of salary; the quartiles feed the 1.5 * IQR rule below.
stat = train_df.salary.describe()
print(stat)

first_quartile, third_quartile = stat['25%'], stat['75%']
IQR = third_quartile - first_quartile
# Values beyond these bounds are suspected outliers.
upper = third_quartile + 1.5 * IQR
lower = first_quartile - 1.5 * IQR
print('The upper and lower bounds for suspected outliers are {} and {}.'.format(upper, lower))

# Examine potential outliers

In [None]:
# Inspect rows below the IQR lower bound. Uses the computed `lower` instead of
# a copied number so the check cannot drift from the rule it implements.
train_df[train_df.salary < lower]

In [None]:
# Job types of rows above the IQR upper bound. Uses the computed `upper`
# instead of a copied number so the check matches the rule it implements.
train_df.loc[train_df.salary > upper, 'jobType'].value_counts()

In [None]:
# Most suspicious high-salary rows: JUNIOR roles above the computed IQR upper bound.
train_df[(train_df.salary > upper) & (train_df.jobType == 'JUNIOR')]

These entries with zero salary do not appear to be volunteer positions. We are confident that they are instances of missing/corrupt data and should be removed from the training set.

The high-salary potential outliers all appear to be legitimate data. Most roles are C-level executive roles and the junior positions are in industries that are well known for high salaries (oil, finance). We determine these entries to be legitimate and will not remove them.

In [None]:
# Remove the zero-salary rows by keeping only salaries above the IQR lower bound.
# .copy() makes train_df an independent frame, so the column assignments done
# later by plot_feature do not write into a slice of the unfiltered data.
train_df = train_df[train_df.salary > lower].copy()

In [None]:
def plot_feature(df, col):
    '''
    Plot one feature: sample counts on the left, salary dependence on the right.

    Left panel: number of samples per value of ``col``.
    Right panel: for int64 columns and ``companyId``, the mean salary per value
    with a shaded band of one standard deviation either side; for every other
    column, a box plot of salary per category.

    Side effect: when ``df[col]`` is not int64, the column is replaced on ``df``
    by a categorical whose categories are ordered by ascending mean salary.
    Code in this notebook that later reads ``.cat`` on these columns or tests
    for the category dtype depends on that conversion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col`` and a ``salary`` column. Modified in place as
        described above.
    col : str
        Name of the feature column to plot.
    '''
    plt.figure(figsize = (14, 6))
    plt.subplot(1, 2, 1)
    if df[col].dtype == 'int64':
        df[col].value_counts().sort_index().plot()
    else:
        # Convert to category dtype and order the levels by mean salary.
        # The result is assigned back rather than using inplace=True, which
        # current pandas no longer accepts on reorder_categories.
        mean = df.groupby(col)['salary'].mean()
        levels = mean.sort_values().index.tolist()
        df[col] = df[col].astype('category').cat.reorder_categories(levels)
        df[col].value_counts().plot()
    plt.xticks(rotation=45)
    plt.xlabel(col)
    plt.ylabel('Counts')
    plt.subplot(1, 2, 2)

    if df[col].dtype == 'int64' or col == 'companyId':
        # Mean salary per value, with the (mean - std, mean + std) band shaded.
        salary_by_value = df.groupby(col)['salary']
        mean = salary_by_value.mean()
        std = salary_by_value.std()
        mean.plot()
        plt.fill_between(
            range(len(std.index)),
            mean.values - std.values,
            mean.values + std.values,
            alpha = 0.1,
        )
    else:
        sns.boxplot(x = col, y = 'salary', data=df)

    plt.xticks(rotation=45)
    plt.ylabel('Salaries')
    plt.show()

In [None]:
# Counts and salary by company. For a non-int64 column, plot_feature also
# converts it to a category dtype ordered by mean salary.
plot_feature(train_df, 'companyId')

#### Salary is only weakly associated with the company.

In [None]:
# Counts and salary by job type. For a non-int64 column, plot_feature also
# converts it to a category dtype ordered by mean salary.
plot_feature(train_df, 'jobType')

#### There is a clear positive correlation between job type and salary.

In [None]:
# Counts and salary by degree. For a non-int64 column, plot_feature also
# converts it to a category dtype ordered by mean salary.
plot_feature(train_df, 'degree')

#### More advanced degrees tend to correspond to higher salaries.

In [None]:
# Counts and salary by major. For a non-int64 column, plot_feature also
# converts it to a category dtype ordered by mean salary.
plot_feature(train_df, 'major')

#### People who majored in engineering, business, or math generally have higher salaries.

In [None]:
# Counts and salary by industry. For a non-int64 column, plot_feature also
# converts it to a category dtype ordered by mean salary.
plot_feature(train_df, 'industry')

#### As for industries, oil, finance and web industries generally pay better.

In [None]:
# Counts and mean salary (with a one-std band) by years of experience.
# Assumes this column is int64, so plot_feature leaves it unchanged (TODO confirm dtype).
plot_feature(train_df, 'yearsExperience')

#### In general, there is a clear correlation between salary and years of experience.

In [None]:
# Counts and mean salary (with a one-std band) by distance from the metropolis.
# Assumes this column is int64, so plot_feature leaves it unchanged (TODO confirm dtype).
plot_feature(train_df, 'milesFromMetropolis')

#### In general, salaries decrease as the distance from the metropolis increases.

In [None]:
def encode_label(df, col, source_df=None):
    '''
    Replace each label in ``df[col]`` with the mean salary of that label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``col`` column is overwritten in place with the encoding.
    col : str
        Name of the column to encode.
    source_df : pandas.DataFrame, optional
        Frame the per-label mean salaries are computed from; it must contain
        ``col`` and ``salary``. Defaults to the notebook-level ``train_df``,
        which is what the function always used before this parameter existed.

    Returns
    -------
    None
        ``df`` is modified in place. Labels with no rows in ``source_df``
        become NaN.
    '''
    if source_df is None:
        # Backward-compatible default: the global training frame.
        source_df = train_df
    # One grouped pass over source_df instead of one boolean mask per category.
    label_means = source_df.groupby(col, observed=True)['salary'].mean().to_dict()
    # Mapping a categorical column can return another categorical (with the
    # means as its categories); cast so the encoded column is plain float.
    df[col] = df[col].map(label_means).astype(float)

In [None]:
# Encode every column that currently has the category dtype; skip the rest.
for col in train_df.columns:
    if train_df[col].dtype.name != "category":
        continue
    encode_label(train_df, col)

In [None]:
# Correlations between selected features and response
# jobId is discarded because it is unique for individual
fig = plt.figure(figsize=(12, 10))
features = ['companyId', 'jobType', 'degree', 'major', 'industry', 'yearsExperience', 'milesFromMetropolis']
sns.heatmap(train_df[features + ['salary']].corr(), cmap='Blues', annot=True)
plt.xticks(rotation=45)
plt.show()

We see that jobType is most strongly correlated with salary, followed by degree, major, and yearsExperience.

Among the features themselves, degree and major are strongly correlated with each other, and jobType is moderately correlated with both degree and major.