In [1]:
import numpy as np
import pandas as pd

import joblib

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import warnings
warnings. filterwarnings('ignore')

In [2]:
df = pd.read_csv("survey_results_public.csv")
df = df.rename({"ConvertedCompYearly": "Salary"}, axis=1)
df = df[["Country", "EdLevel", "YearsCodePro","Employment", "Salary"]]
df

Unnamed: 0,Country,EdLevel,YearsCodePro,Employment,Salary
0,,,,,
1,Canada,,,"Employed, full-time",
2,United Kingdom of Great Britain and Northern I...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",5,"Employed, full-time",40205.0
3,Israel,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",17,"Employed, full-time",215232.0
4,United States of America,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",3,"Employed, full-time",
...,...,...,...,...,...
73263,Nigeria,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",5,"Employed, full-time",
73264,United States of America,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",5,"Employed, full-time",
73265,United States of America,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",33,"Employed, full-time",
73266,United Kingdom of Great Britain and Northern I...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",31,"Employed, full-time",


In [3]:
df = df.dropna()
df.isnull().sum()

Country         0
EdLevel         0
YearsCodePro    0
Employment      0
Salary          0
dtype: int64

In [4]:
df = df.drop("Employment", axis=1)

In [5]:
def shorten_categories(categories, cutoff):
    categorical_map = {}
    for i in range(len(categories)) :
        if categories.values[i] >= cutoff :
            categorical_map[categories.index[i]] = categories.index[i]
        else :
            categorical_map[categories.index[i]] = 'Other'
    return categorical_map

In [6]:
country_map = shorten_categories(df.Country.value_counts(), 199)
df['Country'] = df['Country'].map(country_map)
df.Country.value_counts()

United States of America                                8684
Other                                                   4355
Germany                                                 2901
United Kingdom of Great Britain and Northern Ireland    2649
India                                                   2154
Canada                                                  1478
France                                                  1372
Brazil                                                  1279
Spain                                                    992
Poland                                                   973
Netherlands                                              897
Australia                                                830
Italy                                                    807
Sweden                                                   664
Russian Federation                                       533
Switzerland                                              520
Turkey                  

In [7]:
def clean_experience(x):
    if x ==  'More than 50 years':
        return 50
    if x == 'Less than 1 year':
        return 0.5
    return float(x)

df['YearsCodePro'] = df['YearsCodePro'].apply(clean_experience)

In [8]:
def clean_education(x):
    if 'Bachelor’s degree' in x:
        return 'Bachelor’s degree'
    if 'Master’s degree' in x:
        return 'Master’s degree'
    if 'Professional degree' in x or 'Other doctoral' in x:
        return 'Post grad'
    return 'Less than a Bachelors'

df['EdLevel'] = df['EdLevel'].apply(clean_education)

In [9]:
df = df[df['Salary'] <= 10000]
df = df[df['Salary'] <= 300000]

In [10]:
numeric_pipeline = Pipeline([('Scaler', StandardScaler())])
categorical_pipeline = Pipeline([('Encoder', OneHotEncoder())])
transformer = ColumnTransformer([('num', numeric_pipeline, ['YearsCodePro']), ('cat', categorical_pipeline, ['Country','EdLevel'])])

In [11]:
X = df.drop('Salary', axis=1)
y = df['Salary']

In [12]:
df

Unnamed: 0,Country,EdLevel,YearsCodePro,Salary
45,Other,Master’s degree,3.0,5124.0
268,Other,Bachelor’s degree,3.0,8244.0
486,Other,Bachelor’s degree,1.0,6216.0
552,Other,Post grad,3.0,7908.0
645,Bangladesh,Bachelor’s degree,6.0,8184.0
...,...,...,...,...
73020,"Iran, Islamic Republic of...",Master’s degree,2.0,2832.0
73047,France,Master’s degree,25.0,7932.0
73073,Other,Bachelor’s degree,2.0,5412.0
73075,India,Bachelor’s degree,4.0,1860.0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=45)
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)
model = LinearRegression(n_jobs=-1)
model.fit(X_train,y_train)

In [14]:
y_hat = model.predict(X_test)
error = np.sqrt(mean_squared_error(y_test, y_hat))
print('${:,.02f}'.format(error))

$2,886.81


In [15]:
joblib.dump(model,'LinearRegression.joblib')

['LinearRegression.joblib']