In [123]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 

In [124]:
# Read the cleaned salary dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer='salary_data_clean.csv')

In [125]:
# List the names of the columns that contain at least one NaN value
# (MinSalary and AvgSalary in the recorded run)

[name for name, has_nan in df.isna().any().items() if has_nan]

['MinSalary', 'AvgSalary']

In [126]:
# Display only the columns that contain at least one NaN value
nan_columns = df.isna().any(axis=0)
df.loc[:, nan_columns]

Unnamed: 0,MinSalary,AvgSalary
0,58.0,74.0
1,58.0,74.0
2,58.0,74.0
3,58.0,74.0
4,58.0,74.0
...,...,...
75,80.0,93.5
76,80.0,93.5
77,80.0,93.5
78,80.0,93.5


In [127]:
# Dimensions (rows, columns) of the frame before rows with missing values are dropped
df.shape

(80, 30)

In [128]:
# Keep only complete rows: a row with a NaN in any column is discarded,
# so the model is fitted exclusively on fully observed records

df = df.dropna(axis=0, how='any')

In [129]:
# Dimensions (rows, columns) after dropping the rows that contained missing values
df.shape

(50, 30)

In [130]:
# Select the target (AvgSalary) together with the feature columns used for modelling;
# rows containing NaN values have already been removed from df

model_columns = [
    'AvgSalary',
    'Rating',
    'Size',
    'Type of ownership',
    'Industry',
    'Sector',
    'Revenue',
    'Hourly',
    'Province',
    'Age',
    'Python',
    'Spark',
    'AWS',
    'Excel',
    'JobSimplified',
    'Seniority',
    'DescLength',
]
df_model = df[model_columns]

In [131]:
# Get dummy variables for the model (either OneHotEncoder or get_dummies works).
# dtype=float is requested explicitly: pandas >= 2.0 emits boolean dummy columns by
# default, and a frame that mixes bool and numeric columns is cast to an object
# array, which statsmodels' OLS refuses to fit further down.

df_dum = pd.get_dummies(df_model, dtype=float)

In [132]:
# Train Test Split into 80/20

from sklearn.model_selection import train_test_split

In [133]:
# Separate the predictors (X) from the target (y)

X = df_dum.drop(columns=['AvgSalary'])
y = df_dum['AvgSalary'].values

In [134]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Multiple Linear Regression Model

In [135]:
import statsmodels.api as sm

In [136]:
# Fit an ordinary least squares model with statsmodels on the full X / y.
# add_constant adds the intercept column; the result is bound only to X_sm so that
# X keeps exactly the feature columns that X_train / X_test were split from
# (the previous chained assignment also rebound X, leaking `const` into it).
# NOTE(review): the recorded summary reports 50 observations, 48 model degrees of
# freedom and an R-squared of 1.0 - the fit is saturated, so interpret the
# coefficients and p-values with caution.

X_sm = sm.add_constant(X)
model = sm.OLS(y, X_sm)
# Keep the fitted results so they can be inspected again without refitting
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,2.481e+21
Date:,"Thu, 10 Sep 2020",Prob (F-statistic):,1.59e-11
Time:,15:13:23,Log-Likelihood:,1144.6
No. Observations:,50,AIC:,-2191.0
Df Residuals:,1,BIC:,-2097.0
Df Model:,48,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,48.1140,2.88e-10,1.67e+11,0.000,48.114,48.114
Rating,-2.5425,7.69e-11,-3.31e+10,0.000,-2.543,-2.543
Hourly,2.609e-11,6.01e-22,4.34e+10,0.000,2.61e-11,2.61e-11
Age,-0.2416,6.71e-12,-3.6e+10,0.000,-0.242,-0.242
Python,0.7729,3.19e-10,2.42e+09,0.000,0.773,0.773
Spark,29.8339,7.24e-10,4.12e+10,0.000,29.834,29.834
AWS,15.5792,4.93e-10,3.16e+10,0.000,15.579,15.579
Excel,-15.5581,1.67e-10,-9.31e+10,0.000,-15.558,-15.558
DescLength,-3.111e-06,6.27e-14,-4.96e+07,0.000,-3.11e-06,-3.11e-06

0,1,2,3
Omnibus:,15.18,Durbin-Watson:,1.573
Prob(Omnibus):,0.001,Jarque-Bera (JB):,16.944
Skew:,-1.239,Prob(JB):,0.000209
Kurtosis:,4.412,Cond. No.,1e+16
