In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

In [21]:
# Load the unlabelled test set and the labelled training set from ./data.
# NOTE(review): the truncated warning lines in this cell's saved output look like
# pandas' mixed-dtype warning -- confirm, and consider passing explicit dtypes.
test_df = pd.read_csv('./data/testset.csv')
train_df = pd.read_csv('./data/trainingset.csv')
train_df.head()

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,cntryid,cntryid_e,age_r,gender_r,computerexperience,nativespeaker,edlevel3,monthlyincpr,yearlyincpr,lng_home,...,v85,v50,v89,v127,v239,v224,v71,v105,row,uni
0,Russian Federation,Russian Federation,42.0,Female,Yes,Yes,High,25 to less than 50,25 to less than 50,999,...,Agree,Agree,,,233.0,9996.0,8521,9996.0,95754,jw3580
1,United States,United States,,Male,Yes,Yes,High,75 to less than 90,75 to less than 90,999,...,Agree,Neither agree nor disagree,,,9999.0,9996.0,9999,9996.0,57367,jw3580
2,United States,United States,,Female,Yes,Yes,High,,,999,...,Strongly agree,Strongly agree,,,9996.0,9999.0,9996,9999.0,195554,jw3580
3,Czech Republic,Czech Republic,23.0,Female,Yes,Yes,Medium,10 to less than 25,10 to less than 25,ces,...,Agree,Agree,,,3311.0,9996.0,6419,9996.0,42148,jw3580
4,Canada,Canada (English),,Male,Yes,Yes,Medium,,,999,...,Disagree,Agree,,,9996.0,9999.0,9999,9999.0,86076,jw3580


# Sampling with Replacement for Gender Bias Reduction

In [95]:
# Gender counts in the training set (ordered by frequency, most common first)
vc_train = train_df.loc[:, 'gender_r'].value_counts()
vc_train

Male      12495
Female     7505
Name: gender_r, dtype: int64

In [96]:
# Gender counts in the test set (ordered by frequency, most common first)
vc_test = test_df.loc[:, 'gender_r'].value_counts()
vc_test

Female    10111
Male       9889
Name: gender_r, dtype: int64

In [97]:
# Number of rows in the test set
test_df.shape[0]

20000

In [99]:
# Find importance weights: each gender's share in the test set divided by its
# share in the training set.
# Counts are looked up by label, not by position: value_counts() orders its
# result by frequency, so position 0 is 'Male' in train but 'Female' in test.
w_m = (vc_test['Male'] / len(test_df)) / (vc_train['Male'] / len(train_df))
w_f = (vc_test['Female'] / len(test_df)) / (vc_train['Female'] / len(train_df))
w_m, w_f

(0.791436574629852, 1.3472351765489676)

In [101]:
# Normalise the two gender weights so that they sum to one
sum_of_w = w_m + w_f
P_m, P_f = w_m / sum_of_w, w_f / sum_of_w
(P_m, P_f)

(0.37005986271320884, 0.6299401372867911)

In [112]:
# Build the per-row sampling weights in one vectorised step: P_m for rows whose
# gender is 'Male', P_f for every other row (as before, that includes rows where
# gender is missing). np.where yields a float Series directly, instead of
# rewriting a boolean Series in place twice.
weights = pd.Series(
    np.where(train_df['gender_r'] == 'Male', P_m, P_f),
    index=train_df.index,
    name='gender_r',
)
weights.shape

(20000,)

In [113]:
# Resample the training set with replacement, using the gender weights, so that
# its gender mix matches the test set. The sample size is taken from the frame
# itself rather than hard-coded, so the set keeps its size if the input changes.
# NOTE(review): this overwrites train_df, so re-running the cell resamples the
# already-resampled frame; restart and run all to reproduce the results.
# NOTE(review): rows duplicated here can land on both sides of the later
# train_test_split, which would make the hold-out MSE look better than it is.
train_df = train_df.sample(n=len(train_df), replace=True, weights=weights,
                           random_state=42)
train_df.head()

Unnamed: 0,cntryid,cntryid_e,age_r,gender_r,computerexperience,nativespeaker,edlevel3,monthlyincpr,yearlyincpr,lng_home,...,v85,v50,v89,v127,v239,v224,v71,v105,row,uni
7502,Russian Federation,Russian Federation,28.0,Female,Yes,Yes,High,25 to less than 50,25 to less than 50,999,...,Agree,Agree,,,1222.0,9996.0,731,9996.0,93061,jw3580
19002,Japan,Japan,35.0,Male,Yes,Yes,High,50 to less than 75,50 to less than 75,jpn,...,Strongly agree,Agree,,,4313.0,9996.0,7820,9996.0,7994,jw3580
14614,Italy,Italy,37.0,Male,Yes,Yes,Medium,75 to less than 90,75 to less than 90,999,...,Neither agree nor disagree,Agree,,,7543.0,9996.0,3020,9996.0,163582,jw3580
11968,United States,United States,,Male,Yes,Yes,High,50 to less than 75,50 to less than 75,999,...,Agree,Agree,,,9999.0,9996.0,9999,9996.0,156617,jw3580
3130,Russian Federation,Russian Federation,38.0,Female,Yes,Yes,High,Less than 10,Less than 10,999,...,Strongly agree,Strongly agree,,,1345.0,9996.0,851,9996.0,53585,jw3580


In [114]:
# Check the gender counts after resampling: the gender bias should be removed
train_df.loc[:, 'gender_r'].value_counts()

Female    10141
Male       9859
Name: gender_r, dtype: int64

In [115]:
# Drop 'uni' and 'v262' because the two columns have no information
train_df = train_df.drop(columns=['uni', 'v262'])
test_df = test_df.drop(columns=['uni', 'v262'])

# NaN processing: mean imputation, each frame filled with its own column means.
# numeric_only=True is required on pandas >= 2.0, where mean() raises on string
# columns instead of silently skipping them.
train_df = train_df.fillna(train_df.mean(numeric_only=True))
test_df = test_df.fillna(test_df.mean(numeric_only=True))

# Pick out categorical (object-dtype) columns and remove them from the numeric frames.
# The builtin `object` is used because the `np.object` alias was removed from NumPy.
train_categ_df = train_df.select_dtypes(include=[object])
train_df = train_df.drop(columns=train_categ_df.columns)
test_categ_df = test_df.select_dtypes(include=[object])
test_df = test_df.drop(columns=test_categ_df.columns)

# One-hot encode (dummy_na=True also adds a missing-value indicator per column)
train_categ_df = pd.get_dummies(train_categ_df, dummy_na=True)
test_categ_df = pd.get_dummies(test_categ_df, dummy_na=True)
train_df = pd.concat([train_df, train_categ_df], axis=1)
test_df = pd.concat([test_df, test_categ_df], axis=1)

# Divide into X and Y (for submission)
X_train_final = train_df.drop(columns='job_performance')  # Fit on train_final's
y_train_final = train_df['job_performance']
# Train and test are encoded independently, so their dummy columns can differ.
# Align the test features to the training columns (columns missing from test are
# filled with 0, test-only columns are dropped) so that a model fit on
# X_train_final can predict on X_test_final for submission.
X_test_final = (
    test_df
    .drop(columns='job_performance')
    .reindex(columns=X_train_final.columns, fill_value=0)
)
y_test_final = test_df['job_performance']  # Empty

# Data Exploration for Biases

In [116]:
# One-hot columns that exist in only one of the two encoded frames
diff_cols1 = set(train_categ_df.columns) - set(test_categ_df.columns)

diff_cols2 = set(test_categ_df.columns) - set(train_categ_df.columns)

diff_cols = diff_cols1 | diff_cols2

# Columns present in both frames, in train's order. The same list indexes both
# frames, so row i below always refers to the same feature in train and test.
common_cols = [col for col in train_categ_df.columns if col not in diff_cols]

# For every shared dummy column store [share of 0s, share of 1s].
# The shares are computed explicitly rather than stacked from value_counts():
# value_counts() orders by frequency and omits values that never occur, so its
# results can differ in order or length between train and test.
train_ones = train_categ_df[common_cols].mean().to_numpy(dtype=float)
test_ones = test_categ_df[common_cols].mean().to_numpy(dtype=float)
train_categ_values = np.column_stack([1.0 - train_ones, train_ones])
test_categ_values = np.column_stack([1.0 - test_ones, test_ones])

# Absolute train-vs-test difference per column and per value
diffs = np.abs(train_categ_values - test_categ_values)

# (position in common_cols, mean absolute difference), largest difference first
diffs_avg = list(enumerate(diffs.mean(axis=1)))
diffs_avg.sort(key=lambda tup: tup[1], reverse=True)

# Integer positions of the 10 columns that differ most between train and test
indices = np.array([i for i, _ in diffs_avg[:10]], dtype=int)

# for index in indices:
#     print(diffs[int(index)])

For now, there does not seem to be a very significant difference between the train set and the test set. The code above ranks the one-hot-encoded features by how much their distribution differs between the two sets (keeping the top 10 as currently written), but the top entries are only NaN indicators and survey answers. Gender differs by around 0.20, which is quite significant, so I apply a sampling-with-replacement correction for gender only.

# Pipeline for Model Selection and Tuning
Split `train_df` into a train subset and a held-out test subset, because only `train_df` has true labels against which accuracy can be checked.

In [118]:
from sklearn.model_selection import train_test_split

# Target is 'job_performance'; every other column of train_df is a feature
df = train_df[:]
y = train_df['job_performance']
X = train_df.drop(columns='job_performance')
print(X.shape, y.shape)

(20000, 3176) (20000,)


In [119]:
# Hold out 30% of the labelled rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)
(X_train.shape, y_test.shape)

((14000, 3176), (6000,))

In [120]:
def evaluate_MSE(MSE, *, near_upper=164100, near_lower=44900, unit=1950):
    '''Print where an MSE falls relative to the HW5 grading thresholds.

    The default thresholds are the values this notebook takes from the HW5
    problem statement; they can be overridden by keyword.

    Args:
        MSE: mean squared error to evaluate.
        near_upper: an MSE strictly above this value is reported as exceeding
            the upper bound of the 90% range.
        near_lower: an MSE strictly above this value (and not above
            ``near_upper``) is reported as within the 90% range; an MSE at or
            below it is reported as qualifying for EC.
        unit: MSE improvement below ``near_lower`` that counts as one EC
            percent; the reported percentage is truncated to an integer.

    Returns:
        None. The verdict is printed.

    Raises:
        ValueError: if ``MSE`` is NaN, which cannot be ranked against the
            thresholds.
    '''
    import math

    # Every comparison with NaN is False, so without this check a NaN would
    # fall through to the EC branch and fail inside int() with an opaque message.
    if math.isnan(MSE):
        raise ValueError('MSE is NaN; cannot evaluate it against the grading thresholds')

    if MSE > near_upper:
        print(f'MSE of {MSE} exceeds the upper bound for 90% range')
    elif MSE > near_lower:
        print(f'MSE of {MSE} is within 90% range for grade')
    else:
        ec = int((near_lower - MSE) / unit)
        print(f'MSE of {MSE} qualifies for {ec}% of EC')

# Create Baseline Model
Basic OLS regression

In [121]:
from sklearn.linear_model import LinearRegression

# Fit the ordinary least squares baseline and report its score on the training split
reg = LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_train, y_train)

0.7773901659430359

In [122]:
# Predict on the held-out split; y_true is the full slice of the held-out labels
y_pred = reg.predict(X_test)
y_true = y_test[:]
y_pred.shape

(6000,)

In [123]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the baseline on the held-out split
MSE = mean_squared_error(y_true=y_true, y_pred=y_pred)
MSE

81707.97250223329

In [124]:
# Print where the baseline's held-out MSE falls relative to the HW5 thresholds
evaluate_MSE(MSE)

MSE of 81707.97250223329 is within 90% range for grade
