In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline


## Loading Libraries

In [231]:
#primary libraries python
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns

#SKlearn and some models
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import classification_report,accuracy_score,mean_squared_error,precision_score,recall_score,roc_auc_score
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,AdaBoostClassifier
from sklearn import linear_model
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.utils import resample

#catboost
from catboost import CatBoostClassifier as CBC

#SMOTE and resampling 
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek,SMOTEENN
from imblearn.under_sampling import ClusterCentroids

#xgboost
from xgboost import XGBClassifier as xgbc
import xgboost

lb = LabelEncoder() 
plt.rcParams['figure.figsize'] = (20,8)
pd.options.display.max_columns = 100

### Function to read data

In [3]:
def read_data(locations):
    if locations == None:
        print('Please enter a valid Location for the data')
        return 0
    elif str.find(locations,'.csv') >=0:
        return pd.read_csv(locations,error_bad_lines=False)
        
    elif str.find(locations,'.xlsx') >=0:
        return pd.read_excel(locations,error_bad_lines= False)
    else:
        print('Data type not supported')
        return 0

### Function to remove nan from data

In [4]:
def remove_nan(data, indexes):
    for i in indexes:
        data.loc[np.isnan(data[i]),i] = np.nanmedian(data[i])
    return data

### Function to detect numeric data attribute within a dataset

In [5]:
def detect_numerals(data):
    num = data.dtypes.index[(data.dtypes == 'int64')| (data.dtypes == 'int32')|(data.dtypes == 'float64')].tolist()
    if len(data.isnull().sum()[data.isnull().sum() > 0].index.tolist()) > 0:
        indexes = data.isnull().sum()[data.isnull().sum() > 0].index.tolist()
        data = remove_nan(data,indexes)
    non_num = list()
    for i in num:
        if np.unique(data[i]).shape[0] <= 10:
            non_num.append(i)
        
    return set(num) - set(non_num),data

### Function to plot important features

In [6]:
def plot_imp_feat(cols, feat_imp):
    '''
    col = Column names used to predict the label
    feat_imp = Model feature importance data or matrix
    This function plots the feature importance of the dataset's attributes.
    '''
    feat = pd.DataFrame()
    feat['features'] = cols
    feat['importance'] = feat_imp
    feat = feat.sort_values(by = 'importance')
    
    plt.barh(feat.features,feat.importance)

### Encoding variables

In [7]:
def convert(df, columns):
    for i in columns:
        df[i] = lb.fit_transform(df[i])
    return df

### Function for submission

In [8]:
def make_submission(ids,preds,col_names,name = 'submission.csv'):
    '''
    ids = id column of submission dataset
    preds = predictions made by the model
    col_names = A list of length 2 which consists the column names of submission file
    name = Name of the submission file
    '''
    if len(ids) != len(preds):
        raise ValueError('Ids and predictions lengths are not same')
        
    submission = pd.DataFrame()
    submission[col_names[0]] = ids
    submission[col_names[1]] = preds

    submission.to_csv(f'./{name}', index = False)

### Function to predict missing values through RandomForest

In [9]:
def rf_classifier(x_train,y_train,x_test,n_est= 500, m_dep = 10):
    '''
    x_train: training_data
    y_train: target 
    x_test: testing_data
    n_est: no. of estimators
    m_dep: max_depth
    '''
    rf = RandomForestClassifier(n_estimators= n_est, max_depth = m_dep,n_jobs = 6,random_state = 42,oob_score=True)
    model = rf.fit(x_train,y_train)
    print(rf.score(x_train,y_train))
    return model.predict(x_test) 

def pred_missing(df,pred_col, cols,trans_col):
    
    lb = LabelEncoder()
    x_train = df[df[pred_col].notnull()][cols]
    x_test = df[df[pred_col].isnull()][cols]
    if trans_col != None:
        for i in trans_col:
            x_train[i] = lb.fit_transform(x_train[i].astype(str))
            x_test[i] = lb.fit_transform(x_test[i].astype(str))
        
    y_train = lb.fit_transform(df[df[pred_col].notnull()][pred_col])
    
    
    return lb.inverse_transform(rf_classifier(x_train,y_train,x_test))

### Function to apply SMOTE

In [55]:
def apply_smote(x,y,cols):
    sm = SMOTEENN(random_state = 42,kind_smote='borderline2',kind_enn = 'mode')
    train_x,train_y = sm.fit_sample(x,y)
    train_x = pd.DataFrame(train_x,columns=cols)
    city_development_index = train_x.city_development_index
    train_x = np.round(train_x.drop('city_development_index',1))
    train_x['city_development_index'] = city_development_index
    
    return train_x,train_y

### Function to resample

In [568]:
def resampling(df,length):
    df_majority = df[df.target == 0]
    df_minority = df[df.target == 1]
    
    df_minority_upsampled = resample(df_minority,
                                    replace = True,
                                    n_samples = length,
                                    random_state = 123)
    df_majority = resample(df_majority,
                          replace = True,
                          n_samples = 2*length)
    
    df_upscaled = pd.concat([df_majority, df_minority_upsampled])
    
    return df_upscaled.drop('target',1),df_upscaled.target

### Function to predict on validation

In [569]:
def predict_val(df,target,clf,test_size = 0.2,smote = False,resample = False,length = 10000):
    train_x,valid_x, train_y, valid_y = train_test_split(df,target,test_size = test_size,random_state = 42,shuffle = True)
    
    if smote == True:
        train_x,train_y = apply_smote(train_x,train_y, valid_x.columns)
        valid_x.columns = train_x.columns
    elif resample == True:
        combined_df = train_x.copy()
        combined_df['target'] = train_y
        train_x,train_y= resampling(combined_df,length)
    
    model = clf.fit(train_x,train_y)
    y_preds = model.predict(valid_x)
        
    return valid_y, y_preds,model,train_x


### Function to predict on test

In [170]:
def predict_test(train,target,test,clf,smote = False,resample = False):
    if smote == True:
        train,target = apply_smote(train,target,test.columns)
        test.columns = train.columns
    elif resample == True:
        combined_df = train.copy()
        combined_df['target'] = target
        train,target = resampling(combined_df)
        test.columns = train.columns
        
    model = clf.fit(train,target)
    y_preds = model.predict(test)
    return y_preds,model,train

## Reading the data 

In [12]:
train_set = read_data('./Datasets/Analytics Vidhya Datafest Supremacy/train.csv')
test_set = read_data('./Datasets/Analytics Vidhya Datafest Supremacy/test.csv')

train_set.columns= ['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'enrolled_university_1',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target']
test_set.columns = ['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'enrolled_university_1',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours']

## Converting exp and last_job to numeric

In [13]:
#convert experience to numeric
train_set.loc[train_set.experience == '>20', 'experience'] = '21'
train_set.loc[train_set.experience == '<1', 'experience'] = '0'
train_set.experience = pd.to_numeric(train_set.experience)

test_set.loc[test_set.experience == '>20', 'experience'] = '21'
test_set.loc[test_set.experience == '<1', 'experience'] = '0'
test_set.experience = pd.to_numeric(test_set.experience)


In [14]:
train_set.loc[train_set.experience.isnull(), 'experience'] = int(np.nanmean(train_set.experience))
test_set.loc[test_set.experience.isnull(), 'experience'] = int(np.nanmean(test_set.experience))

In [15]:
#convert last_new_job to numeric
train_set.loc[train_set.last_new_job == '>4', 'last_new_job'] = '5'
train_set.loc[train_set.last_new_job == 'never', 'last_new_job'] = '0'
train_set.last_new_job = pd.to_numeric(train_set.last_new_job)


test_set.loc[test_set.last_new_job == '>4', 'last_new_job'] = '5'
test_set.loc[test_set.last_new_job == 'never', 'last_new_job'] = '0'
test_set.last_new_job = pd.to_numeric(test_set.last_new_job)


## Consider missing values as another category

In [16]:
def new_value(df,cols):
    for i in cols:
        df.loc[df[i].isnull(), i] = 'not null'
    return df

In [17]:
train_temp = train_set.copy()
test_temp = test_set.copy()


In [18]:
cols = ['gender','major_discipline','company_size','company_type']
train_temp = new_value(train_temp,cols)
test_temp = new_value(test_temp,cols)

In [19]:
# imputing missing value for last_new_job
exp_median = [np.median(train_temp[(train_temp.last_new_job.notnull()) & (train_temp.last_new_job == i)].experience) 
              for i in np.sort(train_temp.last_new_job.unique()[:-1])]
train_temp.loc[train_temp.last_new_job.isnull() , 'last_new_job'] = 1
test_temp.loc[test_temp.last_new_job.isnull() , 'last_new_job'] = 1


In [20]:
train_temp = train_temp[(train_temp.enrolled_university.notnull()) & (train_temp.enrolled_university_1.notnull())]

In [21]:
col_for_en_u = ['city_development_index','experience','last_new_job','relevent_experience','company_type','company_size']
cat_cols = ['relevent_experience','company_type','company_size']

# train_temp.loc[train_temp.enrolled_university.isnull(), 'enrolled_university'] = pred_missing(train_temp,
#                                                                                               'enrolled_university',
#                                                                                               col_for_en_u,
#                                                                                               cat_cols)

test_temp.loc[test_temp.enrolled_university.isnull(), 'enrolled_university'] = pred_missing(test_temp,
                                                                                              'enrolled_university',
                                                                                              col_for_en_u,
                                                                                              cat_cols)
# train_temp.loc[train_temp.enrolled_university_1.isnull(), 'enrolled_university_1'] = pred_missing(train_temp,
#                                                                                               'enrolled_university_1',
#                                                                                               col_for_en_u,
#                                                                                               cat_cols)

test_temp.loc[test_temp.enrolled_university_1.isnull(), 'enrolled_university_1'] = pred_missing(test_temp,
                                                                                              'enrolled_university_1',
                                                                                              col_for_en_u,
                                                                                              cat_cols)

0.8169854836521503


  if diff:


0.6814576781074798


  if diff:


## Predictions

In [22]:
converted_train = convert(train_temp,['city', 'gender', 'relevent_experience',
       'enrolled_university', 'enrolled_university_1', 'major_discipline',
        'company_size', 'company_type'])
converted_test = convert(test_temp,['city', 'gender', 'relevent_experience',
       'enrolled_university', 'enrolled_university_1', 'major_discipline',
        'company_size', 'company_type'])

### Xgboost

* We're getting roc_auc_score of 0.50 on stock data with missing value imputation

In [583]:
xgboostmodel = xgbc(random_state= 42, n_jobs=6,scale_pos_weight=7,subsample = 0.8,eta = 0.01)

In [584]:
y_true, y_preds,model,train = predict_val(converted_train.drop(['enrollee_id','target','enrolled_university_1'],1),
                              converted_train.target,xgboostmodel)

roc_auc_score(y_true,y_preds)

  if diff:


0.6391170207353679

In [None]:
plot_imp_feat(converted_train.drop(['enrollee_id','target'],1).columns,model.feature_importances_)

In [581]:
y_preds,model,_ = predict_test(converted_train.drop(['enrollee_id','target'],1),
                              converted_train.target,
                               converted_test.drop(['enrollee_id'],1),
                               xgboostmodel)
make_submission(converted_test.enrollee_id,y_preds,['enrollee_id','target'],'submission_supermacy.csv')

  if diff:


### RandomForest

In [552]:
rfmodel = RandomForestClassifier(n_jobs= 6,random_state=42,class_weight={0:1,1:4})

In [553]:
y_true, y_preds,model,_ = predict_val(converted_train.drop(['enrollee_id','target','enrolled_university_1'],1),
                              converted_train.target,rfmodel)

roc_auc_score(y_true,y_preds)

0.5190724684027298

### Catboost

* Getting .50 on stock data with missing value imputation

In [516]:
catboostmodel = CBC(random_state=42,verbose = False,thread_count = 6,scale_pos_weight=6,
                    iterations=2000,
                    learning_rate=0.01,depth = 4)

y_true, y_preds, model,_ = predict_val(converted_train.drop(['enrollee_id','target'],1),
                                    converted_train.target, catboostmodel)

In [517]:
roc_auc_score(y_true,y_preds)

0.6284582567173724

### Logistic Regression

In [28]:
logisticregressionmodel = LogisticRegression()
y_true, y_preds, model = predict_val(converted_train.drop(['enrollee_id','target'],1),
                                    converted_train.target, logisticregressionmodel)

In [29]:
roc_auc_score(y_true,y_preds)

0.5

## Applying Oversampling (SMOTE)

### Xgboost

In [60]:
y_true,y_preds, model,_ = predict_val(converted_train.drop(['enrollee_id','target'],1),
                                    converted_train.target,xgboostmodel,0.2, True)
roc_auc_score(y_true,y_preds)

  if diff:


0.5293965448321474

### Catboost

In [59]:
y_true,y_preds, model,train_x = predict_val(converted_train.drop(['enrollee_id','target'],1),
                                    converted_train.target,catboostmodel,0.2, True)
roc_auc_score(y_true,y_preds)



0.5024721820467908

## New Features

In [558]:
train_new = converted_train.copy()
test_new = converted_test.copy()


In [559]:
train_new['dev_ind_range'] = pd.cut(train_new.city_development_index,bins= 30)
train_new['training_hours_range'] = pd.cut(train_new.training_hours, bins = 40)
train_new = convert(train_new,['dev_ind_range','training_hours_range'])

test_new['dev_ind_range'] = pd.cut(test_new.city_development_index,bins = 30)
test_new['training_hours_range'] = pd.cut(test_new.training_hours, bins = 40)
test_new = convert(test_new, ['dev_ind_range','training_hours_range'])

In [560]:
def new_city(df):
    city_pop = pd.DataFrame()
    city_pop['city'] = df.city.unique()
    city_pop['city_population'] = [len(df[df.city == i]) for i in df.city.unique()]
    city_pop['avg_experience'] = [np.median(df[df.city == i].experience) for i in df.city.unique()]
    city_pop['avg_training_hours'] = [(np.median(df[df.city == i].training_hours)) for i in df.city.unique()]
    df = df.merge(city_pop, on= 'city', how = 'inner')
    
    return df

In [561]:
train_new = new_city(train_new)
test_new = new_city(test_new)

In [563]:
xgbmodel = xgbc(random_state = 42, n_jobs = 6,scale_pos_weight=8,subsample = 0.95)

y_true,y_preds, model, _ = predict_val(train_new.drop(['enrollee_id','target'],1),
                                      train_new.target,xgbmodel)
roc_auc_score(y_true,y_preds)

  if diff:


0.6433348161843926

In [566]:
y_preds,model,_ = predict_test(train_new.drop(['enrollee_id','target'],1),
                              train_new.target,
                               test_new.drop(['enrollee_id'],1),
                               xgbmodel)

make_submission(converted_test.enrollee_id,y_preds,['enrollee_id','target'],'submission_supermacy.csv')

  if diff:


In [567]:
pd.value_counts(y_preds)

0    8389
1    6632
dtype: int64

In [480]:
pd.crosstab(
    pd.Series(y_true, name = 'Actual'),
    pd.Series(y_preds, name = 'Predictions'),
    margins = True
)

Predictions,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1496,1113,2609
1,213,183,396
All,1709,1296,3005
