# Motivation

&#9680; There will be a <font color=green><u>**HackerEarth Test Link**</u></font>

&#9680; Test format is as follows :
    
    - 12 MCQs
    - 1 Predictive Modeling 
    
&#9680; This notebook consist of my code on Predictive modeling Assignment 

&#9680; [Test Link](https://www.hackerearth.com/challenges/test/makemytrip-ds-aug/?login=a97b96470e7e5900dd2afbe1179c79a3)

# Load Libraries 

In [2]:
from sklearn import metrics
import numpy as np
import pandas as pd
import re
import nltk
import spacy
import string
from nltk.corpus import stopwords
from collections import Counter
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
import xgboost as xgb
import swifter
import warnings
import matplotlib.pyplot as plt
import pycm
import pandas as pd
import numpy as np
import warnings
import os
import re
import math
import time
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option('max_colwidth', 999)
pd.set_option('display.max_columns', 999)
pd.set_option("display.max_rows", 999)

# Tools & Utilities 

## Pre-Processing 

In [4]:
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    PUNCT_TO_REMOVE = string.punctuation
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    STOPWORDS = set(stopwords.words('english'))
    return " ".join(
        [word for word in str(text).split() if word not in STOPWORDS])
def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    wordnet_map = {
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "J": wordnet.ADJ,
        "R": wordnet.ADV
    }
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([
        lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
        for word, pos in pos_tagged_text
    ])

In [5]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""
def chat_words_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_words_list:
            new_text.append(chat_words_map_dict[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [6]:
def correct_spellings(text):
    spell = SpellChecker()
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [7]:
def pre_process(input_df, column):
    input_df[column] = input_df[column].astype(str)
    input_df["pre_processed_" + str(column)] = input_df[column].str.lower()
    input_df["pre_processed_" +
             str(column)] = input_df["pre_processed_clean_hm"].swifter.apply(
                 lambda text: remove_punctuation(text))
    input_df["pre_processed_" + str(column)] = input_df[
        "pre_processed_" +
        str(column)].swifter.apply(lambda text: remove_stopwords(text))
    input_df["pre_processed_" + str(column)] = input_df[
        "pre_processed_" +
        str(column)].swifter.apply(lambda text: lemmatize_words(text))
    input_df["pre_processed_" + str(column)] = input_df[
        "pre_processed_" +
        str(column)].swifter.apply(lambda text: chat_words_conversion(text))
    return input_df

## Feature Engineering 

In [8]:
def entropy(s):
    l = float(len(s))
    return (-sum(map(lambda a: (a / l) * math.log2(a / l),
                     Counter(s).values())))

In [9]:
pos_family = {
    'noun': ['NN', 'NNS', 'NNP', 'NNPS'],
    'pron': ['PRP', 'PRP$', 'WP', 'WP$'],
    'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
    'adj': ['JJ', 'JJR', 'JJS'],
    'adv': ['RB', 'RBR', 'RBS', 'WRB']
}


def check_pos_tag(x, flag):
    cnt = 0
    try:
        k = pd.Series([
            list(x)[1] for x in nltk.pos_tag(nltk.word_tokenize(str(x)))
        ]).value_counts()
        for i in pos_family[flag]:
            if i in k.index.tolist():
                cnt = cnt + k[i]
    except:
        pass
    return cnt

In [10]:
def extract_density_based_features(data, column='pre_processed_clean_hm'):
    feature_df = pd.DataFrame()

    feature_df['char_count'] = data[column].apply(len)
    feature_df['content_word_count'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: len(str(x).split(" ")))
    feature_df['noun_count'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: check_pos_tag(x, 'noun'))
    feature_df['verb_count'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: check_pos_tag(x, 'verb'))
    feature_df['adj_count'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: check_pos_tag(x, 'adj'))
    feature_df['adv_count'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: check_pos_tag(x, 'adv'))
    feature_df['pron_count'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: check_pos_tag(x, 'pron'))
    feature_df['word_density'] = feature_df['char_count'] / (
        feature_df['content_word_count'] + 1)
    feature_df['avg_word_length'] = feature_df['content_word_count'] / \
        (feature_df['char_count']+1)
    feature_df['noun_density'] = feature_df['noun_count'] / (
        feature_df['content_word_count'] + 1)
    feature_df['verb_density'] = feature_df['verb_count'] / (
        feature_df['content_word_count'] + 1)
    feature_df['adj_density'] = feature_df['adj_count'] / (
        feature_df['content_word_count'] + 1)
    feature_df['adv_density'] = feature_df['adv_count'] / (
        feature_df['content_word_count'] + 1)
    feature_df['pron_density'] = feature_df['pron_count'] / (
        feature_df['content_word_count'] + 1)
    feature_df['avg_pos_density'] = (feature_df[[
        'noun_density', 'verb_density', 'adj_density', 'adv_density',
        'pron_density'
    ]].sum(axis=1) / 5)
    return feature_df

In [11]:
def add_static_column_features(data, column='cleaned_hm'):
    feature_df = pd.DataFrame()

    feature_df['no_of_dots'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: str(x).count('.')).copy()
    feature_df['no_of_hyphen'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: str(x).count('-')).copy()
    feature_df['no_of_underscore'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: str(x).count('_')).copy()
    feature_df['no_of_equal'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: str(x).count('=')).copy()
    feature_df['no_of_forward_slash'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: str(x).count('/')).copy()
    feature_df['no_of_question_marks'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: str(x).count('?')).copy()
    feature_df['no_of_semicolon'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: str(x).count(';')).copy()
    feature_df['no_of_open_parenthesis'] = data[
        column].swifter.set_npartitions(8).apply(
            lambda x: str(x).count('(')).copy()
    feature_df['no_of_close_parenthesis'] = data[
        column].swifter.set_npartitions(8).apply(
            lambda x: str(x).count(')')).copy()
    feature_df['no_of_mod'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: str(x).count('%')).copy()
    feature_df['no_of_ampersand'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: str(x).count('&')).copy()
    feature_df['no_of_@'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: str(x).count('@')).copy()
    feature_df['no_of_double_slash'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: str(x).count('//')).copy()
    feature_df['no_of_digits'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: sum(c.isdigit() for c in x)).copy()
    feature_df['column_entropy'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: entropy(x)).copy()
    feature_df['no_of_characters'] = data[column].swifter.set_npartitions(
        8).apply(len)
    feature_df['no_of_tokens'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: len(str(x).split()))
    feature_df['no_of_backslash'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: str(x).count('/'))
    feature_df['no_of_double_backslash'] = data[
        column].swifter.set_npartitions(8).apply(lambda x: str(x).count('//'))
    feature_df['no_of_open_sq_bracket'] = data[column].swifter.set_npartitions(
        8).apply(lambda x: str(x).count('['))
    feature_df['no_of_close_sq_bracket'] = data[
        column].swifter.set_npartitions(8).apply(lambda x: str(x).count(']'))
    feature_df['no_of_open_curly_brace'] = data[
        column].swifter.set_npartitions(8).apply(lambda x: str(x).count('{'))
    feature_df['no_of_close_curly_brace'] = data[
        column].swifter.set_npartitions(8).apply(lambda x: str(x).count('}'))
    feature_df['no_of_dollor'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: str(x).count('$'))
    feature_df['no_of_open_curly_brace'] = data[
        column].swifter.set_npartitions(8).apply(lambda x: str(x).count('{'))
    feature_df['no_of_plus'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: str(x).count('+'))
    feature_df['no_of_special_character'] = data[
        column].swifter.set_npartitions(8).apply(
            lambda x: int(len(str(x) - len(re.findall('[\w]', str(x))))))
    feature_df['no_of_A_Z'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: len(re.findall(r'[A-Z]', str(x))))
    feature_df['no_of_a_z'] = data[column].swifter.set_npartitions(8).apply(
        lambda x: len(re.findall(r'[a-z]', str(x))))
    feature_df['came_case_ratio'] = feature_df['no_of_A_Z'] / (
        feature_df['no_of_a_z'] + 1)
    return feature_df

# Load Data

In [18]:
train = pd.read_csv("C:\\Users\\Zeus\\Downloads\\Road_To_Glory\\MakeMyTrip\\bff5c81a058811ec\\dataset\\train.csv")
test  = pd.read_csv("C:\\Users\\Zeus\\Downloads\\Road_To_Glory\\MakeMyTrip\\bff5c81a058811ec\\dataset\\test.csv")

## Data Understanding 

In [19]:
print(train.shape)
print(train.columns)
train.head()

(73147, 16)
Index(['id', 'program_id', 'program_type', 'program_duration', 'test_id',
       'test_type', 'difficulty_level', 'invigilator', 'gender', 'education',
       'city_tier', 'age', 'total_programs_enrolled', 'is_handicapped',
       'invigilator_engagement_rating', 'is_pass'],
      dtype='object')


Unnamed: 0,id,program_id,program_type,program_duration,test_id,test_type,difficulty_level,invigilator,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,invigilator_engagement_rating,is_pass
0,9389_150,Y_1,Y,136,150,offline,intermediate,9389,M,High school (10th grade),3,24.0,5,N,1.0,0
1,16523_44,T_1,T,131,44,offline,easy,16523,F,High School (12th grade),4,26.0,2,N,3.0,1
2,13987_178,Z_2,Z,120,178,online,easy,13987,M,High school (10th grade),1,40.0,1,N,2.0,1
3,13158_32,T_2,T,117,32,offline,easy,13158,F,High school (10th grade),3,,4,N,1.0,1
4,10591_84,V_3,V,131,84,offline,intermediate,10591,F,High School (12th grade),1,42.0,2,N,4.0,1


In [20]:
print(test.shape)
print(test.columns)
test.head()

(31349, 15)
Index(['id', 'program_id', 'program_type', 'program_duration', 'test_id',
       'test_type', 'difficulty_level', 'invigilator', 'gender', 'education',
       'city_tier', 'age', 'total_programs_enrolled', 'is_handicapped',
       'invigilator_engagement_rating'],
      dtype='object')


Unnamed: 0,id,program_id,program_type,program_duration,test_id,test_type,difficulty_level,invigilator,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,invigilator_engagement_rating
0,1626_45,T_1,T,131,45,offline,Medium,1626,F,High school (10th grade),3,46.0,2,N,4.0
1,11020_130,Y_3,Y,135,130,online,Easy,11020,M,Bachelor's,3,,4,N,4.0
2,12652_146,Y_2,Y,120,146,online,Easy,12652,M,High school (10th grade),3,,2,N,3.0
3,7038_72,V_4,V,122,72,offline,Very hard,7038,F,High School (12th grade),1,,2,N,2.0
4,888_71,V_4,V,122,71,offline,Medium,888,F,High school (10th grade),3,,2,N,2.0


## Data Pre-Processing 

In [21]:
train.dtypes

id                                object
program_id                        object
program_type                      object
program_duration                   int64
test_id                            int64
test_type                         object
difficulty_level                  object
invigilator                        int64
gender                            object
education                         object
city_tier                          int64
age                              float64
total_programs_enrolled            int64
is_handicapped                    object
invigilator_engagement_rating    float64
is_pass                            int64
dtype: object

In [22]:
train.isnull().sum()

id                                   0
program_id                           0
program_type                         0
program_duration                     0
test_id                              0
test_type                            0
difficulty_level                     0
invigilator                          0
gender                               0
education                            0
city_tier                            0
age                              27729
total_programs_enrolled              0
is_handicapped                       0
invigilator_engagement_rating       77
is_pass                              0
dtype: int64

In [46]:
test.isnull().sum()

id                                   0
program_id                           0
program_type                         0
program_duration                     0
test_id                              0
test_type                            0
difficulty_level                     0
invigilator                          0
gender                               0
education                            0
city_tier                            0
age                              11791
total_programs_enrolled              0
is_handicapped                       0
invigilator_engagement_rating       31
dtype: int64

In [23]:
train.is_pass.value_counts()

1    50867
0    22280
Name: is_pass, dtype: int64

In [26]:
train.apply(lambda x : len(np.unique(x)))

id                               73147
program_id                          22
program_type                         7
program_duration                    10
test_id                            188
test_type                            2
difficulty_level                     4
invigilator                      18500
gender                               2
education                            5
city_tier                            4
age                              27774
total_programs_enrolled             13
is_handicapped                       2
invigilator_engagement_rating       82
is_pass                              2
dtype: int64

In [36]:
pd.crosstab(train.is_handicapped,train.is_pass)

is_pass,0,1
is_handicapped,Unnamed: 1_level_1,Unnamed: 2_level_1
N,19923,46654
Y,2357,4213


# Feature Engineering 

In [157]:
feature_df_train = pd.DataFrame()
feature_df_test = pd.DataFrame()

In [158]:
feature_df_train['p_type'] = pd.Series(
    np.where(
        train.program_type == 'Y', 0,
        np.where(
            train.program_type == 'T', 1,
            np.where(
                train.program_type == 'Z', 2,
                np.where(
                    train.program_type == 'V', 3,
                    np.where(train.program_type == 'U', 4,
                             np.where(train.program_type == 'X', 5, 6)))))))

In [159]:
feature_df_test['p_type'] = pd.Series(
    np.where(
        test.program_type == 'Y', 0,
        np.where(
            test.program_type == 'T', 1,
            np.where(
                test.program_type == 'Z', 2,
                np.where(
                    test.program_type == 'V', 3,
                    np.where(test.program_type == 'U', 4,
                             np.where(test.program_type == 'X', 5, 6)))))))

In [160]:
feature_df_train['program_duration'] = train.program_duration
feature_df_test['program_duration'] =test.program_duration
feature_df_train['t_type'] = train.test_type.apply(lambda x: 1 if str(x)=='offline' else 0)
feature_df_test['t_type'] = test.test_type.apply(lambda x: 1 if str(x)=='offline' else 0)

In [161]:
feature_df_train['d_level'] = np.where(
    train.difficulty_level == 'easy', 0,
    np.where(train.difficulty_level == 'intermediate', 1,
             np.where(train.difficulty_level == 'hard', 2, 3)))

In [162]:
feature_df_test['d_level'] = np.where(
    test.difficulty_level == 'easy', 0,
    np.where(test.difficulty_level == 'intermediate', 1,
             np.where(test.difficulty_level == 'hard', 2, 3)))

In [163]:
feature_df_train['g'] = train.gender.apply(lambda x: 1 if str(x)=='F' else 0)
feature_df_test['g'] = test.gender.apply(lambda x: 1 if str(x)=='F' else 0)

In [164]:
feature_df_train['e_level'] = np.where(
    train.education == 'No qualification', 0,
    np.where(
        train.education == 'High school (10th grade)', 1,
        np.where(train.education == 'High School (12th grade)', 2,
                 np.where(train.education == "Bachelor's", 3, 4))))

In [165]:
feature_df_test['e_level'] = np.where(
    test.education == 'No qualification', 0,
    np.where(
        test.education == 'High school (10th grade)', 1,
        np.where(test.education == 'High School (12th grade)', 2,
                 np.where(test.education == "Bachelor's", 3, 4))))

In [166]:
feature_df_train['c_level'] = train.city_tier
feature_df_test['c_level'] = test.city_tier
feature_df_train['age'] = train.age.fillna(train.age.mean())
feature_df_test['age'] = test.age.fillna(train.age.mean())
feature_df_train['total_programs_enrolled'] = train.total_programs_enrolled
feature_df_test['total_programs_enrolled'] = test.total_programs_enrolled
feature_df_train['is_handicapped'] = train.is_handicapped.apply(
    lambda x: 1 if str(x) == 'Y' else 0)
feature_df_test['is_handicapped'] = test.is_handicapped.apply(
    lambda x: 1 if str(x) == 'Y' else 0)
feature_df_train[
    'invigilator_engagement_rating'] = train.invigilator_engagement_rating.fillna(
        -99)
feature_df_test[
    'invigilator_engagement_rating'] = test.invigilator_engagement_rating.fillna(
        -99)

In [167]:
feature_df_train[
    'target'] = train.is_pass

In [168]:
print(feature_df_train.shape)
display(feature_df_train.head())

(73147, 12)


Unnamed: 0,p_type,program_duration,t_type,d_level,g,e_level,c_level,age,total_programs_enrolled,is_handicapped,invigilator_engagement_rating,target
0,0,136,1,1,0,1,3,24.0,5,0,1.0,0
1,1,131,1,0,1,2,4,26.0,2,0,3.0,1
2,2,120,0,0,0,1,1,40.0,1,0,2.0,1
3,1,117,1,0,1,1,3,36.494033,4,0,1.0,1
4,3,131,1,1,1,2,1,42.0,2,0,4.0,1


In [169]:
print(feature_df_test.shape)
display(feature_df_test.head())

(31349, 11)


Unnamed: 0,p_type,program_duration,t_type,d_level,g,e_level,c_level,age,total_programs_enrolled,is_handicapped,invigilator_engagement_rating
0,1,131,1,3,1,1,3,46.0,2,0,4.0
1,0,135,0,3,0,3,3,36.494033,4,0,4.0
2,0,120,0,3,0,1,3,36.494033,2,0,3.0
3,3,122,1,3,1,2,1,36.494033,2,0,2.0
4,3,122,1,3,1,1,3,36.494033,2,0,2.0


# Feature Selection

## Drop Duplicates 

In [124]:
feature_df_train = feature_df_train[~feature_df_train.duplicated()]

## Correlation Drop

In [125]:
cor_matrix = feature_df_train[feature_df_train.columns.tolist()[:-1]].corr().abs()
upper_tri = cor_matrix.where(
    np.triu(np.ones(cor_matrix.shape), k=1).astype(np.bool))
to_drop = [
    column for column in upper_tri.columns if any(upper_tri[column] > 0.90)
]
del cor_matrix, upper_tri

## Zero Variance Drop

In [126]:
feature_df_train.drop(columns=[
    col for col in feature_df_train.columns if len(feature_df_train[col].unique()) < 2
],
                inplace=True)

## Infinity Drop

In [127]:
feature_df_train.columns.to_series()[np.isinf(feature_df_train).any()]

Series([], dtype: object)

# Model Train 

In [170]:
clf = xgb.XGBClassifier(max_depth=55,
                        learning_rate=0.001,
                        n_estimators=500,
                        verbosity=1,
                        silent=None,
                        objective='binary:logistic',
                        booster='gbtree',
                        n_jobs=-1,
                        gamma=0,
                        min_child_weight=1,
                        max_delta_step=0,
                        subsample=1,
                        colsample_bytree=1,
                        colsample_bynode=1,
                        colsample_bylevel=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        class_weight={
                            1: 0.7,
                            0: 0.3
                        },
                        base_score=0.5,
                        random_state=9,
                        missing=-99)
clf.fit(feature_df_train[feature_df_train.columns[:-1]],
        feature_df_train.target)

Parameters: { "class_weight" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', class_weight={0: 0.3, 1: 0.7},
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.001, max_delta_step=0,
              max_depth=55, min_child_weight=1, missing=-99,
              monotone_constraints='()', n_estimators=500, n_jobs=-1,
              num_parallel_tree=1, random_state=9, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, silent=None, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=1)

# Model Evaluate 

In [171]:
pd.Series(clf.predict(feature_df_test)).value_counts()

0    16555
1    14794
dtype: int64

In [172]:
100 * metrics.f1_score(feature_df_train.target,
                       clf.predict(
                           feature_df_train[feature_df_train.columns[:-1]]),
                       average='weighted')

79.96884502277891

In [173]:
pd.DataFrame({'id':test.id,
              'is_pass':list(clf.predict(feature_df_test))
             }).to_csv("submission_1.csv",index=False)