Import the dependencies

In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb

from sklearn import *
from IPython.display import display

Load the CSV files into pandas DataFrames

In [66]:
# Identifier, categorical and free-text columns are all read as plain strings.
string_columns = [
    'id',
    'teacher_id',
    'teacher_prefix',
    'school_state',
    'project_submitted_datetime',
    'project_grade_category',
    'project_subject_categories',
    'project_subject_subcategories',
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_resource_summary',
]
dtype = dict.fromkeys(string_columns, str)
# The two numeric columns get explicit integer types.
dtype['teacher_number_of_previously_posted_projects'] = int
dtype['project_is_approved'] = np.uint8

# The same dtype mapping is passed for every file that is read.
input_dir = 'data/'
train, test, resources = (
    pd.read_csv(input_dir + file_name, dtype=dtype)
    for file_name in ('train.csv', 'test.csv', 'resources.csv')
)


## Data Exploration:

In [67]:
# Show the first five rows of the training frame as the cell's output.
train.head(n=5)

Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved
0,p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,,,My students need 6 Ipod Nano's to create and d...,26,1
1,p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,,,My students need matching shirts to wear for d...,1,0
2,p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,,,My students need the 3doodler. We are an SEM s...,5,1
3,p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",,,My students need balls and other activity equi...,16,0
4,p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,,,My students need a water filtration system for...,42,1


## Data preprocessing:

In [68]:
# Cost of each resource line item: units requested times unit price.
resources['resources_total'] = resources['quantity'] * resources['price']

# One row per project id with both aggregates, computed from a single groupby:
#   resources_total_mean     - mean line-item cost for the project
#   resources_quantity_count - number of resource rows for the project
per_project = resources.groupby('id')
resource_stats = pd.concat(
    [
        per_project['resources_total'].mean().rename('resources_total_mean'),
        per_project['quantity'].count().rename('resources_quantity_count'),
    ],
    axis=1,
).reset_index()


def _attach_resource_stats(frame):
    """Left-join the per-project resource aggregates onto ``frame`` by ``id``.

    Missing values in non-numeric columns (e.g. unused essay fields) are
    replaced with a single space so later string processing never sees NaN.
    Numeric columns are deliberately left alone: filling them with ' ' would
    turn the aggregate columns into object columns for any project that has no
    matching resource rows.

    Returns a new DataFrame; ``frame`` itself is not modified.
    """
    merged = pd.merge(frame, resource_stats, how='left', on='id')
    non_numeric = merged.select_dtypes(exclude=['number']).columns
    return merged.fillna(value=dict.fromkeys(non_numeric, ' '))


# NOTE: this cell is not idempotent - re-running it merges the aggregates again.
train = _attach_resource_stats(train)
test = _attach_resource_stats(test)

print(train.shape, test.shape)

((182080, 18), (78035, 17))


In [69]:
# Replace each categorical column with integer codes.
# The encoder is fitted ONCE on the values of train and test together and then
# only *applied* to each frame, so a given category maps to the same integer in
# both. Calling fit_transform per frame would refit the encoder and give train
# and test unrelated code tables.
for c in ['teacher_id','teacher_prefix','school_state', 'project_grade_category', 'project_subject_categories', 'project_subject_subcategories']:
    lbl = preprocessing.LabelEncoder()
    # Cast to str for fitting as well, so the fitted classes match what is transformed.
    lbl.fit(pd.concat([train[c], test[c]]).astype(str))
    train[c] = lbl.transform(train[c].astype(str))
    test[c] = lbl.transform(test[c].astype(str))
    print(c)

teacher_id
teacher_prefix
school_state
project_grade_category
project_subject_categories
project_subject_subcategories


In [70]:
# Convert the submission timestamp strings into their int64 representation,
# applying the same conversion to both frames.
for frame in (train, test):
    submitted_at = pd.to_datetime(frame['project_submitted_datetime'])
    frame['project_submitted_datetime'] = submitted_at.values.astype(np.int64)

In [71]:
# Upper bound on the TF-IDF vocabulary size; passed as `max_features` to the
# vectorizer for each text column in the text-processing cell below.
max_features_ = 200
# Shapes before the text features are added, for comparison with the shapes
# printed after the text-processing cell.
print(train.shape, test.shape)

((182080, 18), (78035, 17))


### Text processing using simple NLP techniques

In [72]:
def _add_text_features(frame, column, vectorizer):
    """Append length, word-count and TF-IDF features for one text column.

    Parameters
    ----------
    frame : DataFrame containing ``column``. The ``<column>_len`` and
        ``<column>_wc`` columns are added to it in place.
    column : name of the text column to featurize.
    vectorizer : an already fitted TfidfVectorizer.

    Returns
    -------
    A new DataFrame with a fresh 0..n-1 index: ``frame`` plus one column per
    vocabulary term, named ``<column>0``, ``<column>1``, ...
    """
    text = frame[column]
    frame[column + '_len'] = text.apply(lambda x: len(str(x)))
    frame[column + '_wc'] = text.apply(lambda x: len(str(x).split(' ')))

    weights = vectorizer.transform(text).toarray()
    features = pd.DataFrame(
        weights,
        # Reuse the frame's index so the column-wise concat lines rows up
        # even when the frame does not carry a default 0..n-1 index.
        index=frame.index,
        # Size the names by the real matrix width: the fitted vocabulary can
        # be smaller than max_features_, and a fixed-length name list would
        # then fail with a length mismatch.
        columns=[column + str(i) for i in range(weights.shape[1])],
    )
    return pd.concat((frame, features), axis=1, ignore_index=False).reset_index(drop=True)


for c in ['project_title', 'project_resource_summary', 'project_essay_1', 'project_essay_2']:
    # The vocabulary is learned from the training text only, then applied
    # unchanged to both frames so their feature columns line up.
    tfidf = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 2), stop_words='english', min_df=3, max_features=max_features_)
    tfidf.fit(train[c])

    train = _add_text_features(train, c, tfidf)
    test = _add_text_features(test, c, tfidf)

    print(c)

print(train.shape, test.shape)

project_title
project_resource_summary
project_essay_1
project_essay_2
((182080, 826), (78035, 825))


In [73]:
# Columns that must not be fed to the model: the identifier, the target,
# the raw text fields, the timestamp and the two subject columns.
excluded_columns = {
    'id',
    'project_is_approved',
    'project_resource_summary',
    'project_title',
    'project_essay_1',
    'project_essay_2',
    'project_essay_3',
    'project_essay_4',
    'project_submitted_datetime',
    'project_subject_categories',
    'project_subject_subcategories',
}
# Everything else, in the order it appears in `train`, is a model feature.
col = [name for name in train.columns if name not in excluded_columns]

## Build and train the model

In [74]:
# Hold out 25% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
x_train, x_test, y_train, y_test = model_selection.train_test_split(train[col],train['project_is_approved'], test_size=0.25, random_state=18)

params = {'eta': 0.1, 'max_depth': 6, 'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 18, 'silent': True}

# Build each DMatrix once and share it between the watchlist and the training
# call, instead of converting the training split twice.
dtrain = xgb.DMatrix(x_train, y_train)
dvalid = xgb.DMatrix(x_test, y_test)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

max_boost_rounds = 450      # upper bound on the number of boosting rounds
early_stopping_rounds = 20  # stop once the 'valid' AUC has not improved for this many rounds
model = xgb.train(params, dtrain, max_boost_rounds, watchlist, verbose_eval=10, early_stopping_rounds=early_stopping_rounds)

[0]	train-auc:0.720373	valid-auc:0.700312
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 20 rounds.
[10]	train-auc:0.74366	valid-auc:0.715273
[20]	train-auc:0.757223	valid-auc:0.72291
[30]	train-auc:0.770388	valid-auc:0.729587
[40]	train-auc:0.783053	valid-auc:0.73489
[50]	train-auc:0.793445	valid-auc:0.738684
[60]	train-auc:0.802091	valid-auc:0.741394
[70]	train-auc:0.810028	valid-auc:0.743415
[80]	train-auc:0.81637	valid-auc:0.744806
[90]	train-auc:0.822164	valid-auc:0.746174
[100]	train-auc:0.827448	valid-auc:0.747315
[110]	train-auc:0.832848	valid-auc:0.748237
[120]	train-auc:0.837904	valid-auc:0.749026
[130]	train-auc:0.842211	valid-auc:0.749667
[140]	train-auc:0.845443	valid-auc:0.75013
[150]	train-auc:0.849123	valid-auc:0.75037
[160]	train-auc:0.852547	valid-auc:0.750824
[170]	train-auc:0.855841	valid-auc:0.75116
[180]	train-auc:0.858857	valid-auc:0.751538
[190]	train-auc:0.862406	valid-auc:0.75

## Predicting test-set outcomes with the trained model and writing the submission

In [75]:
# Score the test set using only the trees up to the best early-stopping round.
test['project_is_approved'] = model.predict(xgb.DMatrix(test[col]), ntree_limit=model.best_ntree_limit)
# Keep the probabilities strictly inside (0, 1). The bounds must be 1e-12 and
# 1 - 1e-12; the previous 1e12 / 1 - 1e12 bounds were far outside [0, 1] and
# never produced this clamp.
test['project_is_approved'] = test['project_is_approved'].clip(1e-12, 1 - 1e-12)

In [77]:
# Write the submission file: one row per test project with its id and the
# predicted approval probability, without the DataFrame index.
submission = test[['id', 'project_is_approved']]
submission.to_csv('exp6_submission.csv', index=False)