# Data loading

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
train = pd.read_csv("train/train.csv")
train.sort_values('project_submitted_datetime').head(3)
test = pd.read_csv("test/test.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
res = pd.read_csv('resources/resources.csv')
res['amount'] = res['quantity']*res['price']
res.head(3)

Unnamed: 0,id,description,quantity,price,amount
0,p233245,LC652 - Lakeshore Double-Space Mobile Drying Rack,1,149.0,149.0
1,p069063,Bouncy Bands for Desks (Blue support pipes),3,14.95,44.85
2,p069063,Cory Stories: A Kid's Book About Living With Adhd,1,8.45,8.45


In [None]:
amount_df = res.groupby('id')['amount'].agg('sum').sort_values(ascending=False).reset_index()
amount_df.head(5)

In [None]:
resource_amount_map = {}
for i, row in amount_df.iterrows():
    resource_amount_map[row['id']] = row['amount']

train.insert(4,'total_amount',train['id'].map(resource_amount_map))
#train.tail(3)
test.insert(4,'total_amount',test['id'].map(resource_amount_map))
test.tail(3)

# Text feature extraction

In [3]:
train['project_essay'] = train.apply(lambda row: ' '.join([
    str(row['teacher_prefix']), 
    str(row['school_state']), 
    str(row['project_grade_category']), 
    str(row['project_subject_categories']), 
    str(row['project_subject_subcategories']), 
    str(row['project_essay_1']), 
    str(row['project_essay_2']), 
    str(row['project_essay_3']), 
    str(row['project_essay_4']),
    ]), axis=1)
test['project_essay'] = test.apply(lambda row: ' '.join([
    str(row['teacher_prefix']), 
    str(row['school_state']), 
    str(row['project_grade_category']), 
    str(row['project_subject_categories']), 
    str(row['project_subject_subcategories']), 
    str(row['project_essay_1']), 
    str(row['project_essay_2']), 
    str(row['project_essay_3']), 
    str(row['project_essay_4']),
    ]), axis=1)

In [4]:
def extract_features(df):
    df['project_title_len'] = df['project_title'].apply(lambda x: len(str(x)))
    df['project_essay_1_len'] = df['project_essay_1'].apply(lambda x: len(str(x)))
    df['project_essay_2_len'] = df['project_essay_2'].apply(lambda x: len(str(x)))
    df['project_essay_3_len'] = df['project_essay_3'].apply(lambda x: len(str(x)))
    df['project_essay_4_len'] = df['project_essay_4'].apply(lambda x: len(str(x)))
    df['project_resource_summary_len'] = df['project_resource_summary'].apply(lambda x: len(str(x)))
  
extract_features(train)
extract_features(test)

# Sentiment feature extraction

In [5]:
from textblob import TextBlob
def get_polarity(text):
    textblob = TextBlob(text)
    pol = textblob.sentiment.polarity
    return round(pol,3)

def get_subjectivity(text):
    textblob = TextBlob(text)
    subj = textblob.sentiment.subjectivity
    return round(subj,3)

train['polarity'] = train['project_essay'].apply(get_polarity)
train['subjectivity'] = train['project_essay'].apply(get_subjectivity)
test['polarity'] = test['project_essay'].apply(get_polarity)
test['subjectivity'] = test['project_essay'].apply(get_subjectivity)

In [6]:
train = train.drop([
    'project_essay_1', 
    'project_essay_2', 
    'project_essay_3', 
    'project_essay_4'], axis=1)
test = test.drop([
    'project_essay_1', 
    'project_essay_2', 
    'project_essay_3', 
    'project_essay_4'], axis=1)

In [7]:
res = pd.DataFrame(res[['id', 'price']].groupby('id').price.agg(\
    [
        'count', 
        'sum', 
        'min', 
        'max', 
        'mean', 
        'std', 
        'median',
        lambda x: len(np.unique(x)),
    ])).reset_index()
print(res.head())

        id  count      sum    min     max        mean         std  median  \
0  p000001      4   459.56  23.99  261.08  114.890000  101.929679  87.245   
1  p000002     14   515.89   8.46  134.90   36.849286   33.549557  29.990   
2  p000003      4   298.97  39.99  169.00   74.742500   63.014906  44.990   
3  p000004     95  1113.69   1.60  401.54   11.723053   40.608577   7.260   
4  p000005      4   485.99  54.08  323.75  121.497500  134.835000  54.080   

   <lambda>  
0       4.0  
1      13.0  
2       3.0  
3      36.0  
4       2.0  


In [8]:
train = train.merge(res, on='id', how='left')
test = test.merge(res, on='id', how='left')
del res

# String feature encoding

In [9]:
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

df_all = pd.concat([train, test], axis=0)

cols = [
    'teacher_id', 
    'school_state', 
    'project_subject_categories', 
    'project_subject_subcategories',
    'teacher_prefix', 
    'project_grade_category'
]

for c in tqdm(cols):
    le = LabelEncoder()
    le.fit(df_all[c].astype(str))
    train[c] = le.transform(train[c].astype(str))
    test[c] = le.transform(test[c].astype(str))

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:05<00:00,  1.15it/s]


In [10]:
df_all.head()

Unnamed: 0,<lambda>,count,id,max,mean,median,min,polarity,project_essay,project_essay_1_len,...,project_submitted_datetime,project_title,project_title_len,school_state,std,subjectivity,sum,teacher_id,teacher_number_of_previously_posted_projects,teacher_prefix
0,1.0,2,p036502,149.99,149.99,149.99,149.99,0.213,Ms. NV Grades PreK-2 Literacy & Language Liter...,967,...,2016-11-18 14:45:59,Super Sight Word Centers,24,NV,0.0,0.391,299.98,484aaf11257089a66cfedc9461c6bd0a,26,Ms.
1,1.0,1,p039565,20.0,20.0,20.0,20.0,0.193,"Mrs. GA Grades 3-5 Music & The Arts, Health & ...",587,...,2017-04-26 15:57:28,Keep Calm and Dance On,22,GA,,0.597,20.0,df72a3ba8089423fa8a94be88060f6ed,1,Mrs.
2,1.0,1,p233823,469.99,469.99,469.99,469.99,0.354,"Ms. UT Grades 3-5 Math & Science, Literacy & L...",761,...,2017-01-01 22:57:44,Lets 3Doodle to Learn,21,UT,,0.534,469.99,a9b876a9252e08a55e3d894150f75ba3,5,Ms.
3,5.0,5,p185307,354.99,136.894,126.87,18.95,0.176,Mr. NC Grades 3-5 Health & Sports Health & Wel...,1201,...,2016-08-12 15:42:11,"\""Kid Inspired\"" Equipment to Increase Activit...",72,NC,133.428098,0.416,684.47,525fdbb6ec7f538a48beebaa0a51b24f,16,Mr.
4,1.0,1,p013780,355.5,355.5,355.5,355.5,0.285,Mr. CA Grades 6-8 Health & Sports Health & Wel...,451,...,2016-08-06 09:09:11,We need clean water for our culinary arts class!,48,CA,,0.557,355.5,a63b5547a7239eae4c1872670848e61a,42,Mr.


In [10]:
train['project_submitted_datetime'] = pd.to_datetime(train['project_submitted_datetime']).values.astype(np.int64)
test['project_submitted_datetime'] = pd.to_datetime(test['project_submitted_datetime']).values.astype(np.int64)

# TFIDF implementation

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
cols = [
    'project_title', 
    'project_essay', 
    'project_resource_summary'
]
n_features = [
    350, 
    4000, 
    350
]

for c_i, c in tqdm(enumerate(cols)):
    tfidf = TfidfVectorizer(max_features=n_features[c_i], min_df=3)
    tfidf.fit(df_all[c])
    tfidf_train = np.array(tfidf.transform(train[c]).todense(), dtype=np.float16)
    tfidf_test = np.array(tfidf.transform(test[c]).todense(), dtype=np.float16)

    for i in range(n_features[c_i]):
        train[c + '_tfidf_' + str(i)] = tfidf_train[:, i]
        test[c + '_tfidf_' + str(i)] = tfidf_test[:, i]

3it [05:23, 107.91s/it]


In [13]:
train.head()

Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_resource_summary,...,project_resource_summary_tfidf_390,project_resource_summary_tfidf_391,project_resource_summary_tfidf_392,project_resource_summary_tfidf_393,project_resource_summary_tfidf_394,project_resource_summary_tfidf_395,project_resource_summary_tfidf_396,project_resource_summary_tfidf_397,project_resource_summary_tfidf_398,project_resource_summary_tfidf_399
0,p036502,37239,3,33,1479480359000000000,3,24,328,Super Sight Word Centers,My students need 6 Ipod Nano's to create and d...,...,0.0,0.422363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,p039565,115331,2,10,1493222248000000000,0,42,400,Keep Calm and Dance On,My students need matching shirts to wear for d...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,p233823,87678,3,44,1483311464000000000,0,36,17,Lets 3Doodle to Learn,My students need the 3doodler. We are an SEM s...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,p185307,42475,1,27,1471016531000000000,0,8,300,"\""Kid Inspired\"" Equipment to Increase Activit...",My students need balls and other activity equi...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,p013780,85849,1,4,1470474551000000000,1,8,300,We need clean water for our culinary arts class!,My students need a water filtration system for...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
train.to_csv("train.csv")
test.to_csv("test.csv")

In [1]:
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [16]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,...,project_resource_summary_tfidf_390,project_resource_summary_tfidf_391,project_resource_summary_tfidf_392,project_resource_summary_tfidf_393,project_resource_summary_tfidf_394,project_resource_summary_tfidf_395,project_resource_summary_tfidf_396,project_resource_summary_tfidf_397,project_resource_summary_tfidf_398,project_resource_summary_tfidf_399
0,0,p036502,37239,3,33,1479480359000000000,3,24,328,Super Sight Word Centers,...,0.0,0.422363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,p039565,115331,2,10,1493222248000000000,0,42,400,Keep Calm and Dance On,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,p233823,87678,3,44,1483311464000000000,0,36,17,Lets 3Doodle to Learn,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,p185307,42475,1,27,1471016531000000000,0,8,300,"\""Kid Inspired\"" Equipment to Increase Activit...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,p013780,85849,1,4,1470474551000000000,1,8,300,We need clean water for our culinary arts class!,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# X, Y sampling

In [None]:
cols_to_drop = [
    'id',
    'project_title', 
    'project_essay', 
    'project_resource_summary',
    'project_is_approved',
]
X = train.drop(cols_to_drop, axis=1, errors='ignore')
y = train['project_is_approved']
X_test = test.drop(cols_to_drop, axis=1, errors='ignore')
id_test = test['id'].values
feature_names = list(X.columns)
print(X.shape, X_test.shape)

(182080, 4724) (78035, 4724)


In [None]:
X.head()

Unnamed: 0,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,teacher_number_of_previously_posted_projects,project_title_len,project_essay_1_len,...,project_resource_summary_tfidf_190,project_resource_summary_tfidf_191,project_resource_summary_tfidf_192,project_resource_summary_tfidf_193,project_resource_summary_tfidf_194,project_resource_summary_tfidf_195,project_resource_summary_tfidf_196,project_resource_summary_tfidf_197,project_resource_summary_tfidf_198,project_resource_summary_tfidf_199
0,37239,3,33,1479480359000000000,3,24,328,26,24,967,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,115331,2,10,1493222248000000000,0,42,400,1,22,587,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,87678,3,44,1483311464000000000,0,36,17,5,21,761,...,0.0,0.174683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,42475,1,27,1471016531000000000,0,8,300,16,72,1201,...,0.0,0.12262,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,85849,1,4,1470474551000000000,1,8,300,42,48,451,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
X.shape

(182080, 604)

In [27]:
X['project_grade_category'].unique()

array(['Grades PreK-2', 'Grades 3-5', 'Grades 6-8', 'Grades 9-12'], dtype=object)

In [51]:
id_test

array(['p233245', 'p096795', 'p236235', ..., 'p210728', 'p060531',
       'p087783'], dtype=object)

# CatBoost solutions

In [13]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=500, learning_rate=0.03, eval_metric="AUC", depth=5, loss_function='Logloss')
# Fit model
cat_features = [2, 4]
model.fit(X, y, cat_features)

0:	learn: 0.7061932	total: 6.81s	remaining: 56m 40s
1:	learn: 0.7083862	total: 14.1s	remaining: 58m 40s
2:	learn: 0.7093090	total: 20.7s	remaining: 57m 1s
3:	learn: 0.7142486	total: 28.1s	remaining: 58m 9s
4:	learn: 0.7156975	total: 35.8s	remaining: 59m 1s
5:	learn: 0.7153682	total: 43.3s	remaining: 59m 27s
6:	learn: 0.7165087	total: 52.1s	remaining: 1h 1m 7s
7:	learn: 0.7168366	total: 1m	remaining: 1h 1m 35s
8:	learn: 0.7171185	total: 1m 7s	remaining: 1h 1m 32s
9:	learn: 0.7179888	total: 1m 15s	remaining: 1h 1m 24s
10:	learn: 0.7182363	total: 1m 22s	remaining: 1h 1m 21s
11:	learn: 0.7181021	total: 1m 30s	remaining: 1h 1m 13s
12:	learn: 0.7188133	total: 1m 37s	remaining: 1h 54s
13:	learn: 0.7195392	total: 1m 45s	remaining: 1h 53s
14:	learn: 0.7196605	total: 1m 54s	remaining: 1h 1m 47s
15:	learn: 0.7205602	total: 2m 2s	remaining: 1h 1m 50s
16:	learn: 0.7209652	total: 2m 9s	remaining: 1h 1m 16s
17:	learn: 0.7211972	total: 2m 16s	remaining: 1h 1m
18:	learn: 0.7221246	total: 2m 24s	remaini

151:	learn: 0.7481224	total: 18m 35s	remaining: 42m 34s
152:	learn: 0.7482824	total: 18m 42s	remaining: 42m 24s
153:	learn: 0.7484102	total: 18m 49s	remaining: 42m 17s
154:	learn: 0.7485185	total: 18m 56s	remaining: 42m 9s
155:	learn: 0.7486269	total: 19m 4s	remaining: 42m 2s
156:	learn: 0.7487195	total: 19m 11s	remaining: 41m 55s
157:	learn: 0.7488791	total: 19m 17s	remaining: 41m 46s
158:	learn: 0.7489773	total: 19m 24s	remaining: 41m 36s
159:	learn: 0.7491753	total: 19m 31s	remaining: 41m 29s
160:	learn: 0.7493547	total: 19m 38s	remaining: 41m 21s
161:	learn: 0.7495624	total: 19m 45s	remaining: 41m 12s
162:	learn: 0.7496941	total: 19m 51s	remaining: 41m 3s
163:	learn: 0.7498331	total: 19m 59s	remaining: 40m 56s
164:	learn: 0.7500418	total: 20m 5s	remaining: 40m 47s
165:	learn: 0.7502476	total: 20m 13s	remaining: 40m 40s
166:	learn: 0.7503246	total: 20m 20s	remaining: 40m 33s
167:	learn: 0.7503951	total: 20m 26s	remaining: 40m 24s
168:	learn: 0.7505319	total: 20m 33s	remaining: 40m 1

299:	learn: 0.7622914	total: 36m 12s	remaining: 24m 8s
300:	learn: 0.7623398	total: 36m 20s	remaining: 24m 1s
301:	learn: 0.7623872	total: 36m 28s	remaining: 23m 54s
302:	learn: 0.7624416	total: 36m 35s	remaining: 23m 47s
303:	learn: 0.7625231	total: 36m 41s	remaining: 23m 39s
304:	learn: 0.7626377	total: 36m 48s	remaining: 23m 32s
305:	learn: 0.7627112	total: 36m 56s	remaining: 23m 25s
306:	learn: 0.7627917	total: 37m 4s	remaining: 23m 18s
307:	learn: 0.7628559	total: 37m 10s	remaining: 23m 10s
308:	learn: 0.7629351	total: 37m 18s	remaining: 23m 3s
309:	learn: 0.7630013	total: 37m 25s	remaining: 22m 56s
310:	learn: 0.7630696	total: 37m 32s	remaining: 22m 48s
311:	learn: 0.7631161	total: 37m 39s	remaining: 22m 41s
312:	learn: 0.7631780	total: 37m 45s	remaining: 22m 33s
313:	learn: 0.7632581	total: 37m 52s	remaining: 22m 26s
314:	learn: 0.7633317	total: 37m 58s	remaining: 22m 18s
315:	learn: 0.7633649	total: 38m 5s	remaining: 22m 10s
316:	learn: 0.7634325	total: 38m 12s	remaining: 22m 3

447:	learn: 0.7705615	total: 53m 57s	remaining: 6m 15s
448:	learn: 0.7706053	total: 54m 4s	remaining: 6m 8s
449:	learn: 0.7706422	total: 54m 12s	remaining: 6m 1s
450:	learn: 0.7706778	total: 54m 18s	remaining: 5m 54s
451:	learn: 0.7707238	total: 54m 26s	remaining: 5m 46s
452:	learn: 0.7707725	total: 54m 33s	remaining: 5m 39s
453:	learn: 0.7708136	total: 54m 41s	remaining: 5m 32s
454:	learn: 0.7708627	total: 54m 48s	remaining: 5m 25s
455:	learn: 0.7708746	total: 54m 55s	remaining: 5m 18s
456:	learn: 0.7708833	total: 55m 3s	remaining: 5m 10s
457:	learn: 0.7709438	total: 55m 9s	remaining: 5m 3s
458:	learn: 0.7709679	total: 55m 15s	remaining: 4m 56s
459:	learn: 0.7710226	total: 55m 23s	remaining: 4m 49s
460:	learn: 0.7710708	total: 55m 30s	remaining: 4m 41s
461:	learn: 0.7711191	total: 55m 37s	remaining: 4m 34s
462:	learn: 0.7711730	total: 55m 44s	remaining: 4m 27s
463:	learn: 0.7712056	total: 55m 52s	remaining: 4m 20s
464:	learn: 0.7712440	total: 55m 59s	remaining: 4m 12s
465:	learn: 0.77

<catboost.core.CatBoostClassifier at 0x1e91fedcf28>

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
param_test = {
 'iterations':range(400,1000,100),
 'depth':range(1,9,2)
}
gsearch = RandomizedSearchCV(estimator = CatBoostClassifier(iterations=500, learning_rate=0.03, eval_metric="AUC", depth=5, loss_function='Logloss'), 
 param_distributions = param_test, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch.fit(X,y)
gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_

In [14]:
preds = model.predict_proba(X_test)

In [22]:
pd.DataFrame({"id":id_test, "project_is_approved": preds[:,0]}).to_csv("subm_500trees.csv", index=False)

# LGBM solutions

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, RepeatedKFold
import lightgbm as lgb
import gc
cnt = 0
p_buf = []
n_splits = 5
n_repeats = 1
kf = RepeatedKFold(
    n_splits=n_splits, 
    n_repeats=n_repeats, 
    random_state=0)
auc_buf = []   

for train_index, valid_index in kf.split(X):
    print('Fold {}/{}'.format(cnt + 1, n_splits))
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 12,
        'num_leaves': 31,
        'learning_rate': 0.025,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 5,
        'verbose': 0,
        'num_threads': 1,
        'lambda_l2': 1,
        'min_gain_to_split': 0,
    }  

    
    model = lgb.train(
        params,
        lgb.Dataset(X.loc[train_index], y.loc[train_index], feature_name=feature_names),
        num_boost_round=10000,
        valid_sets=[lgb.Dataset(X.loc[valid_index], y.loc[valid_index])],
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    if cnt == 0:
        importance = model.feature_importance()
        model_fnames = model.feature_name()
        tuples = sorted(zip(model_fnames, importance), key=lambda x: x[1])[::-1]
        tuples = [x for x in tuples if x[1] > 0]
        print('Important features:')
        print(tuples[:50])

    p = model.predict(X.loc[valid_index], num_iteration=model.best_iteration)
    auc = roc_auc_score(y.loc[valid_index], p)

    print('{} AUC: {}'.format(cnt, auc))

    p = model.predict(X_test, num_iteration=model.best_iteration)
    if len(p_buf) == 0:
        p_buf = np.array(p)
    else:
        p_buf += np.array(p)
    auc_buf.append(auc)

    cnt += 1
    #if cnt > 0: # Comment this to run several folds
        #break
    
    del model
    gc.collect

auc_mean = np.mean(auc_buf)
auc_std = np.std(auc_buf)
print('AUC = {:.6f} +/- {:.6f}'.format(auc_mean, auc_std))

preds = p_buf/cnt

Fold 1/5
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's auc: 0.741122
[200]	valid_0's auc: 0.758622
[300]	valid_0's auc: 0.766712
[400]	valid_0's auc: 0.77056
