## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

import os

# The data files are opened with relative paths ('../input/...'), so the working
# directory has to be the project's code/ folder. Set APP_SCREENING_CODE_DIR to
# point at that folder on another machine; when it is unset, the original
# hard-coded location is used.
os.chdir(os.environ.get(
    'APP_SCREENING_CODE_DIR',
    '/Users/quannguyen/Downloads/kaggle-data/app-screening/code/',
))
import gc

## Preprocessing Training and Testing Data

In [4]:
# Column dtypes shared by train.csv and test.csv: every text-like field is read
# as a plain string, and only the last two entries get numeric dtypes.
dtype = {
    **dict.fromkeys(
        [
            'id',
            'teacher_id',
            'teacher_prefix',
            'school_state',
            'project_submitted_datetime',
            'project_grade_category',
            'project_subject_categories',
            'project_subject_subcategories',
            'project_title',
            'project_essay_1',
            'project_essay_2',
            'project_essay_3',
            'project_essay_4',
            'project_resource_summary',
        ],
        str,
    ),
    'teacher_number_of_previously_posted_projects': int,
    'project_is_approved': np.uint8,
}

# Both files are parsed with identical options; the first column becomes the index.
csv_options = dict(dtype=dtype, index_col=0, low_memory=True)
train_df = pd.read_csv('../input/train.csv', **csv_options)
test_df = pd.read_csv('../input/test.csv', **csv_options)

train_df.head()

### Concatenating project essays

In [5]:
def get_concat_essays(df):
    """Join the four essay parts of every row into one space-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'project_essay_1' .. 'project_essay_4'.

    Returns
    -------
    pandas.Series
        One string per row, carrying the same index as ``df``. An empty
        frame yields an empty Series.

    Raises
    ------
    KeyError
        If any of the four essay columns is missing.

    Notes
    -----
    Each value is passed through ``str()``, so a missing essay (NaN) shows up
    as the literal text 'nan' in the result.
    """
    essay_columns = (
        'project_essay_1',
        'project_essay_2',
        'project_essay_3',
        'project_essay_4',
    )
    # Work column-wise rather than with DataFrame.apply(axis=1): no per-row
    # Series has to be built, and an empty frame still produces a Series.
    stringified = [map(str, df[column]) for column in essay_columns]
    joined = [' '.join(parts) for parts in zip(*stringified)]
    return pd.Series(joined, index=df.index, dtype=object)

# Add the combined 'project_essay' column to the training and the test frame.
for frame in (train_df, test_df):
    frame['project_essay'] = get_concat_essays(frame)

### Extracting essay lengths

In [6]:
def add_essay_lengths(df):
    """Add a '<column>_len' character-count column for each text field.

    The frame is modified in place and nothing is returned. Every value is
    converted with ``str()`` before it is measured, so a missing value (NaN)
    is counted as the three characters of 'nan'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'project_title', 'project_essay_1' .. 'project_essay_4',
        'project_essay' and 'project_resource_summary'.
    """
    def _text_length(value):
        return len(str(value))

    # Order matters only for the position of the new columns in the frame.
    measured_columns = (
        'project_title',
        'project_essay_1',
        'project_essay_2',
        'project_essay_3',
        'project_essay_4',
        'project_essay',
        'project_resource_summary',
    )
    for column in measured_columns:
        df[column + '_len'] = df[column].apply(_text_length)

# Attach the *_len feature columns to both frames (each is modified in place).
for frame in (train_df, test_df):
    add_essay_lengths(frame)

train_df.head()

Unnamed: 0_level_0,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,...,teacher_number_of_previously_posted_projects,project_is_approved,project_essay,project_title_len,project_essay_1_len,project_essay_2_len,project_essay_3_len,project_essay_4_len,project_essay_len,project_resource_summary_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p036502,484aaf11257089a66cfedc9461c6bd0a,Ms.,NV,2016-11-18 14:45:59,Grades PreK-2,Literacy & Language,Literacy,Super Sight Word Centers,Most of my kindergarten students come from low...,I currently have a differentiated sight word c...,...,26,1,Most of my kindergarten students come from low...,24,967,805,3,3,1781,127
p039565,df72a3ba8089423fa8a94be88060f6ed,Mrs.,GA,2017-04-26 15:57:28,Grades 3-5,"Music & The Arts, Health & Sports","Performing Arts, Team Sports",Keep Calm and Dance On,Our elementary school is a culturally rich sch...,We strive to provide our diverse population of...,...,1,0,Our elementary school is a culturally rich sch...,22,587,639,3,3,1235,81
p233823,a9b876a9252e08a55e3d894150f75ba3,Ms.,UT,2017-01-01 22:57:44,Grades 3-5,"Math & Science, Literacy & Language","Applied Sciences, Literature & Writing",Lets 3Doodle to Learn,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,We are looking to add some 3Doodler to our cla...,...,5,1,Hello;\r\nMy name is Mrs. Brotherton. I teach ...,21,761,546,3,3,1316,186
p185307,525fdbb6ec7f538a48beebaa0a51b24f,Mr.,NC,2016-08-12 15:42:11,Grades 3-5,Health & Sports,Health & Wellness,"\""Kid Inspired\"" Equipment to Increase Activit...",My students are the greatest students but are ...,"The student's project which is totally \""kid-i...",...,16,0,My students are the greatest students but are ...,72,1201,1209,3,3,2419,233
p013780,a63b5547a7239eae4c1872670848e61a,Mr.,CA,2016-08-06 09:09:11,Grades 6-8,Health & Sports,Health & Wellness,We need clean water for our culinary arts class!,My students are athletes and students who are ...,For some reason in our kitchen the water comes...,...,42,1,My students are athletes and students who are ...,48,451,556,3,3,1016,71


### Dropping unused columns

In [7]:
# Remove the four separate essay-part columns from both frames.
split_essay_columns = ['project_essay_1', 'project_essay_2', 'project_essay_3', 'project_essay_4']
train_df = train_df.drop(columns=split_essay_columns)
test_df = test_df.drop(columns=split_essay_columns)

# Run a garbage-collection pass; as the cell's last expression, its return
# value is what the notebook displays.
gc.collect()

72

## Incorporating Resource Dataset

In [23]:
# Load the resources table with pandas' default parsing options
# (no dtype overrides and no index column, unlike the train/test loads).
res_df = pd.read_csv('../input/resources.csv')

# Preview the first rows.
res_df.head()

Unnamed: 0,id,description,quantity,price
0,p233245,LC652 - Lakeshore Double-Space Mobile Drying Rack,1,149.0
1,p069063,Bouncy Bands for Desks (Blue support pipes),3,14.95
2,p069063,Cory Stories: A Kid's Book About Living With Adhd,1,8.45
3,p069063,"Dixon Ticonderoga Wood-Cased #2 HB Pencils, Bo...",2,13.59
4,p069063,EDUCATIONAL INSIGHTS FLUORESCENT LIGHT FILTERS...,3,24.95


In [25]:
# Replace the resources table with one row of price statistics per 'id'.
# NOTE: this overwrites res_df, so the cell needs the freshly loaded table
# (with its 'price' column) each time it runs.
res_df = (
    res_df
    .groupby('id')['price']
    .agg([
        'count',
        'sum',
        'min',
        'max',
        'mean',
        'std',
        # Number of distinct prices; kept as a lambda so the resulting
        # column is still labelled '<lambda>'.
        lambda prices: len(np.unique(prices)),
    ])
    .reset_index()
)

res_df.head()

Unnamed: 0,id,count,sum,min,max,mean,std,<lambda>
0,p000001,4,459.56,23.99,261.08,114.89,101.929679,4.0
1,p000002,14,515.89,8.46,134.9,36.849286,33.549557,13.0
2,p000003,4,298.97,39.99,169.0,74.7425,63.014906,3.0
3,p000004,95,1113.69,1.6,401.54,11.723053,40.608577,36.0
4,p000005,4,485.99,54.08,323.75,121.4975,134.835,2.0
