In [1]:
import re
import os
import sys
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import date

In [3]:
def build_tfidf_models_cv(df, include_val=False, y="y", givens = None, cv=10, text_vars=["title", "short_description", "need_statement", "essay"]):
    df["r"] = np.random.uniform(0,1,size=len(df))
    if include_val:
        split_tr = (df["split"]=="train") | (df["split"]=="val")
    else:
        split_tr = df["split"]=="train"
    if givens:
        for g in givens:
            split_tr = split_tr & (df[g]==1)   
    y_train = df[y][split_tr].values
    probs = np.arange(0,1. + 1./cv, 1./cv)
    for var in text_vars:
        new_var_name = var+"_pred_partial"
        df[new_var_name] = 0.0
        vectorizer = TfidfVectorizer(min_df=2,
                                     use_idf=1,
                                     smooth_idf=1,
                                     sublinear_tf=1,
                                     ngram_range=(1,2), 
                                     token_pattern=r"(?u)\b[A-Za-z0-9()\'\-?!\"%]+\b",
                                     norm='l2')    
        vectorizer.fit(df[var][(df["split"]=="train") | (df["split"]=="val") | (df["split"]=="test")])
        tfidf_train = vectorizer.transform(df[var][split_tr])
        tfidf_all = vectorizer.transform(df[var])        
        lm_model = SGDClassifier(penalty="l2",loss="log",fit_intercept=True, shuffle=True,n_iter=20, n_jobs=-1,alpha=0.000005)
        lm_model.fit(tfidf_train, y_train)
        df[var+"_pred"] = lm_model.predict_proba(tfidf_all)[:,1]
        for i in range(cv):             
            split_train_test = (split_tr) & ((df['r']>=probs[i]) & (df['r']<probs[i+1]))            
            split_train_train = (split_tr) & ((df['r']<probs[i]) | (df['r']>=probs[i+1]))          
            lm_model_temp = SGDClassifier(penalty="l2",loss="log",fit_intercept=True, shuffle=True,n_iter=20, n_jobs=-1,alpha=0.000005)
            x_train_train = vectorizer.transform(df[var][split_train_train])
            x_train_test = vectorizer.transform(df[var][split_train_test])
            lm_model_temp.fit(x_train_train, df[y][split_train_train].values)                       
            pred_train_test = lm_model_temp.predict_proba(x_train_test)[:,1]
            pred_train_train = lm_model_temp.predict_proba(x_train_train)[:,1]
            df[new_var_name][split_train_test] = pred_train_test                                                
            print('CV: ' + str(i+1))
            print('AUC (Train_Train): ' + str(metrics.roc_auc_score(df['y'][split_train_train],pred_train_train)))
            print('AUC (Train_Test): ' + str(metrics.roc_auc_score(df['y'][split_train_test],pred_train_test)))

In [4]:
def get_pred_partials(df, y="y",id_var="projectid", text_vars=["title", "short_description", "need_statement", "essay"]):
    df2 = pd.DataFrame(df["projectid"])
    for var in text_vars:
        df2[var+"_"+y+"_pred_partial"] = df[var+"_pred_partial"]
        df2[var+"_"+y+"_pred"] = df[var+"_pred"]
    return df2

In [5]:
def get_length(string):
    return len(string.split())

In [6]:
def clean_essay(string, lower=False):
    string = re.sub(r"\\t", " ", string)   
    string = re.sub(r"\\n", " ", string)   
    string = re.sub(r"\\r", " ", string)   
    string = re.sub(r"[^A-Za-z0-9\']", " ", string)   
    string = re.sub(r"\s{2,}", " ", string)
    if lower:
        string = string.lower()
    return string.strip()

In [48]:
def get_prev_exp(df, to_file, var="teacher_acctid", responses=["y","y2","y3","y4","y5","y6","y7","y8","y9","y10"]):
    df = df.sort_values([var, "date_posted"])
    df.index = range(len(df))    
    with open(to_file, "wb") as f:
        #f.write(",".join(["projectid",var,"date_posted",var+"_days"] + [var+"_"+r+"_prev" for r in responses] + [var+"_cnt",var+"_cnt_train"])+"\n")   
        prev_var = 0
        for i, v in enumerate(df[var]):
            if prev_var != v:
                prev_var = v
                count_vec = [0]*(len(responses)+2)
                days_btw = 9999                
                #f.write(",".join([df["projectid"][i],str(v),df["date_posted"][i],str(days_btw)] + [str(count) for count in count_vec])+"\n".encode('utf-8'))
            else:
                prev_date = date(int(df["year"][i-1]), int(df["month"][i-1]), int(df["day"][i-1]))
                now_date = date(int(df["year"][i]), int(df["month"][i]), int(df["day"][i]))
                days_btw = (now_date - prev_date).days
                #f.write(",".join([df["projectid"][i],str(v),df["date_posted"][i],str(days_btw)] + [str(count) for count in count_vec])+"\n".encode('utf-8'))
            for j, r in enumerate(responses):
                count_vec[j] += df[r][i]        
            count_vec[j+1] += 1    
            if df["date_posted"][i] < "2014-01-01":
                count_vec[j+2] += 1

In [18]:
if __name__=="__main__":    
    folder = sys.argv[1]
    outcomes_df = pd.read_csv(r"C:\Users\HP\Desktop\project-1\outcomes_sample.csv")
    projects_df = pd.read_csv(r"C:\Users\HP\Desktop\project-1\projects_sample.csv")

In [19]:
outcomes_df.head()

Unnamed: 0,projectid,is_exciting,at_least_1_teacher_referred_donor,fully_funded,at_least_1_green_donation,great_chat,three_or_more_non_teacher_referred_donors,one_non_teacher_referred_donor_giving_100_plus,donation_from_thoughtful_donor,great_messages_proportion,teacher_referred_count,non_teacher_referred_count
0,b3315d4439660660c57e87f92e510c72,f,f,f,f,t,f,f,f,100.0,0.0,1.0
1,f037e1a96e6df984ff631087cb8e97ec,f,f,t,t,f,t,t,f,44.0,0.0,35.0
2,3f1c2936df9c8fe04391846f4df5c9db,f,f,t,t,t,f,t,f,100.0,0.0,1.0
3,3d68f670cd7bcd3eb371dc08069cbf8d,f,f,t,t,f,f,t,f,,0.0,1.0
4,da4f8b7bacf8d37a6a2cf336150a5869,f,f,t,f,f,f,t,f,0.0,0.0,2.0


In [20]:
projects_df.head()

Unnamed: 0,projectid,teacher_acctid,schoolid,school_ncesid,school_latitude,school_longitude,school_city,school_state,school_zip,school_metro,...,resource_type,poverty_level,grade_level,fulfillment_labor_materials,total_price_excluding_optional_support,total_price_including_optional_support,students_reached,eligible_double_your_impact_match,eligible_almost_home_match,date_posted
0,62526d85d2a1818432d03d600969e99c,ebc7c90b6c92a069432e0714b8d93dfd,5aca9711ff0e4b37db48701f46f73036,171371000000.0,41.972419,-88.174597,Bartlett,IL,60103.0,suburban,...,Other,moderate poverty,Grades 3-5,30.0,444.36,522.78,7.0,f,f,2013-12-31
1,0ff5dec32bf793243a8b0b2c023a81f0,ec5b110df87bd511b508961676d08b6c,72e2b0cb2eecfdc37e67e0eaf10da07b,120198000000.0,30.507978,-86.132003,Freeport,FL,32439.0,rural,...,Technology,highest poverty,Grades PreK-2,30.0,809.32,952.14,17.0,f,f,2013-12-31
2,1f405ea5b8ae8935a4169bdb56006b18,d2b8b908c4a88c48d07439b6d903372e,42d6a0b12a85feabc1eaabce06f7b777,360008700000.0,40.851853,-73.910253,Bronx,NY,10453.0,urban,...,Supplies,highest poverty,Grades 3-5,30.0,200.58,235.98,25.0,t,f,2013-12-31
3,c83d0794e67ec95f680cbb058c4523ff,535ab721f66316cf26a1d5ce1d9184f5,9f37b7e58dae8805882cbae215bb5e2a,120144000000.0,28.548972,-81.162722,Orlando,FL,32828.0,suburban,...,Books,high poverty,Grades 3-5,30.0,269.9,317.53,36.0,f,f,2013-12-31
4,3b6bddd1df0e310066d81c2b89771b5c,60a251a9fe882561405e9f6e135afb75,cf7b5a7b866094184208f0f4bc282e6a,350006000000.0,35.130462,-106.514195,Albuquerque,NM,87111.0,urban,...,Technology,highest poverty,Grades 9-12,30.0,519.03,610.62,160.0,f,f,2013-12-31


In [49]:
df = pd.merge(projects_df, outcomes_df, how='left', on='projectid')
df["split"] = "train"
df["split"][df["date_posted"]<"2010-04-01"] = "none"
df["split"][df["date_posted"]>="2013-01-01"] = "val"
df["split"][df["date_posted"]>="2014-01-01"]= "test"
df = df[df["split"]!="none"]
df["y"] = 0
df["y"][df["is_exciting"]=="t"] = 1
df["y2"] = 0
df["y2"][df["at_least_1_teacher_referred_donor"]=="t"] = 1
df["y3"] = 0
df["y3"][df["great_chat"]=="t"] = 1
df["y4"] = 0
df["y4"][df["fully_funded"]=="t"] = 1
df["y5"] = 0
df["y5"][df["at_least_1_green_donation"]=="t"] = 1
df["y6"] = 0
df["y6"][df["donation_from_thoughtful_donor"]=="t"] = 1
df["y7"] = 0
df["y7"][df["three_or_more_non_teacher_referred_donors"]=="t"] = 1
df["y8"] = 0
df["y8"][df["one_non_teacher_referred_donor_giving_100_plus"]=="t"] = 1
df["y9"] = 0
df["y9"][df["teacher_referred_count"]>=1] = 1
df["y10"] = 0
df["y10"][df["non_teacher_referred_count"]>=1]=1
df["year"] = df["date_posted"].apply(lambda x: x.split("-")[0])
df["month"] = df["date_posted"].apply(lambda x: x.split("-")[1])
df["day"] = df["date_posted"].apply(lambda x: x.split("-")[2])
for var in ["teacher_acctid", "schoolid", "school_district", "school_city", "school_county", "school_zip", "school_state"]:
    get_prev_exp(df,r"C:\Users\HP\Desktop\project-1\_exp_20100401.csv",var,["y","y2","y3","y4","y5","y6","y7","y8","y9","y10"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["split"][df["date_posted"]<"2010-04-01"] = "none"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["split"][df["date_posted"]>="2013-01-01"] = "val"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["split"][df["date_posted"]>="2014-01-01"]= "test"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["y"][d

In [50]:
del df

In [52]:
df = pd.read_csv(r"C:\Users\HP\Desktop\project-1\essay_sample-1.csv")
df = pd.merge(df, outcomes_df, how = 'left', on = 'projectid')
df = pd.merge(df, projects_df, how = 'inner', on = 'projectid')
df["split"] = "train"
df["split"][df["date_posted"]<"2010-04-01"] = "none"
df["split"][df["date_posted"]>="2013-01-01"] = "val"
df["split"][df["date_posted"]>="2014-01-01"]= "test"
df = df[df["split"]!="none"]
df["y"] = 0
df["y"][df["is_exciting"]=="t"] = 1
text_vars=["title", "short_description", "need_statement", "essay"]
for var in text_vars:
    df[var][pd.isnull(df[var])] = ""
    df[var] = df[var].apply(clean_essay)
    df[var+"_length"] = df[var].apply(get_length)    
build_tfidf_models_cv(df, include_val=False, y="y", givens = None, cv=10, text_vars=["title", "short_description", "need_statement", "essay"])
df2 = get_pred_partials(df, y="y",id_var="projectid", text_vars=["title", "short_description", "need_statement", "essay"])
df2["title_length"] = df["title_length"]
df2["essay_length"] = df["essay_length"]
df2.to_csv(os.path.join(folder,"essays_pred_val_y.csv"), index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["split"][df["date_posted"]<"2010-04-01"] = "none"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["split"][df["date_posted"]>="2013-01-01"] = "val"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["split"][df["date_posted"]>="2014-01-01"]= "test"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["y"][d

TypeError: __init__() got an unexpected keyword argument 'n_iter'