In [14]:
from numpy import loadtxt, zeros, ones, array, linspace, logspace
from matplotlib.pylab import scatter, show, title, xlabel, ylabel, plot, contour


#Evaluate the linear regression
def compute_cost(X, y, theta):
    '''
    Comput cost for linear regression
    '''
    #Number of training samples
    m = y.size

    predictions = X.dot(theta).flatten()

    sqErrors = (predictions - y) ** 2

    J = (1.0 / (2 * m)) * sqErrors.sum()

    return J


def gradient_descent(X, y, theta, alpha, num_iters):
    '''
    Performs gradient descent to learn theta
    by taking num_items gradient steps with learning
    rate alpha
    '''
    m = y.size
    J_history = zeros(shape=(num_iters, 1))

    for i in range(num_iters):

        predictions = X.dot(theta).flatten()

        errors_x1 = (predictions - y) * X[:, 0]
        errors_x2 = (predictions - y) * X[:, 1]

        theta[0][0] = theta[0][0] - alpha * (1.0 / m) * errors_x1.sum()
        theta[1][0] = theta[1][0] - alpha * (1.0 / m) * errors_x2.sum()

        J_history[i, 0] = compute_cost(X, y, theta)

    return theta, J_history


#Load the dataset
# data = loadtxt('ex1data1.txt', delimiter=',')

# # #Plot the data
# scatter(data[:, 0], data[:, 1], marker='o', c='b')
# title('Profits distribution')
# xlabel('Population of City in 10,000s')
# ylabel('Profit in $10,000s')
# show()

X = data[:, 0]
y = data[:, 1]

# print(X)

# print(y)


#number of training samples
m = y.size

#Add a column of ones to X (interception data)
it = ones(shape=(m, 2))
it[:, 1] = X

# print(it)

# #Initialize theta parameters
theta = zeros(shape=(2, 1))

# #Some gradient descent settings
iterations = 1500
alpha = 0.01

# #compute and display initial cost
print(compute_cost(it, y, theta))

theta, J_history = gradient_descent(it, y, theta, alpha, iterations)

print(theta)

# #Predict values for population sizes of 35,000 and 70,000
# predict1 = array([1, 3.5]).dot(theta).flatten()
# print('For population = 35,000, we predict a profit of %f' % (predict1 * 10000))
# predict2 = array([1, 7.0]).dot(theta).flatten()
# print('For population = 70,000, we predict a profit of %f' % (predict2 * 10000))

# #Plot the results
# result = it.dot(theta).flatten()
# plot(data[:, 0], result)
# show()


# #Grid over which we will calculate J
theta0_vals = linspace(-10, 10, 100)
theta1_vals = linspace(-1, 4, 100)


# #initialize J_vals to a matrix of 0's
J_vals = zeros(shape=(theta0_vals.size, theta1_vals.size))

# #Fill out J_vals
for t1, element in enumerate(theta0_vals):
    for t2, element2 in enumerate(theta1_vals):
        thetaT = zeros(shape=(2, 1))
        thetaT[0][0] = element
        thetaT[1][0] = element2
        J_vals[t1, t2] = compute_cost(it, y, thetaT)

# #Contour plot
J_vals = J_vals.T
#Plot J_vals as 15 contours spaced logarithmically between 0.01 and 100
contour(theta0_vals, theta1_vals, J_vals, logspace(-2, 3, 20))
xlabel('theta_0')
ylabel('theta_1')
scatter(theta[0][0], theta[1][0])
show()

32.0727338775
[[-3.63029144]
 [ 1.16636235]]


In [16]:
from numpy import loadtxt, zeros, ones, array, linspace, logspace, mean, std, arange
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from pylab import plot, show, xlabel, ylabel

In [19]:
def feature_normalize(X):
    '''
    Returns a normalized version of X where
    the mean value of each feature is 0 and the standard deviation
    is 1. This is often a good preprocessing step to do when
    working with learning algorithms.
    '''
    mean_r = []
    std_r = []

    X_norm = X

    n_c = X.shape[1]
    for i in range(n_c):
        m = mean(X[:, i])
        s = std(X[:, i])
        mean_r.append(m)
        std_r.append(s)
        X_norm[:, i] = (X_norm[:, i] - m) / s

    return X_norm, mean_r, std_r


def compute_cost(X, y, theta):
    '''
    Comput cost for linear regression
    '''
    #Number of training samples
    m = y.size

    predictions = X.dot(theta)

    sqErrors = (predictions - y)

    J = (1.0 / (2 * m)) * sqErrors.T.dot(sqErrors)

    return J


def gradient_descent(X, y, theta, alpha, num_iters):
    '''
    Performs gradient descent to learn theta
    by taking num_items gradient steps with learning
    rate alpha
    '''
    m = y.size
    J_history = zeros(shape=(num_iters, 1))

    for i in range(num_iters):

        predictions = X.dot(theta)

        theta_size = theta.size

        for it in range(theta_size):

            temp = X[:, it]
            temp.shape = (m, 1)

            errors_x1 = (predictions - y) * temp

            theta[it][0] = theta[it][0] - alpha * (1.0 / m) * errors_x1.sum()

        J_history[i, 0] = compute_cost(X, y, theta)

    return theta, J_history

#Load the dataset
data = loadtxt('ex1data2.txt', delimiter=',')


#Plot the data
'''
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
n = 100
for c, m, zl, zh in [('r', 'o', -50, -25)]:
    xs = data[:, 0]
    ys = data[:, 1]
    zs = data[:, 2]
    ax.scatter(xs, ys, zs, c=c, marker=m)
ax.set_xlabel('Size of the House')
ax.set_ylabel('Number of Bedrooms')
ax.set_zlabel('Price of the House')
plt.show()
'''


X = data[:, :2]
y = data[:, 2]


#number of training samples
m = y.size

y.shape = (m, 1)

#Scale features and set them to zero mean
x, mean_r, std_r = feature_normalize(X)

#Add a column of ones to X (interception data)
it = ones(shape=(m, 3))
it[:, 1:3] = x

#Some gradient descent settings
iterations = 100
alpha = 0.01

#Init Theta and Run Gradient Descent
theta = zeros(shape=(3, 1))

theta, J_history = gradient_descent(it, y, theta, alpha, iterations)
print(theta, J_history)
plot(arange(iterations), J_history)
xlabel('Iterations')
ylabel('Cost Function')
# show()

#Predict price of a 1650 sq-ft 3 br house
price = array([1.0,   ((1650.0 - mean_r[0]) / std_r[0]), ((3 - mean_r[1]) / std_r[1])]).dot(theta)
print('Predicted price of a 1650 sq-ft, 3 br house: %f' % (price))

[[ 215810.61679138]
 [  61446.18781361]
 [  20070.13313796]] [[  6.42977763e+10]
 [  6.30310183e+10]
 [  6.17906942e+10]
 [  6.05762369e+10]
 [  5.93870917e+10]
 [  5.82227165e+10]
 [  5.70825809e+10]
 [  5.59661664e+10]
 [  5.48729661e+10]
 [  5.38024840e+10]
 [  5.27542352e+10]
 [  5.17277455e+10]
 [  5.07225512e+10]
 [  4.97381986e+10]
 [  4.87742442e+10]
 [  4.78302542e+10]
 [  4.69058042e+10]
 [  4.60004793e+10]
 [  4.51138734e+10]
 [  4.42455895e+10]
 [  4.33952394e+10]
 [  4.25624431e+10]
 [  4.17468290e+10]
 [  4.09480338e+10]
 [  4.01657018e+10]
 [  3.93994853e+10]
 [  3.86490441e+10]
 [  3.79140452e+10]
 [  3.71941632e+10]
 [  3.64890795e+10]
 [  3.57984824e+10]
 [  3.51220671e+10]
 [  3.44595353e+10]
 [  3.38105953e+10]
 [  3.31749615e+10]
 [  3.25523545e+10]
 [  3.19425011e+10]
 [  3.13451339e+10]
 [  3.07599912e+10]
 [  3.01868171e+10]
 [  2.96253609e+10]
 [  2.90753776e+10]
 [  2.85366274e+10]
 [  2.80088755e+10]
 [  2.74918923e+10]
 [  2.69854530e+10]
 [  2.64893378e+10]

In [20]:
import numpy as np

In [22]:
a = np.array([[1,2,3], [4,5,6], [7,8,9]])

In [25]:
a.ravel()

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [26]:
a.T

array([[1, 4, 7],
       [2, 5, 8],
       [3, 6, 9]])

In [27]:
a.T.ravel()

array([1, 4, 7, 2, 5, 8, 3, 6, 9])

In [28]:
import pandas as pd

In [33]:
district_housing = pd.read_csv('Purchase_order_FY15.csv', skip_footer = 1, engine = 'python')

In [34]:
district_housing

Unnamed: 0,Agency - Agency,Commodity - Commodity,Supplier - Common Supplier,Ordered Date,PO #,PO amount
0,EQUIPMENT LEASE - CAPITAL,9182930:CONSULTING SERVICES RELATED TO THE IMP...,"OST, Inc.",23-Oct-14,PO508160,29932.80
1,METROPOLITAN POLICE DEPARTMENT,9182930:CONSULTING SERVICES RELATED TO THE IMP...,"OST, Inc.",25-Nov-14,PO505678-V2,134608.35
2,METROPOLITAN POLICE DEPARTMENT,9625800:Professional Services (Not Otherwise C...,ABC TOWING INC,1-Oct-14,PO505398,37500.00
3,DEPARTMENT OF GENERAL SERVICES,9582600:Construction Management Services,Kramer Consulting Services PC,28-Jan-15,PO514707,175000.00
4,OFFICE OF THE CHIEF FINANCIAL OFFICER,9204520:MAINTENANCE AND LICENSE AGREEMENT SERV...,"TEAM TECHNOLOGY, INC",15-Oct-14,PO483678-V2,6500.00
5,OFFICE OF THE CHIEF FINANCIAL OFFICER,"6003850:COPIERS, COLOR, NEW",SHARP ELECTRONICS CORPORATION,28-Oct-14,PO498255-V4,3460.00
6,OFFICE OF THE CHIEF FINANCIAL OFFICER,"9182810:CONSULTANT SERVICES, COMPUTER SYSTEMS/...","MVS, Inc.",6-Mar-15,PO516523,13000.00
7,DEPARTMENT OF PUBLIC WORKS,"9980900:Automobile, Truck and Bus Parts and Eq...","E & M AUTO REPAIR, INC.",1-Oct-14,PO505847,50000.00
8,DEPARTMENT OF PUBLIC WORKS,"8000800:Boots, Leather",SAF GARD SAFETY SHOE CO INC,16-Oct-14,PO507394,15000.00
9,DEPARTMENT OF PUBLIC WORKS,"9242500:For Credit Classes, Seminars, Workshop...",VILLANOVA UNIVERSITY,15-Jan-15,PO509370-V2,5470.00


In [35]:
district_housing['Agency - Agency'].unique()

array(['EQUIPMENT LEASE - CAPITAL', 'METROPOLITAN POLICE DEPARTMENT',
       'DEPARTMENT OF GENERAL SERVICES',
       'OFFICE OF THE CHIEF FINANCIAL OFFICER',
       'DEPARTMENT OF PUBLIC WORKS',
       'HOMELAND SECURITY/EMERGENCY MANAGEMENT',
       'OFFICE OF THE ATTORNEY GENERAL',
       'D.C HEALTH BENEFIT EXCHANGE AUTHORITY',
       'OFFICE OF THE CHIEF TECHNOLOGY OFFICER',
       'DEPT. OF HOUSING AND COMM. DEVELOPMENT', 'D.C. NATIONAL GUARD',
       'DEPARTMENT OF BEHAVIORAL HEALTH', 'DEPARTMENT OF CORRECTIONS',
       'DEPARTMENT OF PARKS AND RECREATION',
       'DEPT. OF CONSUMER AND REGULATORY AFFAIRS', 'DEPARTMENT OF HEALTH',
       'DEPARTMENT ON DISABILITY SERVICES', 'DEPARTMENT OF TRANSPORTATION',
       'COMMISSION ON ARTS & HUMANITIES',
       'STATE SUPERINTENDENT OF EDUCATION (OSSE)',
       'OFFICE OF UNIFIED COMMUNICATIONS',
       'DISTRICT OF COLUMBIA PUBLIC SCHOOLS', 'BOARD OF ELECTIONS',
       'DEPARTMENT OF FORENSICS SCIENCES',
       'OFFICE OF CONTRACTING A

In [36]:
po_amount = district_housing["PO amount"]

In [37]:
po_amount

0      29,932.80
1     134,608.35
2      37,500.00
3     175,000.00
4       6,500.00
5       3,460.00
6      13,000.00
7      50,000.00
8      15,000.00
9       5,470.00
10      4,301.22
11    100,000.00
12         139.8
13           900
14           175
...
22648     12,000.00
22649      7,500.00
22650         598.5
22651        207.74
22652      3,030.00
22653         38.08
22654    325,000.00
22655         71.35
22656      1,240.00
22657      7,734.15
22658      5,000.00
22659      1,257.10
22660          42.4
22661          9.69
22662          39.9
Name: PO amount, Length: 22663, dtype: object

In [38]:
po_amount.value_counts()[:10]

0             466
10,000.00     277
5,000.00      226
50,000.00     182
100,000.00    173
20,000.00     166
500           164
15,000.00     132
30,000.00     118
3,000.00      109
dtype: int64

In [43]:
np.random.seed(0)

In [44]:
print(np.random.seed(0))

None


In [45]:
X = np.random.random(size = (20,1))

In [46]:
print(X)

[[ 0.5488135 ]
 [ 0.71518937]
 [ 0.60276338]
 [ 0.54488318]
 [ 0.4236548 ]
 [ 0.64589411]
 [ 0.43758721]
 [ 0.891773  ]
 [ 0.96366276]
 [ 0.38344152]
 [ 0.79172504]
 [ 0.52889492]
 [ 0.56804456]
 [ 0.92559664]
 [ 0.07103606]
 [ 0.0871293 ]
 [ 0.0202184 ]
 [ 0.83261985]
 [ 0.77815675]
 [ 0.87001215]]


In [47]:
Y = y = 3 * X[:, 0] + 2 + np.random.normal(size=20)

In [48]:
Y

array([ 5.14051958,  3.94040984,  4.12135783,  2.78055381,  0.71797458,
        4.59130093,  4.17719783,  3.93315398,  7.16074291,  1.69595888,
        4.42093363,  3.39950091,  5.2369129 ,  6.24614868,  2.3680556 ,
        2.63955042,  1.17286944,  2.51706307,  3.9865581 ,  4.76638541])

In [49]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept = True)

In [50]:
model.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [51]:
print("Model coefficient: %.5f, and intercept: %.5f"
      % (model.coef_, model.intercept_))

Model coefficient: 3.93491, and intercept: 1.46229


In [55]:
from matplotlib import pyplot as plt

In [63]:
# Plot the data and the model prediction
X_test = np.linspace(0, 1, 100)[:, np.newaxis]
y_test = model.predict(X_test)

# plt.plot(X[:, 0], y, 'o')
# plt.plot(X_test[:, 0], y_test)
# plt.title('Linear regression with a single input variable');

In [1]:
import json, re
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_curve, auc

In [2]:
dfs = {}
for name in ['train', 'test']:
    df = pd.read_json('%s.json' % name)
    df['_data'] = name
    dfs[name] = df

# combine train and test data into one df
df = dfs['train'].append(dfs['test'])
df = df.reset_index(drop=True)

In [3]:
df

Unnamed: 0,_data,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
0,train,,0,1,0,t3_l25d7,0,Hi I am in need of food for my 4 children we a...,Hi I am in need of food for my 4 children we a...,Request Colorado Springs Help Us Please,...,False,[],0,1,0,1,,nickylvst,1317852607,1317849007
1,train,,2,5,0,t3_rcb83,0,I spent the last money I had on gas today. Im ...,I spent the last money I had on gas today. Im ...,"[Request] California, No cash and I could use ...",...,False,"[AskReddit, Eve, IAmA, MontereyBay, RandomKind...",34,4258,116,11168,,fohacidal,1332652424,1332648824
2,train,,0,3,0,t3_lpu5j,0,My girlfriend decided it would be a good idea ...,My girlfriend decided it would be a good idea ...,"[Request] Hungry couple in Dundee, Scotland wo...",...,False,[],0,3,0,3,,jacquibatman7,1319650094,1319646494
3,train,,0,1,1,t3_mxvj3,4,"It's cold, I'n hungry, and to be completely ho...","It's cold, I'n hungry, and to be completely ho...","[Request] In Canada (Ontario), just got home f...",...,False,"[AskReddit, DJs, IAmA, Random_Acts_Of_Pizza]",54,59,76,81,,4on_the_floor,1322855434,1322855434
4,train,,6,6,0,t3_1i6486,5,hey guys:\n I love this sub. I think it's grea...,hey guys:\n I love this sub. I think it's grea...,[Request] Old friend coming to visit. Would LO...,...,False,"[GayBrosWeightLoss, RandomActsOfCookies, Rando...",1121,1225,1733,1887,,Futuredogwalker,1373657691,1373654091
5,train,,3,4,0,t3_14gmeb,0,Feeling under the weather so I called out off ...,Feeling under the weather so I called out off ...,[REQUEST] I'll give a two week xbox live code ...,...,True,"[AdviceAnimals, AskReddit, Autos, IAmA, Random...",234,533,814,1207,shroom,jamespweb,1354911700,1354911700
6,train,,1,2,1342028318,t3_wcw5m,3,We're in Tampa Florida...moving to Ybor on Fri...,We're in Tampa Florida...moving to Ybor on Fri...,[Request] Help me give back to my roomies on F...,...,False,[],0,16,0,28,,usftampa,1341969695,1341966095
7,train,,2,6,0,t3_of16d,6,"(Request) I have given a few things on reddit,...","(Request) I have given a few things on reddit,...","random acts of pizza, i have a request, if not...",...,False,"[AskReddit, IAmA, Music, Random_Acts_Of_Pizza,...",1153,14548,7055,35612,,thebraus,1326429793,1326429793
8,train,,0,1,0,t3_1ioo1k,0,"Wasnt really sure what to put as the title, un...","Wasnt really sure what to put as the title, un...","[Request] Queensland Australia, Recently moved...",...,False,"[AlisonBrie, AskReddit, Games, IAmA, Naruto, N...",189,253,331,475,,The__Doctor__,1374309042,1374305442
9,train,,0,6,1,t3_k0l9j,21,"Austin, Texas\n\nMy two roommates and I are hu...","Austin, Texas\n\nMy two roommates and I are hu...",[REQUEST]We're in need of some om noms...,...,True,[AskReddit],6,26,6,28,shroom,biffle,1314827982,1314824382


In [3]:
# # # limit to shared columns (plus predictor)
cols = list(dfs['test'].columns) + ['requester_received_pizza']
df = df[cols]

list(df.columns)

# #  rename a few columns 
# df.rename(columns={
        
#         'request_title': 'title', 
#         'request_text_edit_aware': 'body',
#         'requester_received_pizza': 'got_pizza',
        
# }, inplace=True)

# # convert got pizza indicator to ints
# df['got_pizza'] = df['got_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))

[u'giver_username_if_known',
 u'request_id',
 u'request_text_edit_aware',
 u'request_title',
 u'requester_account_age_in_days_at_request',
 u'requester_days_since_first_post_on_raop_at_request',
 u'requester_number_of_comments_at_request',
 u'requester_number_of_comments_in_raop_at_request',
 u'requester_number_of_posts_at_request',
 u'requester_number_of_posts_on_raop_at_request',
 u'requester_number_of_subreddits_at_request',
 u'requester_subreddits_at_request',
 u'requester_upvotes_minus_downvotes_at_request',
 u'requester_upvotes_plus_downvotes_at_request',
 u'requester_username',
 u'unix_timestamp_of_request',
 u'unix_timestamp_of_request_utc',
 '_data',
 u'requester_received_pizza']

In [4]:
df['requester_received_pizza'] = df['requester_received_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))

In [5]:
cols_to_keep = ['_data', 'request_id', 'request_title', 'request_text_edit_aware', 'requester_received_pizza']

In [6]:
df = df[cols_to_keep]

In [7]:
df.iloc[:]

Unnamed: 0,_data,request_id,request_title,request_text_edit_aware,requester_received_pizza
0,train,t3_l25d7,Request Colorado Springs Help Us Please,Hi I am in need of food for my 4 children we a...,0
1,train,t3_rcb83,"[Request] California, No cash and I could use ...",I spent the last money I had on gas today. Im ...,0
2,train,t3_lpu5j,"[Request] Hungry couple in Dundee, Scotland wo...",My girlfriend decided it would be a good idea ...,0
3,train,t3_mxvj3,"[Request] In Canada (Ontario), just got home f...","It's cold, I'n hungry, and to be completely ho...",0
4,train,t3_1i6486,[Request] Old friend coming to visit. Would LO...,hey guys:\n I love this sub. I think it's grea...,0
5,train,t3_14gmeb,[REQUEST] I'll give a two week xbox live code ...,Feeling under the weather so I called out off ...,1
6,train,t3_wcw5m,[Request] Help me give back to my roomies on F...,We're in Tampa Florida...moving to Ybor on Fri...,0
7,train,t3_of16d,"random acts of pizza, i have a request, if not...","(Request) I have given a few things on reddit,...",0
8,train,t3_1ioo1k,"[Request] Queensland Australia, Recently moved...","Wasnt really sure what to put as the title, un...",0
9,train,t3_k0l9j,[REQUEST]We're in need of some om noms...,"Austin, Texas\n\nMy two roommates and I are hu...",1


In [8]:
df['text'] = df['request_title'] + '' + df['request_text_edit_aware']

In [9]:
for col in ['request_title', 'request_text_edit_aware', 'text']:
    print(df.iloc[0][col])
    print('--')

Request Colorado Springs Help Us Please
--
Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated
--
Request Colorado Springs Help Us PleaseHi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated
--


In [10]:
from nltk.corpus import stopwords

def clean_txt(raw, remove_stop=False):
    # remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw) 

    # convert to lower case, split into individual words
    words = letters_only.lower().split()                             

    # remove stop words
    stops = set(nltk.stopwords.words("english"))
    words = [w for w in words if not w in stops]
    
    # join cleaned words
    return " ".join(words)

In [11]:
# prep training set data for modeling
# creates numerical arrays for X (bag of words) and y (got pizza)

def get_xy(vectorizer=None, txt_col='text'):
    if vectorizer is None:
        vectorizer = CountVectorizer()
        
    dg = df[df['_data'] == 'train']

    X = vectorizer.fit_transform(dg[txt_col]).toarray()
    y = dg['requester_received_pizza'].astype(int).as_matrix()

    return X, y

In [12]:
X, y = get_xy()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234)

model = MultinomialNB().fit(X_train, y_train)

print("Accuracy on training data: %f" % model.score(X_train, y_train))
print("Accuracy on test data: %f"% model.score(X_test, y_test))

y_pred = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print("AUC: %f" % auc(fpr, tpr))

Accuracy on training data: 0.874587
Accuracy on test data: 0.722772
AUC: 0.498224


In [13]:
# whoa, not very good - an roc auc just a little bit better
# than random chance and a big difference in error rates
# between training and test data implying overfitting

# let's calibrate some params (in vectorizer & classifier)
# and choose values that maximize roc auc

# the grid of params to search over
alphas = [1, 5, 10, 25]
min_dfs = [0.001, 0.01, 0.02, 0.05]

# loop through values to find optimal settings
best_alpha, best_min_df = None, None
max_auc = -np.inf

for alpha in alphas:
    for min_df in min_dfs:
        
        vectorizer = CountVectorizer(min_df = min_df)

        X, y = get_xy(vectorizer)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

        model = MultinomialNB(alpha=alpha).fit(X_train, y_train)

        y_pred = model.predict_proba(X_test)[:, 1]        
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        auc_val = auc(fpr, tpr)

        if auc_val > max_auc:
            max_auc = auc_val
            best_alpha, best_min_df = alpha, min_df 

                
print("alpha: %f" % best_alpha)
print("min_df: %f" % best_min_df)
print("best auc: %f" % max_auc)

alpha: 5.000000
min_df: 0.010000
best auc: 0.593063


In [14]:
# let's make sure this new model is less over-fit

vectorizer = CountVectorizer(min_df = best_min_df)

X, y = get_xy(vectorizer)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234)

model = MultinomialNB(alpha=best_alpha).fit(X_train, y_train)

print("Accuracy on training data: %f" % model.score(X_train, y_train))
print("Accuracy on test data:     %f" % model.score(X_test, y_test))

Accuracy on training data: 0.773597
Accuracy on test data:     0.719802


In [15]:
# finally, let's train on full training set with best params
# and save our predictions for submission

vectorizer = CountVectorizer(min_df = best_min_df)

X, y = get_xy(vectorizer)

model = MultinomialNB(alpha=best_alpha).fit(X, y)

df_test = df[df['_data'] == 'test'].copy()
X_test = vectorizer.transform(df_test['text'])
y_pred = model.predict_proba(X_test)[:, 1]

df_test['requester_received_pizza'] = y_pred
final_df = df_test[['request_id', 'requester_received_pizza']]

# sanity check entries 
print(final_df.head(5))

     request_id  requester_received_pizza
4040   t3_i8iy4                  0.234551
4041  t3_1mfqi0                  0.350563
4042   t3_lclka                  0.346996
4043  t3_1jdgdj                  0.076687
4044   t3_t2qt4                  0.045623


In [17]:
final_df.to_csv('predicted.csv', index=False)

In [29]:
words = np.array(vectorizer.get_feature_names())

x = np.eye(X.shape[1])
probs = model.predict_proba(x)[:, 1]

word_df = pd.DataFrame()
word_df['word'] = words
word_df['P(pizza | word)'] = probs
word_df.sort('P(pizza | word)', ascending=False, inplace=True)

print('successful words')
print(word_df.head(10))
print('\n---\n')
print('unsuccessful words')
print(word_df.tail(10))

successful words
           word  P(pizza | word)
205    exchange         0.441715
445    normally         0.433080
352         jpg         0.426822
627      sunday         0.420380
211      father         0.419746
89        bucks         0.412901
44         aren         0.404114
195  especially         0.404114
17          aid         0.401671
50   assistance         0.395977

---

unsuccessful words
          word  P(pizza | word)
243    friends         0.165136
536  redditors         0.165136
139       cool         0.162870
712     wallet         0.161179
43        area         0.160937
582    sitting         0.160123
621   studying         0.159352
239       free         0.158863
242     friend         0.142384
222      final         0.120445
