In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('training.csv', names=['Timestamp', 'Hour', 'Ad', 'Browser', 'Platform', 'Region', 'Clicked'])

In [4]:
# Drop 'Browser' and 'Platform'.
df.drop(['Timestamp', 'Browser', 'Platform'], axis=1, inplace=True)

In [5]:
# Dummy Region to binary features
df_log = pd.concat([df, pd.get_dummies(df['Region'])], axis=1).drop('Region', axis=1)

In [6]:
df_log.head()

Unnamed: 0,Hour,Ad,Clicked,Africa,Asia,Australia,Europe,North America,South America
0,5,Candy Smash,1,0,0,0,0,0,1
1,6,Candy Smash,0,0,0,0,1,0,0
2,4,Clash of Tribes,0,0,0,0,0,1,0
3,7,Clash of Tribes,0,0,0,1,0,0,0
4,16,NBA Jam,0,0,0,0,0,1,0


In [7]:
for col in df_log.columns[3:]:
    df_log[col] =  df_log[col]*df_log['Hour']

In [8]:
for col in df_log.columns[3:]:
    df_dummy = pd.get_dummies(df_log[col])
    df_dummy.columns = [col+'_'+str(x) for x in df_dummy.columns]
    df_log = pd.concat([df_log, df_dummy], axis=1).drop(col, axis=1)

In [9]:
overall_scores = []
for ad in df_log.Ad.unique():
    X = df_log[df_log.Ad == ad].drop(['Hour', 'Ad', 'Clicked'], axis=1)
    y = df_log[df_log.Ad == ad]['Clicked']
    
    model = LogisticRegression()
    scores = cross_val_score(model, X, y, cv=5)
    #overall_scores.append(scores.mean())
    overall_scores.extend(scores)
    print ad, scores, scores.mean()

print sum(overall_scores)/len(overall_scores)

Candy Smash [ 0.77217125  0.72588055  0.77794793  0.73353752  0.78713629] 0.759334710183
Clash of Tribes [ 0.73195876  0.74521355  0.77286136  0.74926254  0.73156342] 0.746171925572
NBA Jam [ 0.59641256  0.59641256  0.6038864   0.62874251  0.5952024 ] 0.604131284697
0.703212640151


In [35]:
overall_scores = []
for ad in df_log.Ad.unique():
    X = df_log[df_log.Ad == ad].drop(['Hour', 'Ad', 'Clicked'], axis=1)
    y = df_log[df_log.Ad == ad]['Clicked']
    
    
    C_list = [round(x,3) for x in list(np.linspace(0.0005, 1, 5))]
    param_grid = [{'C': C_list, 'penalty':['l2']}]
    clf = GridSearchCV(LogisticRegression(), param_grid, cv= 3, n_jobs=1)
    clf.fit(X, y)
    overall_scores.append(clf.best_score_)
    print ad, clf.best_score_
    print clf.best_estimator_

print "\nOverall Score:", sum(overall_scores)/len(overall_scores)

Candy Smash 0.759338640539
LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Clash of Tribes 0.750589622642
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
NBA Jam 0.603830041891
LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Overall Score: 0.70458610169


# Testing

In [None]:
ad_models = {}
for ad in df_log.Ad.unique():
    X = df_log[df_log.Ad == ad].drop(['Hour', 'Ad', 'Clicked'], axis=1)
    y = df_log[df_log.Ad == ad]['Clicked']
    
    model = LogisticRegression(penalty='l2')
    ad_models[ad] = model.fit(X, y)
print ad_models

In [None]:
for ad, clf in ad_models.iteritems():
    X = df_log[df_log.Ad == ad].drop(['Hour', 'Ad', 'Clicked'], axis=1)
    y = df_log[df_log.Ad == ad]['Clicked']

In [None]:
df_log.head(3)

In [None]:
df_log['Click_prob'] = np.zeros(len(df_log))
for ad in df_log.Ad.unique():
    X = df_log[df_log.Ad == ad].drop(['Hour', 'Ad', 'Clicked', 'Click_prob'], axis=1)
    pred_prob = ad_models[ad].predict_proba(X)[:,0].reshape(-1, 1)
    df_log[df_log.Ad == ad]['Click_prob'] = pred_prob


In [None]:
for row in df_lterrows():
    print row[1]
    
    

# Tree Models (Non-binary Features)

In [37]:
df_tree = pd.read_csv('training.csv', names=['Timestamp', 'Hour', 'Ad', 'Browser', 'Platform', 'Region', 'Clicked'])

In [38]:
# Create 'Hour' as a float.
# df_tree['Hour'] = df_tree['Timestamp'].apply(lambda x: round(float(x.split(':')[0][-2:]) + \
#     float(x.split(':')[1])/60, 2))
# df_tree.drop('Timestamp', axis=1, inplace=True)

In [39]:
# Create 'Hour' as a float rounded to the closed half hour.
df_tree['Hour'] = df_tree['Timestamp'].apply(lambda x: round((float(x.split(':')[0][-2:]) + \
    float(x.split(':')[1])/60)*2, 0)/2)

In [40]:
df_tree.drop('Timestamp', axis=1, inplace=True)

In [41]:
# Convert Catigorical strings to ints for modeling.
for col in ['Ad', 'Browser', 'Platform', 'Region']:
    catigories = list(df_tree[col].unique())
    df_tree[col] = df_tree[col].apply(lambda x: catigories.index(x))

In [42]:
df_tree.head()

Unnamed: 0,Hour,Ad,Browser,Platform,Region,Clicked
0,5.5,0,0,0,0,1
1,7.0,0,1,0,1,0
2,4.5,1,2,1,2,0
3,7.0,1,3,1,3,0
4,16.5,2,4,2,2,0


In [43]:
for n_estimators in range(1, 15):
    X = df_tree[['Hour', 'Region', 'Ad']]
    y = df_tree['Clicked']

    model = RandomForestClassifier(n_estimators=n_estimators)
    scores = cross_val_score(model, X, y, cv=5)

    print n_estimators, scores.mean()

NameError: name 'RandomForestClassifier' is not defined

In [None]:
train_scores = []
test_scores = []
n_estimators = range(5, 150, 2)

for n_est in n_estimators:
    X = df_tree.drop(['Clicked', 'Platform', 'Browser'], axis=1)
    y = df_tree['Clicked']

    model = GradientBoostingClassifier(n_estimators=n_est, max_depth=4)
    model.fit(X, y)
    train_scores.append(model.score(X, y))
    test_scores.append(cross_val_score(model, X, y, cv=5).mean())
    
    if n_est % 10.0 == 0: print n_est

In [None]:
plt.plot(n_estimators, train_scores, color='red')
plt.plot(n_estimators, test_scores, color='blue')
plt.show()

In [None]:
train_scores = []
test_scores = []
# depths = range(1, 25)

for depth in depths:
    X = df_tree.drop(['Clicked', 'Platform', 'Browser'], axis=1)
    y = df_tree['Clicked']

    model = GradientBoostingClassifier(n_estimators=40, max_depth=depth)
    model.fit(X, y)
    train_scores.append(model.score(X, y))
    test_scores.append(cross_val_score(model, X, y, cv=5).mean())
    
    print depth

In [None]:
plt.plot(depths, train_scores, color='red')
plt.plot(depths, test_scores, color='blue')
plt.show()

In [None]:
test_scores

In [47]:

X = df_tree.drop(['Clicked', 'Platform', 'Browser'], axis=1)
y = df_tree['Clicked']

model = GradientBoostingClassifier(n_estimators=37, max_depth=4)
# model.fit(X, y)
# train_scores.append(model.score(X, y))
test_scores.append(cross_val_score(model, X, y, cv=5).mean())

if n_est % 10.0 == 0: print n_est

NameError: name 'train_scores' is not defined

In [None]:
X = df_tree.drop(['Clicked'], axis=1)
y = df_tree['Clicked']

param_grid = [{'n_estimators': [40, 42, 44, 46, 48], 'max_depth': [4, 5, 6], 'max_features': [2]}]
clf = GridSearchCV(GradientBoostingClassifier(), param_grid, cv= 3, scoring=None, fit_params=None, n_jobs=-1)
clf.fit(X, y)

In [None]:
clf.best_score_

In [None]:
clf.best_estimator_

In [44]:
X = df_tree.drop(['Clicked'], axis=1)
y = df_tree['Clicked']

param_grid = [{'n_estimators': [37], 'max_depth': [5], 'max_features': [2],
              'min_samples_leaf':[1], 'min_samples_split':[2]}]
clf2 = GridSearchCV(GradientBoostingClassifier(), param_grid, cv= 3, scoring=None, fit_params=None, n_jobs=-1)
clf2.fit(X, y)

GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'n_estimators': [37], 'max_features': [2], 'min_samples_split': [2], 'max_depth': [5], 'min_samples_leaf': [1]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [45]:
clf2.best_score_

0.70920000000000005

In [46]:
clf.best_estimator_

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
list(np.linspace(0, 5, 50))

[0.0,
 0.050505050505050504,
 0.10101010101010101,
 0.15151515151515152,
 0.20202020202020202,
 0.25252525252525254,
 0.30303030303030304,
 0.35353535353535354,
 0.40404040404040403,
 0.45454545454545453,
 0.50505050505050508,
 0.55555555555555558,
 0.60606060606060608,
 0.65656565656565657,
 0.70707070707070707,
 0.75757575757575757,
 0.80808080808080807,
 0.85858585858585856,
 0.90909090909090906,
 0.95959595959595956,
 1.0101010101010102,
 1.0606060606060606,
 1.1111111111111112,
 1.1616161616161615,
 1.2121212121212122,
 1.2626262626262625,
 1.3131313131313131,
 1.3636363636363635,
 1.4141414141414141,
 1.4646464646464645,
 1.5151515151515151,
 1.5656565656565655,
 1.6161616161616161,
 1.6666666666666667,
 1.7171717171717171,
 1.7676767676767677,
 1.8181818181818181,
 1.8686868686868687,
 1.9191919191919191,
 1.9696969696969697,
 2.0202020202020203,
 2.0707070707070705,
 2.1212121212121211,
 2.1717171717171717,
 2.2222222222222223,
 2.2727272727272725,
 2.3232323232323231,
 2.37373

In [25]:
[round(x,2) for x in list(np.linspace(0, 5, 10))]

[0.0, 0.56, 1.11, 1.67, 2.22, 2.78, 3.33, 3.89, 4.44, 5.0]

In [None]:
X = df_tree.drop(['Clicked', 'Platform', 'Browser'], axis=1)
y = df_tree['Clicked']

model = GradientBoostingClassifier(n_estimators=37, max_depth=4)
# model.fit(X, y)
# train_scores.append(model.score(X, y))
test_scores.append(cross_val_score(model, X, y, cv=5).mean())