In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing,metrics
import sys

%matplotlib inline

In [64]:
#descriptions of features
!cat data/kaggle_data/features.txt

nb_words_title  Number of words in the article's titles
nb_words_content  Number of words in the article
pp_uniq_words  Proportion of unique words in the article
pp_stop_words  Proportion of stop words (i.e. words predefined to be too common to be of use for interpretation or queries, such as 'the', 'a', 'and', etc.)
pp_uniq_non-stop_words  Proportion of non-stop words among unique words
nb_links  Number of hyperlinks in the article
nb_outside_links  Number of hyperlinks pointing to another website
nb_images  Number of images in the article
nb_videos  Number of videos in the article
ave_word_length  Average word length
nb_keywords  Number of keywords in the metadata
category  Category of the article: 0-Lifestyle, 1-Entertainment, 2-Business, 3-Web, 4-Tech, 5-World
nb_mina_mink  Minimum number of share counts among all articles with at least one keyword in common with the article
nb_mina_maxk  Minimum number of maximum share counts per keyword
nb_mina_avek  Minimum number 

In [65]:
feature_data = pd.read_csv('data/kaggle_data/features.txt', header=None, sep="  ", names=['feature_names', 'feature_description'])
target_data = pd.read_csv('data/kaggle_data/train-targets.csv', sep=",")
target_data.head(5)
y_tr = target_data['Prediction'].values
list_feature_names = list(feature_data['feature_names'])
train_data = pd.read_csv('data/kaggle_data/train.csv', header=None, sep=" ", names=list_feature_names)
train_data.head(5)
test_data = pd.read_csv('data/kaggle_data/test-val.csv', header=None, sep=" ", names=list_feature_names)
test_data.head(5)

  """Entry point for launching an IPython kernel.


Unnamed: 0,nb_words_title,nb_words_content,pp_uniq_words,pp_stop_words,pp_uniq_non-stop_words,nb_links,nb_outside_links,nb_images,nb_videos,ave_word_length,...,pp_neg_words,pp_pos_words_in_nonneutral,ave_polar_pos,min_polar_pos,max_polar_pos,ave_polar_neg,min_polar_neg,max_polar_neg,subj_title,polar_title
0,12,258,0.5745,6.897e-09,0.6897,4.0,2,0,0,4,...,0.01653,0.7143,0.2967,0.1,1.0,-0.2344,-0.3,-0.1875,0.125,0.0
1,8,11,0.0,1.0,0.0,0.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.525,0.3
2,10,263,0.7249,6.623e-09,0.8543,6.0,3,2,0,5,...,0.04701,0.5,0.2617,0.1,1.0,-0.217,-0.5,-0.125,0.0,-0.2
3,13,1281,0.4067,1.422e-09,0.5903,29.0,4,1,1,4,...,0.01512,0.75,0.3585,0.03333,1.0,-0.2403,-0.5,-0.05,0.0,0.0
4,9,107,0.8152,1.538e-08,0.8154,5.0,2,0,0,4,...,0.02151,0.6667,0.4881,0.2857,1.0,-0.8,-1.0,-0.6,0.0,0.0


In [66]:
#Binarizing weekday data into working/non-working days 0-4 weekdays, 5 or 6 weekends
def binarize_weekends(dataframe):
    days = list(dataframe['weekday'])
    is_weekend = [0 if day in [0,1,2,3,4] else 1 for day in days]
    dataframe['weekday']=is_weekend
    return dataframe

#removing strongly correlated features, i.e. pp_uniq_words'<-> 'pp_uniq_non-stop_words'), ('nb_links'<-> 'nb_outside_links'), ('nb_mina_maxk'<-> 'nb_mina_avek')
#dropped_col_list=['pp_uniq_non-stop_words','nb_outside_links','nb_mina_avek']
dropped_col_list=[]

train_data = binarize_weekends(train_data)
test_data = binarize_weekends(test_data)
train_data = train_data.drop(dropped_col_list,axis=1)
test_data = test_data.drop(dropped_col_list,axis=1)
#print(test_data['weekday'])


In [67]:

# Get the weekday data and encode it using a dummy categorical encoding
"""
weekday_data = pd.get_dummies(train_data['weekday'], prefix='weekday', drop_first=True)
other_data = train_data.drop(['weekday'], axis=1)
train_data = pd.concat([weekday_data, other_data], axis=1)

weekday_data = pd.get_dummies(test_data['weekday'], prefix='weekday', drop_first=True)
other_data = test_data.drop(['weekday'], axis=1)
test_data = pd.concat([weekday_data, other_data], axis=1)

"""
category_data = pd.get_dummies(train_data['category'], prefix='cat', drop_first=True)
other_data = train_data.drop(['category'], axis=1)
train_data = pd.concat([category_data, other_data], axis=1)


category_data = pd.get_dummies(test_data['category'], prefix='cat', drop_first=True)
other_data = test_data.drop(['category'], axis=1)
test_data = pd.concat([category_data, other_data], axis=1)


In [68]:
train_data.head(5)

Unnamed: 0,cat_1,cat_2,cat_3,cat_4,cat_5,nb_words_title,nb_words_content,pp_uniq_words,pp_stop_words,pp_uniq_non-stop_words,...,pp_neg_words,pp_pos_words_in_nonneutral,ave_polar_pos,min_polar_pos,max_polar_pos,ave_polar_neg,min_polar_neg,max_polar_neg,subj_title,polar_title
2000,1,0,0,0,0,9,843,0.5358,2.092e-09,0.7469,...,0.01923,0.7143,0.4437,0.03333,1.0,-0.316,-0.8,-0.05,0.0,0.0
2001,0,0,0,0,1,9,805,0.4196,2.165e-09,0.5693,...,0.02571,0.5349,0.3081,0.05,0.8,-0.3463,-0.7143,-0.1,0.9,0.3
2002,0,0,0,1,0,8,145,0.7594,1.163e-08,0.8488,...,0.007519,0.8333,0.3673,0.1364,0.5,-0.2,-0.2,-0.2,0.0,0.0
2003,0,0,0,1,0,12,201,0.6359,9.259e-09,0.8148,...,0.02703,0.7368,0.3721,0.1364,0.6,-0.4,-0.4,-0.4,0.0,0.0
2004,0,0,0,0,1,13,673,0.4609,2.5e-09,0.595,...,0.02144,0.5625,0.35,0.05,0.6,-0.2435,-0.8,-0.1,0.0,0.0


In [69]:
# Cross-validation procedure, with standardization
def cross_validate_regr_with_scaling(design_matrix, labels, regressor, cv_folds):
    """ Perform a cross-validation and returns the predictions. 
    Use a scaler to scale the features to mean 0, standard deviation 1.
    
    Parameters:
    -----------
    design_matrix: (n_samples, n_features) np.array
        Design matrix for the experiment.
    labels: (n_samples, ) np.array
        Vector of labels.
    classifier:  Regressor instance; must have the following methods:
        - fit(X, y) to train the regressor on the data X, y
        - predict_proba(X) to apply the trained regressor to the data X and return predicted values
    cv_folds: sklearn cross-validation object
        Cross-validation iterator.
        
    Return:
    -------
    pred: (n_samples, ) np.array
        Vectors of predictions (same order as labels).
    """
    
    pred = np.zeros(labels.shape)
    pca = decomposition.PCA(n_components=30)
    scaler = preprocessing.StandardScaler()
    pca.fit(design_matrix)
    for tr, te in cv_folds:
        Xtr = scaler.fit_transform(design_matrix[tr,:])
        Xtr = pca.transform(Xtr)
        ytr =  labels[tr]
        Xte = scaler.transform(design_matrix[te,:])
        Xte = pca.transform(Xte)
        regressor.fit(Xtr, ytr)
        pred[te] = (regressor.predict(Xte))    
    return pred

In [70]:
#create folds
from sklearn import cross_validation

X_tr = train_data.values
X_te = test_data.values
print(X_tr.shape)
folds_regr = cross_validation.StratifiedKFold(y_tr,n_folds=10,shuffle=True)


(5000, 47)




In [71]:
from sklearn.model_selection import GridSearchCV

from sklearn import linear_model
param_grid = {'alpha': np.logspace(-2, 4, 10)}
"""
regr_lasso_stand = linear_model.LinearRegression(fit_intercept=True)
ypred_lasso_stand = cross_validate_regr_with_scaling(X_tr, y_tr, regr_lasso_stand, folds_regr)
ypred_lasso_stand = np.where(ypred_lasso_stand>0,ypred_lasso_stand,0)
print(metrics.mean_squared_log_error(y_tr,ypred_lasso_stand))
#print(regr_lasso_stand.predict(X_te))
#print("with standardization; RMSE: ", np.sqrt(metrics.mean_squared_error(y_te,
#                                                                      ypred_lasso_stand)))

np.random.seed(5)
regr_ridge_stand_opt = GridSearchCV(linear_model.Ridge(), param_grid, scoring='neg_mean_squared_error')
ypred_ridge_stand_opt = cross_validate_regr_with_scaling(X_tr,y_tr,regr_ridge_stand_opt,folds_regr)
ypred_ridge_stand_opt = np.where(ypred_ridge_stand_opt>0,ypred_ridge_stand_opt,0)
print("with scaling:", metrics.mean_squared_log_error(y_tr,ypred_ridge_stand_opt), 'alpha: ', regr_lasso_stand_opt.best_params_['alpha'])
"""
np.random.seed(5)
regr_lasso_stand_opt = GridSearchCV(linear_model.Lasso(), param_grid,scoring='neg_mean_squared_error')
ypred_lasso_stand_opt = cross_validate_regr_with_scaling(X_tr, y_tr, regr_lasso_stand_opt, folds_regr)
ypred_lasso_stand_opt = np.where(ypred_lasso_stand_opt>0,ypred_lasso_stand_opt,0)
print("with standardization; RMSE: ", np.sqrt(metrics.mean_squared_log_error(y_tr,
                                                                       ypred_lasso_stand_opt)),'; alpha: ', regr_lasso_stand_opt.best_params_['alpha'])

with standardization; RMSE:  1.05745766251 ; alpha:  100.0


In [72]:
from sklearn import decomposition

std_scale = preprocessing.StandardScaler().fit(X_tr)
X_scaled = std_scale.transform(X_tr)

pca = decomposition.PCA(n_components=52)
pca.fit(X_scaled)
X_projected = pca.transform(X_scaled)
print(pca.explained_variance_ratio_)
plt.bar(np.arange(52), pca.explained_variance_ratio_, color='blue')
plt.xlim([-1, 52])
plt.xlabel("Number of PCs", fontsize=16)
plt.ylabel("Fraction of variance explained", fontsize=16)

ValueError: n_components=52 must be between 0 and n_features=47 with svd_solver='full'

In [None]:
print(max(ypred_lasso_stand_opt))

In [None]:
print(max(y_tr))

In [None]:
pca = decomposition.PCA(n_components=30)
scaler = preprocessing.StandardScaler()
pca.fit(X_tr)
pred = regr_lasso_stand_opt.predict(pca.transform(scaler.fit_transform(X_te)))
print(pred.shape)
pred_int = list(map(int, np.where(pred>0,pred,0)))
print(max(np.array(pred_int)))
print(min(np.array(pred_int)))
pred_df = pd.DataFrame()
pred_df['Prediction'] = pred_int
pred_df.head(5)

In [None]:
pred_df.to_csv('pred.csv')

In [73]:
import math
from sklearn import neighbors
from sklearn import model_selection

#each classis defined as the multiples of 100
def roundup(x):
    return int(math.ceil(x / 100.0)) * 100

def classify_labels(labels):
    return [roundup(x)/100 for x in labels]
    
#y_tr = classify_labels(y_tr)

#making knn predictions
classifier = neighbors.KNeighborsClassifier()
param_grid = {'n_neighbors':range(1,40,2) }
print(param_grid)
clf_knn_opt = model_selection.GridSearchCV(classifier,  param_grid=param_grid, cv=folds_regr)
clf_knn_opt.fit(X_tr,y_tr)
print(clf_knn_opt.best_params_)

{'n_neighbors': range(1, 40, 2)}
{'n_neighbors': 39}


In [86]:



def cross_validate(design_matrix, labels, regressor, cv_folds):
    """ Perform a cross-validation and returns the predictions. 
    Use a scaler to scale the features to mean 0, standard deviation 1.
    
    Parameters:
    -----------
    design_matrix: (n_samples, n_features) np.array
        Design matrix for the experiment.
    labels: (n_samples, ) np.array
        Vector of labels.
    classifier:  Regressor instance; must have the following methods:
        - fit(X, y) to train the regressor on the data X, y
        - predict_proba(X) to apply the trained regressor to the data X and return predicted values
    cv_folds: sklearn cross-validation object
        Cross-validation iterator.
        
    Return:
    -------
    pred: (n_samples, ) np.array
        Vectors of predictions (same order as labels).
    """
    #labels = np.array(labels)
    n_classes = len(clf_knn_opt.classes_)
    pred = np.zeros((labels.shape[0],n_classes))
    print(n_classes)
    for tr, te in cv_folds:
        scaler = preprocessing.StandardScaler()
        Xtr = scaler.fit_transform(design_matrix[tr,:])
        ytr =  labels[tr]
        Xte = scaler.transform(design_matrix[te,:])
        #regressor.fit(Xtr, ytr)
        pred[te, :] = regressor.predict(Xte)
    return pred


ypred_clf_knn_opt = cross_validate(X_tr,y_tr,clf_knn_opt.best_estimator_,folds_regr)
print(ypred_clf_knn_opt.shape)
print(metrics.mean_squared_log_error(y_tr,ypred_clf_knn_opt))
"""
fpr_clf_knn_opt, tpr_clf_knn_opt, thresh = metrics.roc_curve(y_tr,ypred_clf_knn_opt)
knn_h,        = plt.plot(fpr_clf_knn_opt, tpr_clf_knn_opt, 'r-')
knn_auc       = metrics.auc(fpr_clf_knn_opt, tpr_clf_knn_opt)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
"""

1111


ValueError: shape mismatch: value array of shape (560,) could not be broadcast to indexing result of shape (560,1111)

In [82]:
test=clf_knn_opt.predict(X_tr)
print(len(clf_knn_opt.classes_))

1111


In [None]:
print(max(test))