In [1]:
import pandas as pd

nyt_train = pd.read_csv('NYTimesBlogTrain.csv')
nyt_test = pd.read_csv('NYTimesBlogTest.csv')

nyt_full = pd.concat((nyt_train,nyt_test))
nyt_full.head()

Unnamed: 0,Abstract,Headline,NewsDesk,Popular,PubDate,SectionName,Snippet,SubsectionName,UniqueID,WordCount
0,A puzzle from Ethan Cooper that reminds me tha...,More School Daze,Business,1.0,2014-09-01 22:00:09,Crosswords/Games,A puzzle from Ethan Cooper that reminds me tha...,,1,508
1,The Strange Library will arrive just three and...,New 96-Page Murakami Work Coming in December,Culture,0.0,2014-09-01 21:14:07,Arts,The Strange Library will arrive just three and...,,2,285
2,Public pension funds have major stakes in Amer...,Public Pension Funds Stay Mum on Corporate Expats,Business,0.0,2014-09-01 21:05:36,Business Day,Public pension funds have major stakes in Amer...,Dealbook,3,1211
3,As they struggle to find new business to bolst...,Boot Camp for Bankers,Business,1.0,2014-09-01 20:43:34,Business Day,As they struggle to find new business to bolst...,Dealbook,4,1405
4,Middle-aged and older patients are unlikely to...,Of Little Help to Older Knees,Science,1.0,2014-09-01 18:58:51,Health,Middle-aged and older patients are unlikely to...,,5,181


In [2]:
#Adding frequency of headline as a feature
nyt_full["HeadlineFreq"] = nyt_full.Headline.map(nyt_full.Headline.value_counts())

#Adding number of words in the headline as a feature
nyt_full['HeadlineWords'] = nyt_full.Headline.apply(lambda x: len(str(x).split(' ')))

#Adding number of words in the abstract as a feature
nyt_full['AbstractWords'] = nyt_full.Abstract.apply(lambda x: len(str(x).split(' ')))

#Adding number of question/exclamation mark as a feature
nyt_full['new2']=nyt_full[['Headline']].applymap(lambda x: str.count(x, '?'))
nyt_full['new3']=nyt_full[['Headline']].applymap(lambda x: str.count(x, '!'))
nyt_full['HeadlineProps']= nyt_full.new2 + nyt_full.new3


In [3]:
nyt2 = nyt_full[['NewsDesk','SectionName','SubsectionName','WordCount','HeadlineWords','AbstractWords','HeadlineFreq', 'HeadlineProps']]
nyt2.head()

Unnamed: 0,NewsDesk,SectionName,SubsectionName,WordCount,HeadlineWords,AbstractWords,HeadlineFreq,HeadlineProps
0,Business,Crosswords/Games,,508,3,13,1,0
1,Culture,Arts,,285,7,24,1,0
2,Business,Business Day,Dealbook,1211,8,31,1,0
3,Business,Business Day,Dealbook,1405,4,23,1,0
4,Science,Health,,181,6,33,1,0


In [4]:
import numpy as np

#Function to map non-numerical categorical data to its numerical counterparts

def handle_non_numerical_data(df):
    columns = df.columns.values
    
    for column in columns:
        text_digit_vals={}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            x=0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1
                    
            df[column] = list(map(convert_to_int, df[column]))
    return df

nyt3 = handle_non_numerical_data(nyt2)
nyt3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,NewsDesk,SectionName,SubsectionName,WordCount,HeadlineWords,AbstractWords,HeadlineFreq,HeadlineProps
0,9,2,0,508,3,13,1,0
1,12,14,0,285,7,24,1,0
2,9,3,2,1211,8,31,1,0
3,9,3,2,1405,4,23,1,0
4,10,8,0,181,6,33,1,0


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
simple_train = nyt_full['Abstract']
simple_trainSM = vect.fit_transform(simple_train.fillna(" "))
vect.get_feature_names()

['000',
 '006',
 '010',
 '02',
 '03',
 '08',
 '09',
 '10',
 '100',
 '100000003149882',
 '100000003149892',
 '100th',
 '101',
 '105',
 '106',
 '107',
 '108',
 '10th',
 '10x',
 '11',
 '110',
 '111',
 '113th',
 '114',
 '114th',
 '115',
 '1159',
 '117',
 '11th',
 '12',
 '120',
 '122',
 '123',
 '125',
 '127',
 '129',
 '12th',
 '13',
 '130',
 '13th',
 '14',
 '140',
 '140th',
 '142',
 '147',
 '149',
 '14th',
 '15',
 '150',
 '150th',
 '159',
 '15th',
 '16',
 '160',
 '161',
 '164',
 '165th',
 '169',
 '16th',
 '17',
 '171',
 '174',
 '175',
 '17th',
 '18',
 '180',
 '1800',
 '1803',
 '1812',
 '1821',
 '1822',
 '1850',
 '1851',
 '1859',
 '186',
 '1860s',
 '1862',
 '1863',
 '1864',
 '187',
 '188',
 '1888',
 '1889',
 '1892',
 '18th',
 '19',
 '1909',
 '191',
 '1912',
 '1914',
 '1916',
 '1918',
 '1924',
 '1936',
 '1938',
 '1939',
 '194',
 '1940',
 '1943',
 '1944',
 '1948',
 '1950s',
 '1951',
 '1952',
 '1953',
 '1955',
 '1956',
 '1958',
 '1959',
 '196',
 '1960',
 '1960s',
 '1961',
 '1962',
 '1963',
 '19

In [6]:
from scipy.sparse import csr_matrix, hstack

nyt3SM= csr_matrix(nyt3.values)

SM = hstack((nyt3SM, simple_trainSM), format="csr")
SM

<8402x18674 sparse matrix of type '<class 'numpy.int64'>'
	with 209669 stored elements in Compressed Sparse Row format>

In [7]:
train_index = nyt_full[~nyt_full["Popular"].isnull()].index
train_index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            6522, 6523, 6524, 6525, 6526, 6527, 6528, 6529, 6530, 6531],
           dtype='int64', length=6532)

In [8]:
test_index = nyt_full[nyt_full["Popular"].isnull()].index
test_index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1860, 1861, 1862, 1863, 1864, 1865, 1866, 1867, 1868, 1869],
           dtype='int64', length=1870)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier()
y= nyt_train.Popular

param_grid = {'n_estimators': [200, 700],'max_features': ['auto', 'sqrt', 'log2']}

gr = GridSearchCV(rfc, param_grid, scoring="roc_auc", n_jobs=-1, cv=4)

gr.fit(SM[train_index, :], y)

GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [200, 700], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [10]:
gr.best_score_

0.92110432007358178

In [11]:
gr.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=700, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [12]:
probability = gr.best_estimator_.predict_proba(SM[test_index,:])[:,1]
probability

array([ 0.86714286,  0.02857143,  0.05857143, ...,  0.        ,
        0.01571429,  0.14571429])

In [13]:
results = nyt_test[["UniqueID"]].loc[test_index]
results["Probability"] = probability
results

Unnamed: 0,UniqueID,Probability
0,6533,0.867143
1,6534,0.028571
2,6535,0.058571
3,6536,0.722857
4,6537,0.738571
5,6538,0.754286
6,6539,0.084286
7,6540,0.805714
8,6541,0.770000
9,6542,0.059143


In [14]:
results.to_csv("results.csv", index=False)