# IMPORTS

## Libraries

In [11]:
import pandas as pd
import numpy  as np

import matplotlib.pyplot as plt

from scipy.sparse              import hstack
from sklearn.metrics           import roc_auc_score, average_precision_score
from sklearn.ensemble          import RandomForestClassifier


np.random.seed(0)
%matplotlib inline
pd.set_option('display.max_columns', 200)

# Load Data

In [16]:
dfRaw1 = pd.read_feather('../Data/FeatherData/dfTrainGS.feather')
dfRaw2 = pd.read_feather('../Data/FeatherData/dfActiveLearningGS.feather')

In [17]:
dfRaw1 = dfRaw1.sort_values('UploadDate')
dfRaw2 = dfRaw2.sort_values('UploadDate')

dfRaw1['WatchList'] = dfRaw1['WatchList'].astype(int)
dfRaw2['WatchList'] = dfRaw2['WatchList'].astype(int)

# Active Learning Results

In [18]:
dfRaw2['New'] = 1

In [19]:
dfRaw2.shape

(300, 20)

In [20]:
average_precision_score(dfRaw2['WatchList'], dfRaw2['p']), roc_auc_score(dfRaw2['WatchList'], dfRaw2['p'])

(0.3773090988870055, 0.7380969125817568)

In [21]:
dfTrain = pd.concat([dfRaw1, dfRaw2.drop('p', axis=1)])

In [22]:
dfTrain['New'] = dfTrain['New'].fillna(0)

dfFeatures = pd.DataFrame(index=dfTrain.index)
dfFeatures['ViewCount'] = dfTrain['ViewCount']
dfFeatures['DaysSincePublication'] = dfTrain['DaysSincePublication']
dfFeatures['WatchList'] = dfTrain['WatchList']
dfFeatures['ViewsPerDay'] = dfFeatures['ViewCount'] / dfFeatures['DaysSincePublication']
dfFeatures = dfFeatures.drop('DaysSincePublication', axis=1)

# Split DataFrame into Training and Validation Dataset

In [36]:
maskTrain = (dfTrain['UploadDate'] < '2019-10-23')

maskVal = (dfTrain['UploadDate'] >= '2019-10-23') & (dfTrain['New'] == 0)

Xtrain, Xval = dfFeatures[maskTrain].drop('WatchList', axis=1) , dfFeatures[maskVal].drop('WatchList', axis=1)
ytrain, yval = dfFeatures[maskTrain]['WatchList'], dfFeatures[maskVal]['WatchList']

In [37]:
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((849, 2), (785, 2), (849,), (785,))

# Text Features

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

titleTrain = dfTrain[maskTrain]['Title']
titleVal = dfTrain[maskVal]['Title']

titleVec = TfidfVectorizer(min_df=2)
titleBowTrain = titleVec.fit_transform(titleTrain)
titleBowVal = titleVec.transform(titleVal)

In [39]:
titleBowTrain.shape

(849, 615)

In [40]:
titleBowTrain

<849x615 sparse matrix of type '<class 'numpy.float64'>'
	with 6462 stored elements in Compressed Sparse Row format>

In [41]:
XtrainWTitle = hstack([Xtrain, titleBowTrain])
XvalWTitle = hstack([Xval, titleBowVal])

In [42]:
XtrainWTitle.shape, XvalWTitle.shape

((849, 617), (785, 617))

# Model

## DecisionTreeClassifier

In [43]:
model = RandomForestClassifier(n_estimators=1000, random_state=0, class_weight='balanced', n_jobs=-1)
model.fit(XtrainWTitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=-1,
                       random_state=0)

In [44]:
p = model.predict_proba(XvalWTitle)[:,1]

## Model Evaluate 

In [45]:
average_precision_score(yval,p)

0.4409772530463286

In [46]:
roc_auc_score(yval, p)

0.7967991076471252

# Convert Dataset to .feather

In [48]:
dfTrain = dfTrain.reset_index(drop=True)

In [47]:
dfTrain.to_feather('../Data/FeatherData/dfAllDataLabeled.feather')

ValueError: feather does not support serializing a non-default index for the index; you can .reset_index() to make the index into column(s)