# IMPORTS

## Libraries

In [1]:
import os

import pandas as pd
import numpy  as np

import matplotlib.pyplot as plt

from scipy.sparse                    import hstack
from sklearn.metrics                 import roc_auc_score, average_precision_score
from sklearn.ensemble                import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from google.oauth2                   import service_account
from googleapiclient.discovery       import build


# Seed NumPy's global random number generator.
np.random.seed(0)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show up to 200 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 200)

# Load Data

In [None]:
# Read the two Feather files into DataFrames. dfTrain is used below as the
# labeled set (it carries a 'WatchList' column); dfTest is used as the pool of
# unlabeled rows in the active-learning section.
dfTrain = pd.read_feather('../Data/FeatherData/dfTrainGS.feather')
dfTest = pd.read_feather('../Data/FeatherData/dfTestGS.feather')

In [None]:
# Order rows by 'UploadDate' (ascending) so that the positional half/half
# split performed further down separates older rows from newer ones.
dfTrain = dfTrain.sort_values('UploadDate')

# Generating some features to evaluate a simple model

In [None]:
# Empty frame that shares dfTrain's (sorted) index; feature columns are added next.
dfFeatures = pd.DataFrame(index=dfTrain.index)

In [None]:
# Build the numeric features and the target in a single chained expression.
# 'DaysSincePublication' is only needed as the denominator of 'ViewsPerDay',
# so it is dropped again once the ratio has been computed.
# NOTE(review): a row with DaysSincePublication == 0 would yield inf/NaN in
# 'ViewsPerDay' — confirm the data never contains zero here.
dfFeatures = (
    dfFeatures
    .assign(
        ViewCount=dfTrain['ViewCount'],
        DaysSincePublication=dfTrain['DaysSincePublication'],
        WatchList=dfTrain['WatchList'].astype(int),
        ViewsPerDay=lambda frame: frame['ViewCount'] / frame['DaysSincePublication'],
    )
    .drop(columns='DaysSincePublication')
)

In [None]:
# Preview the first rows of the feature frame.
dfFeatures.head()

# Split DataFrame into Training and Validation Dataset

In [None]:
# Plot how many rows share each 'UploadDate' value, to inspect how the
# examples are distributed before splitting them.
dfTrain['UploadDate'].value_counts().plot(figsize=(20, 10))

In [None]:
# Positional split at the midpoint of the date-sorted data: the first half is
# used for training and the second half for validation.
splitIdx = int(round(dfTrain.shape[0]/2, 0))

firstHalf = dfFeatures.iloc[:splitIdx]
secondHalf = dfFeatures.iloc[splitIdx:]

# Features are every column except the 'WatchList' target.
Xtrain = firstHalf.drop(columns='WatchList')
Xval = secondHalf.drop(columns='WatchList')

ytrain = dfFeatures['WatchList'].iloc[:splitIdx]
yval = dfFeatures['WatchList'].iloc[splitIdx:]

In [None]:
# Sanity check: shapes of the training/validation features and targets.
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

# Text Features

In [None]:
# Split the titles at the same positional midpoint that is used for the
# numeric features, so rows of the text matrices line up with Xtrain / Xval.
splitIdx = int(round(dfTrain.shape[0]/2, 0))
titleTrain = dfTrain['Title'].iloc[:splitIdx]
titleVal = dfTrain['Title'].iloc[splitIdx:]

# TF-IDF over the titles; min_df=2 ignores terms found in fewer than two
# training titles. The vocabulary is fitted on the training half only and then
# reused to transform the validation half.
titleVec = TfidfVectorizer(min_df=2)
titleBowTrain = titleVec.fit_transform(titleTrain)
titleBowVal = titleVec.transform(titleVal)

In [None]:
# Shape of the training TF-IDF matrix: (training titles, vocabulary terms).
titleBowTrain.shape

In [None]:
# Display the repr of the training TF-IDF matrix.
titleBowTrain

In [None]:
# Concatenate the numeric feature columns with the TF-IDF title columns
# (column-wise) for the training and validation halves.
XtrainWTitle = hstack([Xtrain, titleBowTrain])
XvalWTitle = hstack([Xval, titleBowVal])

In [None]:
# Sanity check: both stacked matrices must have the same number of columns.
XtrainWTitle.shape, XvalWTitle.shape

# Model

## RandomForestClassifier

In [None]:
# Random forest of 1000 trees; class_weight='balanced' re-weights the classes,
# random_state=0 fixes the forest's randomness and n_jobs=-1 uses all cores.
model = RandomForestClassifier(
    n_estimators=1000,
    class_weight='balanced',
    random_state=0,
    n_jobs=-1,
)
# Train on the stacked numeric + title features of the training half.
model.fit(XtrainWTitle, ytrain)

In [None]:
# Predicted probability of the second class (column 1) for every validation row.
p = model.predict_proba(XvalWTitle)[:,1]

## Model Evaluation

In [None]:
# Average precision of the validation scores against the validation labels.
average_precision_score(yval,p)

In [None]:
# ROC AUC of the validation scores against the validation labels.
roc_auc_score(yval, p)

# ACTIVE LEARNING

- 70 examples that the model has difficulty classifying
- 30 random examples

In [None]:
# Size of the pool that unlabeled rows are drawn from.
dfTest.shape

In [None]:
# Draw up to 800 rows from dfTest to score with the model.
# - random_state makes the draw repeatable when this cell is re-run on its own,
#   instead of depending on how far NumPy's global RNG has advanced.
# - min(...) avoids the ValueError pandas raises when asked for more rows than
#   the frame contains.
N_UNLABELED = 800
dfUnlabeled = dfTest.sample(n=min(N_UNLABELED, len(dfTest)), random_state=0)

In [None]:
# Preview the sampled unlabeled rows.
dfUnlabeled.head()

## Create a New DataFrame for Unlabeled Data

In [None]:
# Empty frame that shares dfUnlabeled's index; feature columns are added next.
dfUnlabeledFeatures = pd.DataFrame(index=dfUnlabeled.index)

In [None]:
# Same numeric features as the training frame (ViewCount, ViewsPerDay), minus
# the target. 'DaysSincePublication' only serves as the denominator of
# 'ViewsPerDay' and is dropped afterwards.
# NOTE(review): a row with DaysSincePublication == 0 would yield inf/NaN in
# 'ViewsPerDay' — confirm the data never contains zero here.
dfUnlabeledFeatures = (
    dfUnlabeledFeatures
    .assign(
        ViewCount=dfUnlabeled['ViewCount'],
        DaysSincePublication=dfUnlabeled['DaysSincePublication'],
        ViewsPerDay=lambda frame: frame['ViewCount'] / frame['DaysSincePublication'],
    )
    .drop(columns='DaysSincePublication')
)

## Text Features

In [None]:
# Independent copy of the numeric features to use as the model input.
XUnlabeled = dfUnlabeledFeatures.copy()

In [None]:
# Vectorize the unlabeled titles with the vectorizer fitted on the training
# titles (transform only, no refit), so the columns match the model's input.
titleUnlabeled = dfUnlabeled['Title']
titleUnlabeledBow = titleVec.transform(titleUnlabeled)

In [None]:
# Column-wise concatenation of numeric features and TF-IDF title features.
XUnlabeledWTitle = hstack([XUnlabeled, titleUnlabeledBow])

In [None]:
# Displays the repr of the *training* matrix.
# NOTE(review): this section builds XUnlabeledWTitle, so that may be the object
# intended here (e.g. to compare column counts) — confirm.
XtrainWTitle

## Score the Unlabeled Data to Filter Hard Decisions

In [None]:
# Predicted probability of the second class (column 1) for every unlabeled row.
pu = model.predict_proba(XUnlabeledWTitle)[:,1]

In [None]:
# Store the model score next to each unlabeled row (new column 'p').
dfUnlabeled['p'] = pu

### Filter Hard Decisions and Random Decisions

In [None]:
# Flag rows whose score lies in the closed interval [0.38, 0.62], i.e. the
# predictions closest to 0.5. Series.between is inclusive on both ends.
maskUnlabeled = dfUnlabeled['p'].between(0.38, 0.62)
# Number of rows flagged.
maskUnlabeled.sum()

In [None]:
# Hard decisions: every unlabeled row selected by maskUnlabeled.
hardDecisionSample = dfUnlabeled[maskUnlabeled]

# Random decisions: a random draw from the remaining rows.
# - random_state makes the draw repeatable when this cell is re-run on its own.
# - min(...) avoids the ValueError pandas raises when fewer than N_RANDOM rows
#   fall outside the mask.
# NOTE(review): the section description mentions 30 random examples while the
# code draws 31 — confirm which number is intended.
N_RANDOM = 31
remainingRows = dfUnlabeled[~maskUnlabeled]
randomSample = remainingRows.sample(n=min(N_RANDOM, len(remainingRows)), random_state=0)

In [None]:
# Stack the hard-decision rows and the random rows into one frame to be labeled.
dfActiveLearning = pd.concat([hardDecisionSample, randomSample])

# Send to Google Sheets

In [None]:
# Prepare the payload for the spreadsheet: 'UploadDate' is converted to text,
# 'WatchList' is blanked, and the frame is turned into a list of row lists.
# After this cell dfActiveLearning is a plain Python list, not a DataFrame.
dfActiveLearning = (
    dfActiveLearning
    .assign(
        UploadDate=lambda frame: frame['UploadDate'].astype(str),
        WatchList='',
    )
    .values
    .tolist()
)

In [None]:
# Documentation: https://developers.google.com/sheets/api/quickstart/python

# Location of the service-account key file. Set the GOOGLE_SERVICE_ACCOUNT_FILE
# environment variable to point at the key file on the current machine; when it
# is not set, the original local path is used as a fallback.
SERVICE_ACCOUNT_FILE = os.environ.get(
    'GOOGLE_SERVICE_ACCOUNT_FILE',
    'D:/01-DataScience/04-Projetos/00-Git/Youtube-Video-Recommendations/Credentials/keys.json',
)
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# Build service-account credentials limited to the spreadsheets scope.
credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES)


# The ID of spreadsheet.
SAMPLE_SPREADSHEET_ID = '1uCur7jOXuLnwuwfWgoBL8mvDDvchuLf-o0X-AnOxS7s'

service = build('sheets', 'v4', credentials=credentials)

# Call the Sheets API
sheet = service.spreadsheets()


# Write Values
# Writes the rows starting at cell A2 of the "ActiveLearning" tab.
# USER_ENTERED makes Sheets parse the values as if they were typed in the UI.
# Note: despite its name, `request` holds the result returned by .execute().
request = sheet.values().update(spreadsheetId=SAMPLE_SPREADSHEET_ID,
                               range="ActiveLearning!A2", valueInputOption="USER_ENTERED", body={"values":dfActiveLearning}).execute()