In [1]:
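# Quake Forecast: Predict Time, Location and Magnitude of a Quake
# NOTE(review): the title above looks stale. The cells below train a classifier on
# date features (year, month, day, weekday) and print the days on which a
# "user click" is predicted for a target document; confirm and rename.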
import pandas as pd
import time
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import os
from calendar import Calendar


## 1. Training Set

In [2]:
def get_CSV_FileName(csvName, FolderName):
    """Build the path of a CSV file stored in a folder under the working directory.

    Args:
        csvName: File name of the CSV, e.g. ``'TrainingFeature.csv'``.
        FolderName: Folder that contains the file, resolved against
            ``os.getcwd()``. An absolute folder path is used as-is.

    Returns:
        The joined path as a string. The path is also printed so that it
        shows up in the cell output.
    """
    # os.path.join inserts the platform's separator instead of a hard-coded '/'.
    csvFullName = os.path.join(os.getcwd(), FolderName, csvName)
    print(csvFullName)
    return csvFullName

In [3]:
# Folder, relative to the working directory, that holds the training CSV files.
TrainingDataFolder = 'TrainingDataSets'
# Resolve the feature CSV; get_CSV_FileName also prints the resolved path.
csvTrainingFile = get_CSV_FileName(
    csvName='TrainingFeature.csv',
    FolderName=TrainingDataFolder,
)

/Users/MH/SourceTreeproject/PredictSheetAction/Python_Prediction/eachUser/TrainingDataSets/TrainingFeature.csv


In [4]:
def get_TrainingDF(trainDataFrame, trainFeature):
    """Select ``trainFeature`` from ``trainDataFrame`` with ``[]`` indexing.

    Args:
        trainDataFrame: Object to index; the notebook passes a DataFrame.
        trainFeature: Key passed to ``[]``; the notebook passes either a
            list of column names or a single column name.

    Returns:
        Whatever the indexing operation yields, unchanged.
    """
    selected = trainDataFrame[trainFeature]
    return selected

In [5]:
# Read the feature CSV; index_col=False keeps pandas from using the first
# column as the index.
trainDF = pd.read_csv(csvTrainingFile, index_col=False)
# Date-derived columns used as model inputs.
feature = [
    'query_date_year',
    'query_date_month',
    'query_date_day',
    'query_date_weekday',
]
# get_dummies one-hot encodes non-numeric columns and leaves numeric ones as they are.
data = pd.get_dummies(get_TrainingDF(trainDF, feature))

In [6]:
# trainDF = pd.read_csv(csvTrainingFile, index_col=False )

## 2. Target Set

In [7]:
# Document to model; its name is also the label column read from the target CSV.
targetDoc = 'FMT_POS_190'
# targetDoc = 'report001'

# Target files are named '<document>_Target.csv' inside TrainingDataFolder.
csvTargetFileName = '_Target.csv'
csvTargetFile = get_CSV_FileName(
    csvName=targetDoc + csvTargetFileName,
    FolderName=TrainingDataFolder,
)
targetDF = pd.read_csv(csvTargetFile, index_col=False)
# Label column selected by the document name.
target = get_TrainingDF(targetDF, targetDoc)
# target

/Users/MH/SourceTreeproject/PredictSheetAction/Python_Prediction/eachUser/TrainingDataSets/FMT_POS_190_Target.csv


## 3. Check DataSets

In [8]:
def get_TrainingDataShape(trainDataFrame, targetingDataFrame):
    """Print the ``.shape`` of the feature data (X) and the target data (Y) on one line.

    Args:
        trainDataFrame: Object exposing ``.shape``, reported as X.
        targetingDataFrame: Object exposing ``.shape``, reported as Y.

    Returns:
        None; the function only prints.
    """
    feature_shape = trainDataFrame.shape
    target_shape = targetingDataFrame.shape
    message_parts = (
        'Fearture Data Size (X): ',
        feature_shape,
        ', Target Data Size (Y): ',
        target_shape,
    )
    # Unpacking keeps print's default single-space separator between the parts.
    print(*message_parts)

In [9]:
# Features and labels should report the same number of rows.
get_TrainingDataShape(trainDataFrame=data, targetingDataFrame=target)

Fearture Data Size (X):  (19206, 4) , Target Data Size (Y):  (19206,)


In [10]:
# data = data.reshape(-1,1)
# target = target.reshape(-1,1)

In [11]:
# Preview the first five feature rows.
data.head(n=5)

Unnamed: 0,query_date_year,query_date_month,query_date_day,query_date_weekday
0,2017,2,10,5
1,2017,2,10,5
2,2017,2,10,5
3,2017,2,10,5
4,2017,2,10,5


In [12]:
# Preview the first five labels.
target.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: FMT_POS_190, dtype: int64

In [13]:
# Column labels of the feature frame (the same Index as data.axes[1]).
featureHead = data.columns
print(featureHead)
# targetIndex = featureHead.get_loc('doc_' + targetDoc)
# featureHead[targetIndex]

Index(['query_date_year', 'query_date_month', 'query_date_day',
       'query_date_weekday'],
      dtype='object')


## 4. Classification Model

In [14]:
def set_Training_RandomForest(trainDataFrame, targetingDataFrame,
                              n_estimators=30, test_size=0.30, random_state=30):
    """Fit a random-forest classifier on a train/test split and print its test score.

    Args:
        trainDataFrame: Feature data (X).
        targetingDataFrame: Labels (Y), aligned row-for-row with the features.
        n_estimators: Number of trees in the forest.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed used for both the split and the forest, so that
            re-running the cell reproduces the same model and score.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        trainDataFrame,
        targetingDataFrame,
        test_size=test_size,
        random_state=random_state,
    )
    print('Training Data Size: ')
    get_TrainingDataShape(X_train, y_train)
    print('Test Data Size: ')
    get_TrainingDataShape(X_test, y_test)
    # Without a seed the forest differs on every run, so the printed score
    # and all later predictions would not be reproducible.
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    # score() is mean accuracy. NOTE(review): with rare positive labels a
    # model that always predicts 0 also scores high, so compare against the
    # positive rate before trusting this number.
    print('Test Model Score: ', model.score(X_test, y_test))
    return model

In [15]:
# Train on the date features; the split sizes and the test score are printed.
model = set_Training_RandomForest(trainDataFrame=data, targetingDataFrame=target)

Training Data Size: 
Fearture Data Size (X):  (13444, 4) , Target Data Size (Y):  (13444,)
Test Data Size: 
Fearture Data Size (X):  (5762, 4) , Target Data Size (Y):  (5762,)
Test Model Score:  0.969107948629


In [20]:
# Sum of the labels as a percentage of the row count.
# NOTE(review): this is the positive-class rate only if labels are 0/1; confirm.
print(
    "Click Rate: ",
    target.sum() / len(target) * 100,
    "%",
)

Click Rate:  3.00947620535 %


In [21]:
def get_Date_YearMonthDay(target_year, target_month):
    """List every calendar date in the given month.

    Args:
        target_year: Year, e.g. ``2017``.
        target_month: Month number, 1-12.

    Returns:
        A list of ``datetime.date`` objects, in ascending order, one per day
        of the month.
    """
    days_in_month = []
    # itermonthdates yields complete weeks, so it also returns days from the
    # neighbouring months; keep only the requested month.
    for candidate in Calendar().itermonthdates(target_year, target_month):
        if candidate.month == target_month:
            days_in_month.append(candidate)
    return days_in_month

In [34]:
# def get_PredictInput(predicDate, featureHead, targetDoc):
# #     shiftFeature = 4
#     pre_Data = [predicDate.year, predicDate.month, predicDate.day, predicDate.weekday()+1]
# #     targetIndex = featureHead.get_loc('doc_' + targetDoc)
# # #     print("Target Index: ", targetIndex)
# #     listofzeros = [0] * (len(featureHead) - shiftFeature)
# #     listofzeros[targetIndex - shiftFeature] = 1
#     PredictInput = pre_Data  # for username
#     return PredictInput
def get_PredictInput(predicDate):
    """Turn a date into one model input row.

    Args:
        predicDate: Object with ``year``, ``month``, ``day`` and ``weekday()``,
            such as ``datetime.date``.

    Returns:
        ``[year, month, day, weekday]`` with weekday numbered 1 (Monday)
        to 7 (Sunday).
    """
    # weekday() counts Monday as 0; the feature rows use 1-based weekdays.
    weekday_number = predicDate.weekday() + 1
    return [predicDate.year, predicDate.month, predicDate.day, weekday_number]

In [35]:
# All dates of May 2017; show the first one as a sanity check.
allday = get_Date_YearMonthDay(target_year=2017, target_month=5)
allday[0]

datetime.date(2017, 5, 1)

In [37]:
# Feature row for the first day of the month listed above.
PredictInput = get_PredictInput(predicDate=allday[0])
print("PredictInput: ", PredictInput)

PredictInput:  [2017, 5, 1, 1]


In [44]:
print("Predic Doc Name: " + targetDoc )
# Probability threshold for the alternative check noted inside the loop; the
# default check below does not use it.
ClickDoc_Prob = 0.05

for predic_month in range(12):
    allday = get_Date_YearMonthDay(2017,predic_month+1)
    # Score the whole month with one predict call instead of one call per day.
    month_inputs = [get_PredictInput(day) for day in allday]
    month_predictions = model.predict(month_inputs)
    # Alternative: flag days by probability instead of by predicted class, e.g.
    #     model.predict_proba(month_inputs)[:, 1] >= ClickDoc_Prob
    for day, PredictInput, doc_predict in zip(allday, month_inputs, month_predictions):
        # Report only the days predicted as a non-zero class.
        if doc_predict != 0:
            print(day," weekday: ", day.weekday()+1)
            print("user click: ")


Predic Doc Name: FMT_POS_190


In [33]:
# Class probabilities for the most recent PredictInput, wrapped in a list
# because the model expects a 2-D batch of rows.
model.predict_proba(X=[PredictInput])

array([[ 0.98500714,  0.01499286]])

0

[2017, 1, 31, 2]

In [251]:
print("Predic Doc Name: " + targetDoc )
# Probability threshold kept for experimenting with doc_predict_proba; the
# check below uses the predicted class instead.
ClickDoc_Prob = 0.2

predic_month = 1
allday = get_Date_YearMonthDay(2017, predic_month)

for day in allday:
    # get_PredictInput takes only the date; the old three-argument call
    # raised TypeError against the current definition.
    PredictInput = get_PredictInput(day)
    # The model expects a 2-D batch, so wrap the single row in a list.
    doc_predict = model.predict([PredictInput])
    doc_predict_proba = model.predict_proba([PredictInput])
    if (doc_predict != 0):
        print(day," weekday: ", day.weekday()+1)
        print("user click: ")

Predic Doc Name: FMT_POS_190


