In [2]:
# Document click prediction: estimate, per calendar day, the probability that a user clicks a given document
import pandas as pd
import time
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import os
from calendar import Calendar


## 1. Training Set

In [116]:
def get_CSV_FileName(csvName, FolderName):
    """Build the path of a CSV file stored in a sub-folder of the working directory.

    Parameters
    ----------
    csvName : str
        File name of the CSV, e.g. ``'TrainingFeature.csv'``.
    FolderName : str
        Folder, relative to the current working directory, that holds the file.

    Returns
    -------
    str
        The working directory, ``FolderName`` and ``csvName`` joined with the
        platform's path separator. The path is also printed so the notebook
        output records which file was used.
    """
    # os.path.join picks the correct separator for the platform and avoids a
    # doubled separator when the working directory is the filesystem root.
    csvFullName = os.path.join(os.getcwd(), FolderName, csvName)
    print(csvFullName)
    return csvFullName

In [117]:
# Folder (relative to the working directory) that holds the feature and target CSVs.
TrainingDataFolder = 'TrainingDataSets'
# Resolve the path of the feature CSV; get_CSV_FileName also prints it.
csvTrainingFile = get_CSV_FileName('TrainingFeature.csv', TrainingDataFolder)

/Users/MH/SourceTreeproject/PredictSheetAction/Python_Prediction/eachUser/TrainingDataSets/TrainingFeature.csv


In [118]:
def get_TrainingDF(trainDataFrame, trainFeature):
    """Return the part of ``trainDataFrame`` selected by ``trainFeature``.

    This is a thin wrapper around ``trainDataFrame[trainFeature]``: whatever
    the container's indexing returns for that key (a single column name or a
    list of column names in this notebook) is handed back unchanged.
    """
    selected = trainDataFrame[trainFeature]
    return selected

In [119]:
# Load the raw feature table; index_col=False tells pandas not to use the
# first CSV column as the row index.
trainDF = pd.read_csv(csvTrainingFile, index_col=False )
# Columns used as model inputs: the user name plus the date parts of the query.
feature = ['username',  'query_date_year' , 'query_date_month'  ,'query_date_day', 'query_date_weekday'  ]
# One-hot encode the selected columns. In the recorded run the only
# non-numeric column, 'username', became a single 'username_mPOS' column.
data = pd.get_dummies(get_TrainingDF(trainDF,feature))
# data

## 2. Target Set

In [120]:
# Name of the document to model. It is used both as the prefix of the target
# CSV file name and as the name of the label column read from that file.
targetDoc = 'FMT_POS_190'
# Alternative target document:
# targetDoc = 'report001'

# Target files are named '<targetDoc>_Target.csv' and live in the same folder
# as the feature CSV.
csvTargetFileName = '_Target.csv'
csvTargetFile = get_CSV_FileName(targetDoc + csvTargetFileName, TrainingDataFolder)
targetDF = pd.read_csv(csvTargetFile, index_col=False )
# Label series: the column of targetDF named after the document.
target = get_TrainingDF(targetDF, targetDoc)
# target

/Users/MH/SourceTreeproject/PredictSheetAction/Python_Prediction/eachUser/TrainingDataSets/FMT_POS_190_Target.csv


## 3. Check DataSets

In [135]:
def get_TrainingDataShape(trainDataFrame, targetingDataFrame):
    """Print the ``.shape`` of the feature set (X) and of the target set (Y).

    Both arguments only need to expose a ``shape`` attribute. Nothing is
    returned; the two shapes are written to standard output on one line.
    """
    feature_shape = trainDataFrame.shape
    target_shape = targetingDataFrame.shape
    print('Fearture Data Size (X): ', feature_shape, ', Targeting Data Size (Y): ', target_shape)

In [122]:
# Sanity check: print the shapes of the encoded features and of the labels so
# a row-count mismatch between the two CSVs is visible before training.
get_TrainingDataShape(data, target)

Training Data Size:  (19206, 5) , Targeting Data Size: (19206,)


In [123]:
# Preview the first five rows of the encoded feature table.
data.head(5)

Unnamed: 0,query_date_year,query_date_month,query_date_day,query_date_weekday,username_mPOS
0,2017,2,10,5,1
1,2017,2,10,5,1
2,2017,2,10,5,1
3,2017,2,10,5,1
4,2017,2,10,5,1


In [124]:
# Preview the first five labels of the target column.
target.head(5)

0    0
1    0
2    0
3    0
4    0
Name: FMT_POS_190, dtype: int64

In [126]:
# Column labels of the encoded feature table (the frame's second axis);
# later cells index into this to name the user feature in printed output.
featureHead = data.columns
# print(featureHead)

Index(['query_date_year', 'query_date_month', 'query_date_day',
       'query_date_weekday', 'username_mPOS'],
      dtype='object')


## 4. Classification Model

In [138]:
def set_Training_RandomForest(trainDataFrame, targetingDataFrame,
                              test_size=0.30, random_state=42, n_estimators=20):
    """Fit a random-forest classifier on a train split and report hold-out accuracy.

    Parameters
    ----------
    trainDataFrame : array-like
        Feature matrix (X).
    targetingDataFrame : array-like
        Labels (Y), one per row of ``trainDataFrame``.
    test_size : float, optional
        Fraction of the rows held out for scoring. Defaults to 0.30.
    random_state : int or None, optional
        Seed used for BOTH the train/test split and the forest itself, so that
        re-running the notebook reproduces the same model and score.
        Defaults to 42.
    n_estimators : int, optional
        Number of trees in the forest. Defaults to 20.

    Returns
    -------
    RandomForestClassifier
        The classifier fitted on the training split.

    Notes
    -----
    Prints the shapes of both splits and the mean accuracy on the held-out
    split (``model.score``).
    NOTE(review): accuracy can look high when one class dominates the labels;
    confirm the class balance before relying on this number.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        trainDataFrame, targetingDataFrame,
        test_size=test_size, random_state=random_state)
    print('Training Data Size: ')
    get_TrainingDataShape(X_train, y_train)
    print('Test Data Size: ')
    get_TrainingDataShape(X_test, y_test)
    # Seed the forest as well: without random_state the bootstrap samples and
    # feature sub-sampling differ on every run, so the score is not repeatable.
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    print('Test Model Score: ',model.score(X_test, y_test) )
    return model

In [139]:
# Train the classifier on the encoded features and the labels of the target
# document; the helper prints the split sizes and the hold-out score.
model  = set_Training_RandomForest(data, target)

Training Data Size: 
Fearture Data Size (X):  (13444, 5) , Targeting Data Size (Y):  (13444,)
Test Data Size: 
Fearture Data Size (X):  (5762, 5) , Targeting Data Size (Y):  (5762,)
Test Model Score:  0.972231863936


In [141]:
def get_Date_YearAndMonth(target_year, target_month):
    """List every calendar date of the given month, in chronological order.

    ``Calendar.itermonthdates`` yields whole weeks, so it also produces the
    trailing days of the previous month and the leading days of the next one;
    those are dropped here.

    Returns a list of ``datetime.date`` objects.
    """
    days_in_month = []
    for candidate in Calendar().itermonthdates(target_year, target_month):
        if candidate.month == target_month:
            days_in_month.append(candidate)
    return days_in_month

In [145]:
# Example call: all dates of March 2017.
# NOTE(review): the prediction loop further down reassigns `allday` for every
# month, so this value is not what the loop iterates over.
allday = get_Date_YearAndMonth(2017,3)

In [146]:
# Score every day of the year for the single encoded user and report the days
# on which the model gives a click probability of at least ClickDoc_Prob.
print("Predic Doc Name: " + targetDoc )
ClickDoc_Prob = 0.2      # report a day once P(click) reaches this threshold
PREDICT_YEAR = 2017      # calendar year to score
USER_FEATURE_INDEX = 4   # position of the one-hot user column in featureHead

for predic_month in range(1, 13):
    # Reuse the helper instead of repeating the Calendar filtering inline.
    allday = get_Date_YearAndMonth(PREDICT_YEAR, predic_month)

    for day in allday:
        query_date_year = day.year
        query_date_month = day.month
        query_date_day = day.day
        # Monday=1 ... Sunday=7 (date.weekday() is 0-based).
        query_date_weekday = day.weekday()+1
        pre_Data = [query_date_year,query_date_month,query_date_day,query_date_weekday]

        # Click counter for this day; reset for each day.
        predicAll = 0

        # Append 1 for the one-hot user column so the row matches the order of
        # the training columns (four date parts, then the user flag).
        predicData = pre_Data + [1]
        doc_predict = model.predict([predicData])
        doc_predict_proba = model.predict_proba([predicData])
        # Column 1 of predict_proba is taken as the "click" class.
        # NOTE(review): this assumes the labels are exactly {0, 1} - confirm.
        if doc_predict_proba[0][1] >= ClickDoc_Prob:
            print(day," weekday: ", query_date_weekday)
            print("user feature: ", featureHead[USER_FEATURE_INDEX], " click prob: ", doc_predict_proba)
        # predict() returns a one-element array; compare its scalar explicitly.
        if doc_predict[0] != 0:
            # The original indexed featureHead with an undefined name `i`,
            # which raised NameError as soon as a click was predicted.
            print("user click: ", featureHead[USER_FEATURE_INDEX])
            predicAll = predicAll + doc_predict[0]


Predic Doc Name: FMT_POS_190
2017-01-02  weekday:  1
user feature:  username_mPOS  click prob:  [[ 0.78897209  0.21102791]]
2017-02-06  weekday:  1
user feature:  username_mPOS  click prob:  [[ 0.75230542  0.24769458]]
2017-02-22  weekday:  3
user feature:  username_mPOS  click prob:  [[ 0.79131463  0.20868537]]
2017-03-06  weekday:  1
user feature:  username_mPOS  click prob:  [[ 0.70931823  0.29068177]]
2017-03-09  weekday:  4
user feature:  username_mPOS  click prob:  [[ 0.76952567  0.23047433]]
2017-04-11  weekday:  2
user feature:  username_mPOS  click prob:  [[ 0.7840344  0.2159656]]
2017-04-12  weekday:  3
user feature:  username_mPOS  click prob:  [[ 0.59829363  0.40170637]]
2017-06-14  weekday:  3
user feature:  username_mPOS  click prob:  [[ 0.73139286  0.26860714]]
2017-07-11  weekday:  2
user feature:  username_mPOS  click prob:  [[ 0.7840344  0.2159656]]
2017-07-12  weekday:  3
user feature:  username_mPOS  click prob:  [[ 0.59829363  0.40170637]]
2017-09-12  weekday:  2
u

In [60]:
# Ad-hoc check: class probabilities for whatever feature row the prediction
# loop last stored in `predicData`.
# NOTE(review): this cell depends on state left behind by the loop, and its
# execution count is lower than the loop cell's, so the recorded output may
# come from an earlier kernel state - re-run after the loop to confirm.
model.predict_proba([predicData])

array([[ 0.96961538,  0.03038462]])