In [1]:
#--Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfTransformer

#--global variables
cv = CountVectorizer()



In [2]:
def createModel(trainigSetPath):
    df = pd.read_csv(trainigSetPath, encoding="latin-1")
    #------------------------------------
    print("# of documents of the trainigSet: ",len(df))
    #------------------------------------
    x = df['message'].values
    y = df['caseName'].values
    x= cv.fit_transform(x) # Fit the Data
    #Multinomial Naive Bayes Classifier
    clf = MultinomialNB()
    clf.fit(x, y)
    
    # alpha -> Additive Laplace smoothing parameter
    # fit_prior ->Whether to learn class prior probabilities or not. If false, a uniform prior will be used
    # class_prior->Prior probabilities of the classes. If specified the priors are not adjusted according to the data.
    MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None) 
    #save model
    joblib.dump(clf, 'MNB_emailPrediction.pkl')

In [3]:
def createTfidf(trainigSetPath):
    df = pd.read_csv(trainigSetPath, encoding="latin-1")
    #------------------------------------
    print("# of documents of the trainigSet: ",len(df))
    #------------------------------------
    docs = df['message'].values
    labels = df['caseName'].values
    word_count_vector=cv.fit_transform(docs)
    word_count_vector.shape
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(word_count_vector)
    # print idf values
    df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["tf_idf_weights"])
    # sort ascending
    df_idf.sort_values(by=['tf_idf_weights'])
   
    #print(df_idf)
    
    # count matrix
    count_vector=cv.transform(docs)
    # tf-idf scores
    tf_idf_vector=tfidf_transformer.transform(count_vector)
    feature_names = cv.get_feature_names()
 
    #get tfidf vector for first document
    first_document_vector=tf_idf_vector[0]

    #print the scores
    df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"])
    df.sort_values(by=["tfidf"],ascending=False)
    print(df)

In [4]:
def loadModel(modelPath, inputPath):
    
    NB_emailPrediction = open(modelPath,'rb')
    clf = joblib.load(NB_emailPrediction)
    input_df = pd.read_csv(inputPath, encoding="latin-1")
    #get input files and do a list
    messageList =list(input_df['message'].values)
    fromList =  list(input_df['from'].values)
    #use model 
    verifier_Wcount = cv.transform(messageList)
    predictions = clf.predict(verifier_Wcount)
    #generate output
    outputDict ={'from': fromList,'message':messageList,'predictions':predictions}
    outputDf = pd.DataFrame.from_dict(outputDict)
    outputDf.to_csv(path_or_buf="MLresult.csv", sep=',',index=True)
    
    #------------------------------------
    for i,j in zip(fromList,predictions):
        print(i, " ->",j)
    #------------------------------------

In [5]:
createModel('CasesToStudy.csv')

# of documents of the trainigSet:  10


In [6]:
#createTfidf('CasesToStudy.csv')

# of documents of the trainigSet:  10
                tfidf
03           0.000000
12           0.191620
1200109816   0.000000
123          0.191620
12331        0.000000
14           0.191620
15           0.162895
2019         0.000000
22           0.000000
2345151      0.191620
234573       0.000000
40           0.000000
43           0.000000
456          0.191620
61           0.191620
62           0.000000
71           0.191620
81           0.000000
8934         0.000000
address      0.000000
all          0.000000
amount       0.000000
any          0.000000
ap           0.000000
at           0.000000
bank         0.000000
be           0.000000
can          0.000000
check        0.000000
cleared      0.000000
...               ...
payables     0.000000
paying       0.000000
payment      0.000000
payments     0.000000
please       0.000000
send         0.000000
sending      0.000000
shown        0.000000
so           0.000000
spreadsheet  0.000000
systme       0.000000
that         0.0

In [7]:
#loadModel('MNB_emailPrediction.pkl','input.csv')

Luis	  -> p2p
Pedro	  -> Statements
A  -> Dispute case and W-9 request
B  -> Dispute case and W-9 request
C  -> Dispute case and W-9 request
D  -> Dispute case and W-9 request
E  -> Dispute case and W-9 request
F  -> p2p
G  -> Dispute case and W-9 request
