In [70]:
#import all packages
import pandas as pd
import numpy as np
import fasttext
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

import plotly
import plotly.graph_objects as go

In [2]:
def get_train_test_set(data_path):
    '''
    Load the manually checked posts and split them into train/test sets.

    data_path: path of a CSV file with a 'Text' column and an
        'outdated (manually checked)' label column.

    Rows whose label is missing are dropped. The remaining rows are split
    80/20 with a fixed random_state of 42.

    returns: (X_train, X_test, y_train, y_test), four lists.
    '''
    label_col = 'outdated (manually checked)'
    posts = pd.read_csv(data_path, lineterminator='\n')

    # Keep only the rows that actually carry a manual label.
    labelled = posts.loc[posts[label_col].notna(), ['Text', label_col]]

    texts = list(labelled['Text'])
    labels = list(labelled[label_col])
    return tuple(train_test_split(texts, labels, test_size=0.2, random_state=42))

In [3]:
# Train/test split built from the balanced dataset.
# NOTE(review): the cells below assign X_train/X_test/y_train/y_test again,
# so this result only matters if those cells are skipped.
(X_train, X_test,
 y_train, y_test) = get_train_test_set(data_path='./balanced_data.csv')

In [4]:
# Train/test split built from the pipeline CSV.
data_path = './Amazon_capstone_project/data/pipeline/952_rows_40_columns.csv'
(X_train, X_test,
 y_train, y_test) = get_train_test_set(data_path=data_path)

In [24]:
def combine_data(comment_path='./Amazon_capstone_project/data/shared/commentTestData.csv',
                 answer_path='./Amazon_capstone_project/data/shared/AnswerTestData_1.csv'):
    '''
    Read the comment and answer test files and concatenate them.

    comment_path: tab-separated file; malformed lines are skipped with a warning.
    answer_path: comma-separated file, decoded as ISO-8859-1.
    Both files must provide 'Text', 'outdated (manually checked)' and 'Id' columns.

    returns: (text, label, Id), three lists with the comment rows first,
        followed by the answer rows.
    '''
    try:
        # pandas >= 1.3: on_bad_lines='warn' is the equivalent of the removed
        # error_bad_lines=False (with the default warn_bad_lines=True).
        data_comment = pd.read_csv(comment_path, on_bad_lines='warn', sep='\t')
    except TypeError:
        # Older pandas does not know on_bad_lines; use the legacy keyword.
        data_comment = pd.read_csv(comment_path, error_bad_lines=False, sep='\t')
    data_answer = pd.read_csv(answer_path, encoding="ISO-8859-1")

    label_col = 'outdated (manually checked)'
    text = list(data_comment['Text']) + list(data_answer['Text'])
    label = list(data_comment[label_col]) + list(data_answer[label_col])
    Id = list(data_comment['Id']) + list(data_answer['Id'])
    return text, label, Id

In [25]:
# Merge the comment and answer test sets, then hold out 20% of the rows
# for evaluation (fixed seed so the split is repeatable).
text, label, Id = combine_data()
(X_train, X_test,
 y_train, y_test) = train_test_split(text, label, random_state=42, test_size=0.2)

## Model

In [20]:
def NB_model(X_train, X_test, y_train, y_test):
    '''
    Fit a multinomial Naive Bayes classifier on TF-IDF features and evaluate it.

    X_train, X_test: iterables of raw text documents.
    y_train, y_test: the matching labels.

    returns: (accuracy, f1score, proba), where accuracy is clf.score on the
        test set, f1score is f1_score of the test predictions and proba is
        the predict_proba output for the test set.
    '''
    vectorizer = CountVectorizer()
    tfidf = TfidfTransformer()

    # Vocabulary and IDF weights are learned from the training texts only;
    # the test texts are projected with the already-fitted transformers.
    train_features = tfidf.fit_transform(vectorizer.fit_transform(X_train))
    test_features = tfidf.transform(vectorizer.transform(X_test))

    classifier = MultinomialNB()
    classifier.fit(train_features, y_train)

    class_probabilities = classifier.predict_proba(test_features)
    test_accuracy = classifier.score(test_features, y_test)
    test_f1 = f1_score(y_test, classifier.predict(test_features))
    return test_accuracy, test_f1, class_probabilities

In [8]:
def LogisticRegression_model(X_train, X_test, y_train, y_test):
    '''
    Grid-search a logistic regression on TF-IDF features and evaluate it.

    X_train, X_test: iterables of raw text documents.
    y_train, y_test: the matching labels.

    The regularisation strength C is chosen by GridSearchCV (default
    cross-validation and scoring) among [1, 5, 20, 50, 100, 120, 150].

    returns: (accuracy, f1score, proba), where accuracy is the grid search's
        score on the test set, f1score is f1_score of the test predictions
        and proba is the predict_proba output for the test set.
    '''
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()

    # Fit the text transformers on the training texts only, then reuse them
    # unchanged for the test texts.
    X_train_tfidf = tfidf_transformer.fit_transform(count_vect.fit_transform(X_train))
    X_test_tfidf = tfidf_transformer.transform(count_vect.transform(X_test))

    parameters = {'C': [1, 5, 20, 50, 100, 120, 150]}
    # GridSearchCV clones the estimator for every candidate and refits the
    # best one itself, so fitting the base estimator beforehand is wasted work.
    LRGS = GridSearchCV(LogisticRegression(C=20), parameters)
    LRGS.fit(X_train_tfidf, y_train)

    proba = LRGS.predict_proba(X_test_tfidf)
    accuracy = LRGS.score(X_test_tfidf, y_test)
    f1score = f1_score(y_test, LRGS.predict(X_test_tfidf))
    return accuracy, f1score, proba

In [12]:
def fasttext_model(X_train, X_test, y_train, y_test):
    '''
    Train a supervised fastText classifier and evaluate it on the test texts.

    X_train, X_test: lists of text strings.
    y_train, y_test: the matching labels; each is written as '__label__<label>'.

    Side effect: writes processed_posts_train.txt and processed_posts_test.txt
    under ./Amazon_capstone_project/data/shared/, one "<label> <text>" line
    per document.

    returns: (result[1], predictions), where result is the tuple returned by
        model.test on the test file (per the fastText Python API its second
        item is precision at 1) and predictions holds, for every test text,
        the probability fastText assigns to its top predicted label.
    '''
    train_path = './Amazon_capstone_project/data/shared/processed_posts_train.txt'
    test_path = './Amazon_capstone_project/data/shared/processed_posts_test.txt'

    # fastText reads one document per line, so newlines and tabs inside a
    # text are flattened to spaces.
    X_train = [s.replace('\n', ' ').replace('\t', ' ') for s in X_train]
    X_test = [s.replace('\n', ' ').replace('\t', ' ') for s in X_test]
    y_train = ['__label__' + str(s) for s in y_train]
    y_test = ['__label__' + str(s) for s in y_test]

    for path, labels, texts in ((train_path, y_train, X_train),
                                (test_path, y_test, X_test)):
        # No quoting, space as separator: produces plain "<label> <text>" lines.
        pd.DataFrame({'label': labels, 'text': texts}).to_csv(
            path, index=False, sep=' ', header=False,
            quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

    model = fasttext.train_supervised(input=train_path)
    result = model.test(test_path)

    predictions = []
    for text in X_test:
        # predict() returns (labels, probabilities); take the single top-1
        # probability explicitly, because float() on a whole ndarray is
        # deprecated in recent NumPy releases.
        _, probabilities = model.predict(text)
        predictions.append(float(probabilities[0]))
    return result[1], predictions

In [33]:
# Naive Bayes baseline on the current train/test split.
accuracy_nb, f1_nb, proba_nb = NB_model(
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [10]:
# Grid-searched logistic regression on the current train/test split.
accuracy_lr, f1_lr, proba_lr = LogisticRegression_model(
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [13]:
# fastText classifier on the current train/test split.
accuracy_ft, proba_ft = fasttext_model(
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [17]:
# Metric table: one column per model, one row per metric.
# fastText's F1 score is never computed in this notebook (fasttext_model
# returns only its test score and the per-text probabilities), so the cell
# shows 'N/A' instead of a 0 that would read as a measured result.
fig = go.Figure(data=[go.Table(
    header=dict(values=[' ', 'Naive Bayes', 'Logistic Regression', 'Fasttext']),
    cells=dict(values=[['Accuracy', 'F1_score'],
                       [accuracy_nb, f1_nb],
                       [accuracy_lr, f1_lr],
                       [accuracy_ft, 'N/A']]),
)])
fig.show()

In [80]:
#append model score to the original dataset
def add_model_score(data_path, texts=None, scores=None,
                    output_path='./952_rows_41_columns_with_modelScores.csv'):
    '''
    Append a 'model_score' column to the dataset and write it to a CSV file.

    data_path: CSV file with a 'Text' column.
    texts: the scored texts; defaults to the notebook-level X_test.
    scores: one row of class probabilities per entry of texts (column 1 is
        used as the score); defaults to the notebook-level proba_lr.
    output_path: where the augmented CSV is written.

    A row whose Text equals one of the scored texts receives that text's
    score (the first occurrence wins when a text is repeated); every other
    row receives the default score 1.
    '''
    # Keep the original behaviour for callers that rely on notebook globals.
    if texts is None:
        texts = X_test
    if scores is None:
        scores = proba_lr

    # NOTE(review): get_train_test_set reads its CSV with lineterminator='\n'
    # while this read uses the default -- confirm both parse 'Text' identically.
    output_df = pd.read_csv(data_path)

    # Map each scored text to its probability. A dict keeps indices and scores
    # apart; the previous flat list mixed both, so a probability of exactly
    # 0.0 or 1.0 could be mistaken for row index 0 or 1.
    score_by_text = {}
    for text, class_probs in zip(texts, scores):
        if pd.isna(text):
            # A missing text never compares equal to any row, so skip it.
            continue
        score_by_text.setdefault(text, class_probs[1])

    output_df['model_score'] = [score_by_text.get(text, 1) for text in output_df['Text']]
    output_df.to_csv(output_path)

In [79]:
# Score the pipeline dataset and write the augmented CSV.
data_path = './Amazon_capstone_project/data/pipeline/952_rows_40_columns.csv'
add_model_score(data_path=data_path)