## Paths and constants

In [None]:
# Input CSVs. The paths are relative; load_data() prefixes them with '../'
# when the working directory has no 'data' entry.
P_a_comments = 'data/raw/stackoverflow_amazon_a_comment.csv'
P_q_comments = 'data/raw/stackoverflow_amazon_q_comment.csv'
P_answers = 'data/raw/stackoverflow_amazon_a.csv'

# Legacy locations of the outdated-comments files, kept for reference only.
# NOTE(review): both lines point at the same *_a_comment_outdated.csv file;
# presumably the second one was meant to be the q_comment file -- confirm.
# P_outdated_a_comments = r'data/shared/stackoverflow_amazon_a_comment_outdated.csv'
# P_outdated_q_comments = r'data/shared/stackoverflow_amazon_a_comment_outdated.csv'

# Output CSV. Unlike the inputs, this path carries its own '../' prefix.
P_potential_outdated_comments = '../data/pipeline/outdated_comments_by_keywords.csv'

# Columns kept (in this order) when loading a comments file.
A_comment_columns = [
    'Id',
    'PostId',
    'Score',
    'Text',
    'CreationDate',
    'UserId',
    'ContentLicense',
]

# Columns kept (in this order) when loading an answers file.
A_answer_columns = [
    'Id',
    'PostTypeId',
    'ParentId',
    'CreationDate',
    'Score',
    'Body',
    'OwnerUserId',
    'OwnerDisplayName',
    'LastActivityDate',
    'CommentCount',
    'ContentLicense',
]

## Load data

In [None]:
import os
import pandas as pd

def load_data(path: str) -> pd.DataFrame:
    """Load a comments or answers CSV into a DataFrame.

    The schema is picked from the path itself: when ``path`` contains the
    substring ``'comment'`` the columns in ``A_comment_columns`` are used,
    otherwise those in ``A_answer_columns``. Every schema column is read as
    ``'string'`` except ``ContentLicense`` (``'category'``) and ``Score``
    (``'int'``).

    Malformed CSV lines are skipped with a warning rather than aborting the
    load.

    Args:
        path: Location of the CSV file. A relative path is resolved one
            directory up when the current working directory does not
            contain a ``data`` directory; absolute paths are used as given.

    Returns:
        The loaded rows, restricted to the schema columns in schema order.

    Raises:
        KeyError: If the file lacks one of the schema columns.
    """
    columns = A_comment_columns if 'comment' in path else A_answer_columns

    # Build only the dtype map that is actually needed for this file.
    d_types = {col: 'string' for col in columns}
    d_types['ContentLicense'] = 'category'
    d_types['Score'] = 'int'

    # Fall back to the parent directory when not launched from the directory
    # holding 'data'. Absolute paths must not be prefixed.
    if not os.path.isabs(path) and not os.path.isdir('data'):
        path = os.path.join(os.pardir, path)

    # 'error_bad_lines=False' was removed in pandas 2.0; on_bad_lines='warn'
    # (pandas >= 1.3) keeps the same skip-and-warn behaviour.
    df = pd.read_csv(path, on_bad_lines='warn', dtype=d_types)

    return df[columns]

## Extract potentially outdated comments by keywords

In [None]:
import pandas as pd

def filter_by_keywords(df: pd.DataFrame, content_colname: str) -> pd.DataFrame:
    """Return the rows whose text column mentions an "outdated" keyword.

    A row is kept when the value in ``content_colname`` is a string that
    contains at least one keyword as a case-sensitive substring. Each keyword
    starts with a space, so it does not match at the very beginning of the
    text or as the tail of a longer word.

    Values that are not strings (``None``, ``NaN``, ``pd.NA``, numbers) never
    match; the row is dropped instead of raising ``TypeError``.

    Args:
        df: Table to filter; it is not modified.
        content_colname: Name of the column holding the text to search,
            e.g. ``'Text'`` or ``'Body'``.

    Returns:
        A new DataFrame with the matching rows, keeping the columns, order
        and index labels of ``df``. Empty when nothing matches or ``df`` has
        no rows.
    """
    A_outdated_keywords = (' outdated', ' deprecated', ' obsolete',
                           ' discouraged', ' out of date')

    def has_keyword(text) -> bool:
        # Guard against missing / non-text cells before substring tests.
        return isinstance(text, str) and any(
            kw in text for kw in A_outdated_keywords)

    # Scan only the text column (no per-row Series construction). The
    # explicit index and dtype keep the mask a valid boolean indexer even
    # for an empty frame.
    mask = pd.Series(
        [has_keyword(text) for text in df[content_colname]],
        index=df.index,
        dtype=bool,
    )
    return df[mask]


In [None]:
def get_outdated_comments() -> pd.DataFrame:
    """Collect the comments that mention an "outdated"-style keyword.

    Loads the answer-comment file (``P_a_comments``) and the question-comment
    file (``P_q_comments``), keeps the rows whose ``Text`` column passes
    ``filter_by_keywords``, and tags every kept row with a boolean
    ``of_answer`` column: ``True`` for comments on answers, ``False`` for
    comments on questions.

    Returns:
        One DataFrame with the question comments first, then the answer
        comments. The index is reset, so the row labels each comment had
        before concatenation are preserved in an ``index`` column.
    """
    comments_a = load_data(P_a_comments)
    comments_q = load_data(P_q_comments)

    # ``assign`` returns a new frame instead of writing a column onto the
    # filtered selection, which avoids pandas' SettingWithCopyWarning.
    outdated_comments_a = filter_by_keywords(
        comments_a, 'Text').assign(of_answer=True)
    outdated_comments_q = filter_by_keywords(
        comments_q, 'Text').assign(of_answer=False)

    return pd.concat([outdated_comments_q, outdated_comments_a]).reset_index()

In [None]:
# Script entry point: build the table of potentially outdated comments and
# write it, without the DataFrame index, to P_potential_outdated_comments.
# NOTE(review): the output path is hard-coded with a '../' prefix, whereas
# load_data() adapts the input paths to the working directory -- confirm the
# script is always launched from a sub-directory of the project.
if __name__ == '__main__':
    df_comments = get_outdated_comments()
    df_comments.to_csv(P_potential_outdated_comments, index=False)
    