In [1]:
import json
import random
import re
import codecs
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import make_pipeline

In [2]:
# Load the raw request records (a JSON document) into a DataFrame.
# The built-in `open` with an explicit encoding replaces the legacy
# `codecs.open` helper; the decoded text and parsed result are unchanged.
with open("./pizza_request_dataset.json", "r", encoding="utf-8") as myFile:
    content = myFile.read()
pizzaDataset = json.loads(content)
df = pd.DataFrame(pizzaDataset)
print(df.shape)

(5671, 33)


In [3]:
# Integer copy of the outcome column: 1 for truthy values, 0 otherwise.
df['requester_received_pizza_int'] = df['requester_received_pizza'].map(
    lambda received: 1 if received else 0
)
print(df.shape)

(5671, 34)


In [4]:
# Settings shared by every model cell below.
SEED = 42   # random_state passed to the train/test splits and the SVMs
SIZE = 0.1  # test_size passed to train_test_split

# Outcome column: integer target for fitting, raw values for `stratify=`.
received_pizza = df['requester_received_pizza']
STRATIFY = received_pizza.to_numpy()
y = received_pizza.astype(int)

In [5]:
# English stop-word set and lemmatizer shared by `data_preprocessing`.
# NOTE: both rely on the NLTK `stopwords` / `wordnet` corpora being available.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


def data_preprocessing(text_sentence):
    """Normalize one request text into a space-joined string of lemmas.

    Steps: lowercase, drop URLs, blank out every character that is not a
    lowercase ASCII letter, whitespace, apostrophe or hyphen, tokenize into
    words (internal apostrophes/hyphens allowed), remove English stop words,
    and lemmatize what is left.

    Parameters
    ----------
    text_sentence : Any
        The raw text. Non-string values (e.g. NaN/None) yield "".

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text_sentence, str):
        return ""
    text_sentence = text_sentence.lower()
    text_sentence = re.sub(r"https?://\S+|www\.\S+", "", text_sentence)
    # Substitute a space (not the empty string) so that words separated only
    # by punctuation or digits, e.g. "hungry.please" or "job/school", stay
    # separate tokens instead of being fused into one.
    text_sentence = re.sub(r"[^a-z\s'-]", " ", text_sentence)
    tokens = re.findall(r"\b[a-z]+(?:['-][a-z]+)*\b", text_sentence)
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]
    return " ".join(tokens)


df["cleaned_data"] = df["request_text_edit_aware"].apply(data_preprocessing)

In [6]:
def print_results(model_number, y_test,pred,prob, tn, fp):
    """Print a one-line metric summary for a fitted binary classifier.

    Parameters
    ----------
    model_number : str
        Label printed at the start of the line (e.g. 'Model_1').
    y_test : array-like
        True binary labels of the evaluation set.
    pred : array-like
        Predicted labels for the same rows.
    prob : array-like
        Predicted scores/probabilities of the positive class (used for AUC).
    tn, fp : int
        True-negative and false-positive counts used for specificity.

    The line contains accuracy, precision, recall, F1, specificity and AUC,
    in that order. Nothing is returned.
    """
    # Specificity = TN / (TN + FP); report NaN when there are no negatives
    # instead of dividing by zero.
    negatives = tn + fp
    specificity = tn / negatives if negatives else float('nan')
    # Use the `model_number` argument; the previous body read an unrelated
    # module-level `modelnumber`, so the label depended on hidden kernel state.
    print( model_number, 'accuracy', accuracy_score(y_test, pred),
      'precision', precision_score(y_test, pred),
      'recall', recall_score(y_test, pred),
      'F1', f1_score(y_test, pred),
      'specificity', specificity,
      'AUC', roc_auc_score(y_test, prob))

In [7]:
# Model_1: linear SVM on bag-of-words counts (unigrams + bigrams).
# NOTE(review): this reads the raw `request_text` column, not
# `request_text_edit_aware` or the `cleaned_data` column built earlier -
# confirm that is intended.
X = df['request_text'].fillna('')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=SIZE,
    random_state=SEED,
    stratify=STRATIFY,
)

# The two vectorizers differ only in n-gram order; each keeps at most 500 terms.
shared_vectorizer_args = dict(max_features=500, lowercase=True, stop_words='english')
unigram = CountVectorizer(ngram_range=(1, 1), **shared_vectorizer_args)
bigram = CountVectorizer(ngram_range=(2, 2), **shared_vectorizer_args)
vect = FeatureUnion([('unigram', unigram), ('bigram', bigram)])

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

model_1 = SVC(kernel='linear', probability=True, random_state=SEED)
model_1.fit(X_train, y_train)

pred = model_1.predict(X_test)
prob = model_1.predict_proba(X_test)[:, 1]  # column 1 = positive class
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

In [8]:
# Print the held-out metrics for Model_1 from the predictions computed above.
modelnumber='Model_1'
print_results(modelnumber, y_test,pred,prob, tn, fp)

Model_1 accuracy 0.7306338028169014 precision 0.410958904109589 recall 0.21428571428571427 F1 0.28169014084507044 specificity 0.8995327102803738 AUC 0.5884846461949266


In [9]:
#Model_2
# Column names used as numeric features for Model_2. The list order fixes the
# column order of the feature matrix built from `features`.
# NOTE(review): several columns are `*_at_retrieval` values; presumably these
# are measured after the request outcome is known - confirm they do not leak
# the label.

# Requester activity columns (account age, post/comment counts, edit flag).
activity = [
    'post_was_edited',
    'requester_account_age_in_days_at_request',
    'requester_account_age_in_days_at_retrieval',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_days_since_first_post_on_raop_at_retrieval',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_at_retrieval',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_comments_in_raop_at_retrieval',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_at_retrieval',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_posts_on_raop_at_retrieval',
    'requester_number_of_subreddits_at_request',
]
# Vote-based columns for the request and the requester.
reputation = [
    'number_of_downvotes_of_request_at_retrieval',
    'number_of_upvotes_of_request_at_retrieval',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_minus_downvotes_at_retrieval',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_retrieval',
]
# Full feature list: activity columns first, then reputation columns.
features= activity+reputation
# Numeric feature matrix: missing values become 0, everything is cast to
# float, and `post_was_edited` is additionally truncated to an integer before
# the frame is converted to a NumPy array.
X = (
    df[features]
    .fillna(0)
    .astype(float)
    .assign(post_was_edited=lambda frame: frame['post_was_edited'].astype(int))
    .to_numpy()
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=SIZE,
    random_state=SEED,
    stratify=STRATIFY,
)

# Standardize the features, then fit a linear SVM with probability estimates.
model_2 = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', probability=True, random_state=SEED),
)
model_2.fit(X_train, y_train)

pred = model_2.predict(X_test)
prob = model_2.predict_proba(X_test)[:, 1]  # column 1 = positive class
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

In [10]:
# Print the held-out metrics for Model_2 from the predictions computed above.
modelnumber='Model_2'
print_results(modelnumber, y_test,pred,prob, tn, fp)

Model_2 accuracy 0.8345070422535211 precision 0.7804878048780488 recall 0.45714285714285713 F1 0.5765765765765766 specificity 0.9579439252336449 AUC 0.7839953271028038


In [11]:
#Model_3
# Narrative categories; each has a lexicon file `narratives/<name>.txt`
# holding one term per line.
narrative = ["desire", "family", "job", "money", "student"]

# Map each category to its set of lowercased, whitespace-stripped terms.
narratives = {}
for nar in narrative:
    # Read with an explicit encoding so the result does not depend on the
    # platform default (matches how the dictionary file is opened elsewhere).
    lexicon_lines = Path("narratives", f"{nar}.txt").read_text(encoding="utf-8").splitlines()
    # `terms` is deliberately left as a module-level name (it ends up holding
    # the last lexicon) because other cells may read it.
    terms = {entry.strip().lower() for entry in lexicon_lines if entry.strip()}
    narratives[nar] = terms

def narrative_toVector(text: str) -> np.ndarray:
    """Return, per narrative category, the share of words found in its lexicon.

    The text is lowercased and split on whitespace only, so punctuation that
    is attached to a word stays part of the token (e.g. "job." is looked up
    as "job."). Each entry is `matches / number_of_tokens`, in the order of
    the module-level `narrative` list; an empty text yields all zeros.
    """
    tokens = text.lower().split()
    denominator = max(len(tokens), 1)  # avoid dividing by zero on empty text
    shares = [
        sum(token in narratives[name] for token in tokens) / denominator
        for name in narrative
    ]
    return np.array(shares)

# One row of narrative ratios per request (missing texts count as empty).
X = np.array(list(map(narrative_toVector, df['request_text'].fillna(''))))
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=SIZE,
    random_state=SEED,
    stratify=STRATIFY,
)

# Standardize the ratios, then fit a linear SVM with probability estimates.
model_3 = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', probability=True, random_state=SEED),
)
model_3.fit(X_train, y_train)

pred = model_3.predict(X_test)
prob = model_3.predict_proba(X_test)[:, 1]  # column 1 = positive class
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

In [12]:
# Print the held-out metrics for Model_3 from the predictions computed above.
modelnumber='Model_3'
print_results(modelnumber, y_test,pred,prob, tn, fp)

Model_3 accuracy 0.7535211267605634 precision 0.0 recall 0.0 F1 0.0 specificity 1.0 AUC 0.5231642189586114


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
#Model_4
def map_dic(id_str):
    """Translate a dictionary category id into a dimension name.

    `id_str` is parsed with `int()`. Ids 1-2 map to 'care', 5-6 to 'loyalty',
    7-8 to 'authority' and 9-10 to 'sanctity'. Any other id, or a string that
    is not an integer, yields None.
    """
    try:
        category = int(id_str)
    except ValueError:
        return None
    # (lowest id, highest id, dimension) - inclusive ranges.
    for low, high, dimension in (
        (1, 2, 'care'),
        (5, 6, 'loyalty'),
        (7, 8, 'authority'),
        (9, 10, 'sanctity'),
    ):
        if low <= category <= high:
            return dimension
    return None
        
# Dimensions scored by Model_4, in feature-column order.
dims = ['care', 'loyalty', 'authority', 'sanctity']

# dimension -> list of regex fragments, one per dictionary entry.
dim_dic = {dim: [] for dim in dims}
with Path('MoralFoundations.dic').open(encoding='utf-8') as dic:
    for raw_line in dic:
        entry = raw_line.strip()
        # Skip blank lines and lines starting with '#' or '%'.
        if not entry or entry[0] in {'#', '%'}:
            continue
        fields = re.split(r'\s+', entry)
        # A usable entry needs a word plus at least one category id.
        if len(fields) < 2:
            continue
        # A trailing '*' in the dictionary word becomes a `\w*` wildcard.
        word_regex = fields[0].replace('*', r'\w*')
        for dic_id in fields[1:]:
            dim = map_dic(dic_id)
            if dim:
                dim_dic[dim].append(word_regex)
        
# Compile one case-insensitive, word-bounded alternation per dimension.
patterns = {}
for key, val in dim_dic.items():
    # Test this dimension's own term list. The previous check used `terms`,
    # an unrelated variable left over from another cell, so an empty list
    # could have compiled to `\b(?:)\b`, which matches at every word boundary.
    if val:
        regex = r'\b(?:' + '|'.join(val) + r')\b'
        patterns[key] = re.compile(regex, flags=re.IGNORECASE)

def mf_vector(text: str) -> np.ndarray:
    """Return, per dimension in `dims`, regex matches divided by word count.

    Parameters
    ----------
    text : str
        The request text to score.

    Returns
    -------
    np.ndarray
        One float per entry of the module-level `dims` list, in that order:
        the number of matches of `patterns[dim]` in `text` divided by the
        number of whitespace-separated words. Empty or whitespace-only text
        yields a zero vector of the same length.

    Raises
    ------
    KeyError
        If a dimension in `dims` has no compiled entry in `patterns`.
    """
    words = text.split()
    if not words:
        # Size the zero vector from `dims` rather than a hard-coded 4 so the
        # output width always agrees with the non-empty branch.
        return np.zeros(len(dims))
    total = len(words)
    return np.array([len(patterns[key].findall(text)) / total for key in dims])

# One row of per-dimension ratios per request (missing texts count as empty).
X = np.vstack(df['request_text'].fillna('').map(mf_vector).to_list())
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=SIZE, random_state=SEED, stratify=STRATIFY
)

# Standardize, then fit a class-weighted linear SVM (C=10). The seed comes
# from the shared SEED constant, like the other model cells, instead of a
# repeated literal.
model_4 = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', C=10, class_weight='balanced', probability=True, random_state=SEED),
).fit(X_train, y_train)

pred = model_4.predict(X_test)
prob = model_4.predict_proba(X_test)[:, 1]  # column 1 = positive class
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

In [14]:
# Print the held-out metrics for Model_4 from the predictions computed above.
modelnumber='Model_4'
print_results(modelnumber, y_test,pred,prob, tn, fp)

Model_4 accuracy 0.7306338028169014 precision 0.11764705882352941 recall 0.014285714285714285 F1 0.025477707006369428 specificity 0.9649532710280374 AUC 0.4714869826435246


In [15]:
# NOTE(review): this cell repeats the preceding Model_4 report verbatim; it
# looks like a leftover and can probably be removed.
modelnumber='Model_4'
print_results(modelnumber, y_test,pred,prob, tn, fp)

Model_4 accuracy 0.7306338028169014 precision 0.11764705882352941 recall 0.014285714285714285 F1 0.025477707006369428 specificity 0.9649532710280374 AUC 0.4714869826435246
