## Imports

In [1]:
from pymongo import MongoClient
client = MongoClient("mongodb://smt483:SMT483tls@10.0.104.84:27017/smt483")

import string
import regex as re
import pandas as pd

from transformers import pipeline

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder



## Preprocessing

In [2]:
def preprocessing(text):
    text = text.encode('ascii', errors="ignore").decode()
    text = "".join([ch for ch in text if ch in string.printable])
    text = text.replace("\n", "").replace("\nl", "").replace("[", "").replace("]", "").replace("\\","").replace("--", "").replace("|:-", "").replace("|", " ").replace("#", "").replace("&x200B;", "").replace("Read the full story here:", "").replace("More short stories here:", "").replace("Full story here:", "").replace("Full story and details here:", "").replace("More details here:", "").replace("More short stories here:", "")

    remove_reader_contribution_tags = re.sub('<Reader Contribution\W?[\w*\s*]*\>', '', text)
    remove_credits_tags = re.sub('<Credits:\W?[\w*\s*]*\>', '', remove_reader_contribution_tags)
    markdown_removed = re.sub('\*+\W+', '', remove_credits_tags)
    link_removed = re.sub('\(?https?://[A-Za-z0-9./_\-!@#$%^&*+={}[\]<>:;?]*\)?', '', markdown_removed)

    return link_removed

## Zero-Shot Intent Classifier

In [4]:
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [5]:
def classify_intent(text):
    labels = ["suggestion", "complaint", "educational", "question", "remark/news/statement", "seeking/giving advice"]

    results_dict = classifier(text, labels)

    labels = results_dict["labels"]
    intent = labels[0]

    return intent

### Testing

In [6]:
text = "I was eating at the hawker center last Sunday when a random person decided to just sit opposite me and started eating without even asking if I'm okay with that. Since we are eating, we both were unmasked. We were in close contact for more than 10 mins. There were no other seats so I couldn't move away. There were no other instances where I could have gotten it as that was the only time I was unmasked while being in close proximity with a stranger.The very rule that was supposed to stop the spread was what spread it to me. If the smm didn't exist, there would have been more tables available and that stranger would have found a table elsewhere.TLDR: Random stranger sat opposite me while eating at hawker center cos of lack of tables due to SMM. Tested positive and after looking back, that incident was most probably where I got the infection."

intent = classify_intent(text)
intent

'remark/news/statement'

In [7]:
fb_posts = client.smt483.fb_posts
fb_posts_df = pd.DataFrame(list(fb_posts.find()))
fb_posts_df = fb_posts_df[["message"]]

fb_posts_df.head(1)

Unnamed: 0,message
0,Who thinks that McDonalds have been giving sma...


In [18]:
fb_posts_df_sample = fb_posts_df[:500]

fb_posts_df_sample["cleantext"] = fb_posts_df_sample["message"].apply(preprocessing)
fb_posts_df_sample["intent_label"] = fb_posts_df_sample["cleantext"].apply(classify_intent)

fb_posts_df_sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fb_posts_df_sample["cleantext"] = fb_posts_df_sample["message"].apply(preprocessing)


## Evaluation

In [9]:
data_labelled = pd.read_excel("../data/data labelling.xlsx", sheet_name='reddit', index_col=1)
data_labelled.reset_index(inplace=True)
data_labelled = data_labelled[["title", "text", "combined_text", "thoughtful?\n1 - yes, 0 - no", "intent\n1 - suggestion, 2 - complaint, \n3 - educational, 4 - question\n5- remark / news, 6 - seeking/giving advice"]]

data_labelled.rename(columns={"thoughtful?\n1 - yes, 0 - no": "isThoughtful?", "intent\n1 - suggestion, 2 - complaint, \n3 - educational, 4 - question\n5- remark / news, 6 - seeking/giving advice": "true_intent_label"}, inplace=True)

data_labelled.head(1)

Unnamed: 0,title,text,combined_text,isThoughtful?,true_intent_label
0,F15 and the moon. Taken outside Fernvale Prima...,,F15 and the moon. Taken outside Fernvale Prima...,0.0,5.0


In [10]:
data_labelled["cleantext"] = data_labelled["combined_text"].apply(preprocessing)
data_labelled["pred_intent_label"] = data_labelled["cleantext"].apply(classify_intent)

data_labelled

Unnamed: 0,title,text,combined_text,isThoughtful?,true_intent_label,cleantext,pred_intent_label
0,F15 and the moon. Taken outside Fernvale Prima...,,F15 and the moon. Taken outside Fernvale Prima...,0.0,5.0,F15 and the moon. Taken outside Fernvale Prima...,remark/news/statement
1,Anyone hear a deep rumbling in AMK?,Been going on for about five minutes now. Pret...,Anyone hear a deep rumbling in AMK? Been going...,0.0,4.0,Anyone hear a deep rumbling in AMK? Been going...,question
2,Experiences of Wisdom Tooth Surgery,Hey guys! I’m about to go for my wisdom tooth ...,Experiences of Wisdom Tooth Surgery Hey guys! ...,1.0,3.0,Experiences of Wisdom Tooth Surgery Hey guys! ...,seeking/giving advice
3,We're trying to create a safe space for youths...,"Hey r/singapore! I'm from Shy, a group of 3 fr...",We're trying to create a safe space for youths...,1.0,3.0,We're trying to create a safe space for youths...,remark/news/statement
4,World of warcraft players,Hi there.. returning wow player here. looking ...,World of warcraft players Hi there.. returning...,0.0,5.0,World of warcraft players Hi there.. returning...,question
...,...,...,...,...,...,...,...
495,Carousell help for a us buyer,[removed],Carousell help for a us buyer [removed],0.0,5.0,Carousell help for a us buyer removed,seeking/giving advice
496,4 Simple ways to improve your #English Languag...,,4 Simple ways to improve your #English Languag...,0.0,5.0,4 Simple ways to improve your English Language...,educational
497,What are the best price plans from mobile phones?,title,What are the best price plans from mobile phon...,0.0,5.0,What are the best price plans from mobile phon...,seeking/giving advice
498,"Content of Eucerin Advanced Repair Cream, Body...",,"Content of Eucerin Advanced Repair Cream, Body...",0.0,5.0,"Content of Eucerin Advanced Repair Cream, Body...",remark/news/statement


In [25]:
# Encoding Intent Labels

# def encode_labels(df, true_label_col, pred_label_col):
#     labelencoder = LabelEncoder()
#     labelencoder.fit(df[true_label_col])

#     labelencoder_mapping = dict(zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_)))
#     labels = list(labelencoder_mapping.keys())

#     df["true_encoded_topic"] = labelencoder.transform(df[true_label_col])
#     df["pred_encoded_topic"] = labelencoder.transform(df[pred_label_col])

#     return df

mapping = {'suggestion': 1, 'complaint': 2, 'educational': 3, 'question': 4, 'remark/news/statement': 5, 'seeking/giving advice': 6}

data_labelled.replace({'pred_intent_label': mapping}, inplace=True)
data_labelled


Unnamed: 0,title,text,combined_text,isThoughtful?,true_intent_label,cleantext,pred_intent_label
0,F15 and the moon. Taken outside Fernvale Prima...,,F15 and the moon. Taken outside Fernvale Prima...,0.0,5.0,F15 and the moon. Taken outside Fernvale Prima...,5
1,Anyone hear a deep rumbling in AMK?,Been going on for about five minutes now. Pret...,Anyone hear a deep rumbling in AMK? Been going...,0.0,4.0,Anyone hear a deep rumbling in AMK? Been going...,4
2,Experiences of Wisdom Tooth Surgery,Hey guys! I’m about to go for my wisdom tooth ...,Experiences of Wisdom Tooth Surgery Hey guys! ...,1.0,3.0,Experiences of Wisdom Tooth Surgery Hey guys! ...,6
3,We're trying to create a safe space for youths...,"Hey r/singapore! I'm from Shy, a group of 3 fr...",We're trying to create a safe space for youths...,1.0,3.0,We're trying to create a safe space for youths...,5
4,World of warcraft players,Hi there.. returning wow player here. looking ...,World of warcraft players Hi there.. returning...,0.0,5.0,World of warcraft players Hi there.. returning...,4
...,...,...,...,...,...,...,...
495,Carousell help for a us buyer,[removed],Carousell help for a us buyer [removed],0.0,5.0,Carousell help for a us buyer removed,6
496,4 Simple ways to improve your #English Languag...,,4 Simple ways to improve your #English Languag...,0.0,5.0,4 Simple ways to improve your English Language...,3
497,What are the best price plans from mobile phones?,title,What are the best price plans from mobile phon...,0.0,5.0,What are the best price plans from mobile phon...,6
498,"Content of Eucerin Advanced Repair Cream, Body...",,"Content of Eucerin Advanced Repair Cream, Body...",0.0,5.0,"Content of Eucerin Advanced Repair Cream, Body...",5


In [27]:
def get_evaluation(df, true_label_col, pred_label_col, label_cat):
    y_pred = df[pred_label_col].values.flatten()
    y_true = df[true_label_col].values.flatten()

    return classification_report(y_true, y_pred, target_names=label_cat)

In [28]:
labels = ["suggestion", "complaint", "educational", "question", "remark/news/statement", "seeking/giving advice"]

evaluation_report = get_evaluation(data_labelled, "true_intent_label", "pred_intent_label", labels)
print(evaluation_report)

                       precision    recall  f1-score   support

           suggestion       0.00      0.00      0.00         2
            complaint       0.10      0.33      0.16         9
          educational       0.00      0.00      0.00         4
             question       0.46      0.88      0.60        77
remark/news/statement       0.97      0.56      0.71       374
seeking/giving advice       0.38      0.68      0.49        34

             accuracy                           0.60       500
            macro avg       0.32      0.41      0.33       500
         weighted avg       0.83      0.60      0.66       500

