# DATA PREPARATION FOR TRAINING A DIALOGPT-2 MODEL

This notebook is based on code from [here](https://github.com/chuachinhon/practical_nlp/blob/master/notebooks/3.0_data_prep_cch.ipynb).

In [None]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # Mount Google Drive so the project folder is reachable from the Colab VM.
    from google.colab import drive
    drive.mount('/content/gdrive')
    # Automatically locate the folder that contains "file_name".
    file_name = 'dialo_data_prep.ipynb'
    import subprocess
    # Arguments are passed as a list (no shell), so the file name is never
    # re-parsed by a shell. `find` prints one matching path per line.
    matches = subprocess.check_output(
        ['find', '.', '-type', 'f', '-name', file_name]
    ).decode("utf-8").splitlines()
    if not matches:
        # Fail with a clear message instead of letting os.chdir('') raise an
        # error that does not say what was being searched for.
        raise FileNotFoundError(
            "Could not find " + file_name + " below " + os.getcwd()
            + "; set path_to_file manually."
        )
    # When several copies exist, use the first one found rather than gluing
    # all paths together. Only the final path component is stripped, and the
    # trailing separator is kept.
    path_to_file = os.path.join(os.path.dirname(matches[0]), "")
    # If the search fails or takes too long, comment out the lookup above and
    # write the path down manually, e.g.:
    #path_to_file = '/content/gdrive/MyDrive/DNLP PROJECT'
    print(path_to_file)
    # Change the working directory to the folder containing "file_name" so the
    # relative 'data/...' paths used later resolve.
    os.chdir(path_to_file)
    print(os.getcwd())


Mounted at /content/gdrive
./gdrive/MyDrive/DNLP PROJECT/
/content/gdrive/MyDrive/DNLP PROJECT


In [None]:
import json
import pandas as pd
import re
from pandas import json_normalize
import nltk
nltk.download('wordnet')
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# PART 1: DATA EXTRACTION AND PREPARATION



Data set used: [SMS messages by Singaporean students at a local university](https://www.kaggle.com/rtatman/the-national-university-of-singapore-sms-corpus).

Our group's data set is a single JSON file. Because its contents are nested, we first need to flatten them into a DataFrame.

In [None]:
# Parse the corpus file line by line: one JSON document per non-blank line.
# The context manager closes the file handle as soon as parsing finishes,
# and blank lines are skipped instead of crashing json.loads.
with open('data/smsCorpus_en_2015.03.09_all.json', 'r') as corpus_file:
    raw = [json.loads(line) for line in corpus_file if line.strip()]

In [None]:
# Flatten the parsed JSON into a DataFrame. Nested dict keys become
# dot-separated column names (e.g. "smsCorpus.@date"); the list of messages
# stays packed in the single "smsCorpus.message" cell and is expanded later.
df_raw = json_normalize(raw)

In [None]:
# Preview the flattened top-level frame.
df_raw.head()

Unnamed: 0,smsCorpus.@date,smsCorpus.@version,smsCorpus.message
0,2015.03.09,1.2,"[{'@id': 10120, 'text': {'$': 'Bugis oso near ..."


In [None]:
# Expand each packed list of messages into its own frame, stack the frames
# keyed by corpus date, then move that date key out of the index and into a
# regular "smsCorpus.@date" column.
raw_messages = pd.concat(
    [pd.DataFrame(batch) for batch in df_raw["smsCorpus.message"]],
    keys=df_raw["smsCorpus.@date"],
    sort=False,
).reset_index(level="smsCorpus.@date")


In [None]:
# Preview one row per message; the nested fields are still dicts at this point.
raw_messages.head()

Unnamed: 0,smsCorpus.@date,@id,text,source,destination,messageProfile,collectionMethod
0,2015.03.09,10120,{'$': 'Bugis oso near wat...'},"{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown..."
1,2015.03.09,10121,"{'$': 'Go until jurong point, crazy.. Availabl...","{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown..."
2,2015.03.09,10122,{'$': 'I dunno until when... Lets go learn pil...,"{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown..."
3,2015.03.09,10123,{'$': 'Den only weekdays got special price... ...,"{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown..."
4,2015.03.09,10124,{'$': 'Meet after lunch la...'},"{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown..."


In [None]:
# Pull the message body out of each {'$': ...} wrapper in the "text" column.
raw_messages['sms_text'] = (
    raw_messages['text'].map(lambda payload: payload.get('$')).tolist()
)

In [None]:
# Flatten each nested dict column into its own frame of dot-named columns.
# NOTE(review): the recorded output shows no extra '@id' column coming from
# these frames, so `meta` appears to have no effect here without a
# `record_path` -- confirm.
source, destination, profile, collection = (
    json_normalize(raw_messages[nested_column], meta='@id')
    for nested_column in ('source', 'destination', 'messageProfile', 'collectionMethod')
)


In [None]:
# Join the message frame and the four flattened frames side by side (axis=1).
# NOTE(review): both `profile` and `collection` contribute an '@time' column,
# so that name occurs twice in the result (it shows up as '@time' and
# '@time.1' in the recorded output below).
sms_raw = pd.concat([raw_messages, source, destination, profile, collection], axis=1)

In [None]:
# Preview the combined frame: original nested columns plus their flattened fields.
sms_raw.head()

Unnamed: 0,smsCorpus.@date,@id,text,source,destination,messageProfile,collectionMethod,sms_text,srcNumber.$,phoneModel.@manufactuer,phoneModel.@smartphone,userProfile.userID.$,userProfile.age.$,userProfile.gender.$,userProfile.nativeSpeaker.$,userProfile.country.$,userProfile.city.$,userProfile.experience.$,userProfile.frequency.$,userProfile.inputMethod.$,@country,destNumber.$,@language,@time,@type,@collector,@method,@time.1
0,2015.03.09,10120,{'$': 'Bugis oso near wat...'},"{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown...",Bugis oso near wat...,51,unknown,unknown,51,unknown,unknown,unknown,SG,unknown,unknown,unknown,unknown,unknown,unknown,en,unknown,unknown,howyijue,unknown,2003/4
1,2015.03.09,10121,"{'$': 'Go until jurong point, crazy.. Availabl...","{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown...","Go until jurong point, crazy.. Available only ...",51,unknown,unknown,51,unknown,unknown,unknown,SG,unknown,unknown,unknown,unknown,unknown,unknown,en,unknown,unknown,howyijue,unknown,2003/4
2,2015.03.09,10122,{'$': 'I dunno until when... Lets go learn pil...,"{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown...",I dunno until when... Lets go learn pilates...,51,unknown,unknown,51,unknown,unknown,unknown,SG,unknown,unknown,unknown,unknown,unknown,unknown,en,unknown,unknown,howyijue,unknown,2003/4
3,2015.03.09,10123,{'$': 'Den only weekdays got special price... ...,"{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown...",Den only weekdays got special price... Haiz......,51,unknown,unknown,51,unknown,unknown,unknown,SG,unknown,unknown,unknown,unknown,unknown,unknown,en,unknown,unknown,howyijue,unknown,2003/4
4,2015.03.09,10124,{'$': 'Meet after lunch la...'},"{'srcNumber': {'$': 51}, 'phoneModel': {'@manu...","{'@country': 'unknown', 'destNumber': {'$': 'u...","{'@language': 'en', '@time': 'unknown', '@type...","{'@collector': 'howyijue', '@method': 'unknown...",Meet after lunch la...,51,unknown,unknown,51,unknown,unknown,unknown,SG,unknown,unknown,unknown,unknown,unknown,unknown,en,unknown,unknown,howyijue,unknown,2003/4


In [None]:
# Columns carried forward: message id and text plus sender/device metadata.
# "phoneModel.@manufactuer" is spelled exactly as the column appears in the data.
cols = [
    "@id", "userProfile.userID.$", "sms_text",
    "userProfile.country.$", "userProfile.age.$", "userProfile.gender.$",
    "srcNumber.$", "phoneModel.@manufactuer", "phoneModel.@smartphone",
    "userProfile.frequency.$",
]

# Work on an independent copy so later column assignments do not touch sms_raw.
sms = sms_raw.loc[:, cols].copy()


In [None]:
# function to clean the text
def clean_text(text):
    """Normalise one SMS into space-separated word tokens.

    Steps, in order:
      1. drop every non-ASCII character (e.g. Chinese characters);
      2. remove URLs (anything starting with "http" up to the next whitespace);
      3. turn every non-word character (punctuation, newlines, tabs) into a space;
      4. remove tokens made up only of digits;
      5. collapse runs of spaces and trim both ends.

    Letter case is preserved, and underscores count as word characters.

    Args:
        text: the raw message as a ``str``.

    Returns:
        The cleaned message; an empty string when nothing is left.
    """
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = re.sub(r"http\S+", "", text)
    # After this step the only whitespace left in the string is plain spaces.
    text = re.sub(r"\W", " ", text)
    # Lookarounds test for token boundaries without consuming the neighbouring
    # space, so consecutive numbers ("1 2 3") and a digits-only message are all
    # removed. The previous pattern swallowed the shared space and skipped
    # every second number.
    text = re.sub(r"(?<!\S)\d+(?!\S)", " ", text)
    return re.sub(r" +", " ", text).strip()


In [None]:
# Force every message to str so clean_text() can call .encode() on it.
sms['sms_text'] = sms['sms_text'].astype('str')

sms["clean_text"] = sms['sms_text'].map(clean_text)

# Comment out the stemming or lemmatization that you do not want to use.
# Each stemmer/lemmatizer is built once and reused for every row.
# NOTE(review): stem()/lemmatize() receive the whole message, not individual
# tokens. In the recorded output the visible effect is lower-casing (which
# clean_text() does not do) -- confirm this is the intended behaviour.
#porter = PorterStemmer()
#sms["clean_text"] = sms["clean_text"].map(porter.stem)
lancaster = LancasterStemmer()
sms["clean_text"] = sms["clean_text"].map(lancaster.stem)
lemmatizer = WordNetLemmatizer()
sms["clean_text"] = sms["clean_text"].map(lemmatizer.lemmatize)

# Drop rows whose text is empty. clean_text() yields "" rather than NaN, so
# dropna() alone never removed anything; a length test covers both cases
# (NaN compares as False).
sms = sms[sms["clean_text"].str.len() > 0].copy()

sms.head()

Unnamed: 0,@id,userProfile.userID.$,sms_text,userProfile.country.$,userProfile.age.$,userProfile.gender.$,srcNumber.$,phoneModel.@manufactuer,phoneModel.@smartphone,userProfile.frequency.$,clean_text
0,10120,51,Bugis oso near wat...,SG,unknown,unknown,51,unknown,unknown,unknown,bugis oso near wat
1,10121,51,"Go until jurong point, crazy.. Available only ...",SG,unknown,unknown,51,unknown,unknown,unknown,go until jurong point crazy available only in ...
2,10122,51,I dunno until when... Lets go learn pilates...,SG,unknown,unknown,51,unknown,unknown,unknown,i dunno until when lets go learn pilates
3,10123,51,Den only weekdays got special price... Haiz......,SG,unknown,unknown,51,unknown,unknown,unknown,den only weekdays got special price haiz cant ...
4,10124,51,Meet after lunch la...,SG,unknown,unknown,51,unknown,unknown,unknown,meet after lunch la


In [None]:
# Add a word-count column for filtering.
# Counting whitespace-separated tokens gives 0 for an empty message, whereas
# "number of spaces + 1" reported 1; for ordinary single-spaced text both
# approaches give the same number.
sms['word_count'] = sms['clean_text'].str.split().str.len()

In [None]:
# Keep only the identifiers, the country, and the raw/clean text with its word count.
cols = [
    "@id",
    "userProfile.userID.$",
    "userProfile.country.$",
    "sms_text",
    "clean_text",
    "word_count",
]

sms = sms.loc[:, cols].copy()


In [None]:
# Give the columns short names; the last three entries map a name to itself
# and only document the final schema.
sms = sms.rename(
    {
        "@id": "data_id",
        "userProfile.userID.$": "user_id",
        "userProfile.country.$": "country",
        "sms_text": "sms_text",
        "clean_text": "clean_text",
        "word_count": "word_count",
    },
    axis="columns",
)


In [None]:
# (rows, columns) before any noise filtering.
sms.shape

(55835, 6)

In [None]:
# Preview the renamed, narrowed frame.
sms.head()

Unnamed: 0,data_id,user_id,country,sms_text,clean_text,word_count
0,10120,51,SG,Bugis oso near wat...,bugis oso near wat,4
1,10121,51,SG,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,20
2,10122,51,SG,I dunno until when... Lets go learn pilates...,i dunno until when lets go learn pilates,8
3,10123,51,SG,Den only weekdays got special price... Haiz......,den only weekdays got special price haiz cant ...,25
4,10124,51,SG,Meet after lunch la...,meet after lunch la,4


## REDUCING NOISE
Here we filter out SMSes of N words or fewer to reduce noise, since SMSes that are too short provide too little information. We also keep only the SMSes sent by users in Singapore, since we are building a Singlish chatbot.

In [None]:
# Calculate the average word count per message.
sms_wordCount = sms['word_count']

# Sum once, vectorised, instead of re-dividing on every loop iteration.
totalCount = sms_wordCount.sum()

# Guard the empty case: previously the average was only assigned inside the
# loop, so an empty frame raised NameError (or printed a stale value left in
# the kernel by an earlier run).
if len(sms_wordCount):
    averageWordCount = totalCount / len(sms_wordCount)
else:
    averageWordCount = float('nan')

print(averageWordCount)

10.458189307781858


In [None]:
# Keep messages longer than 3 words ...
reduce1 = sms['word_count'].gt(3)
# ... whose sender country is recorded as either 'SG' or 'Singapore'.
reduce2 = sms['country'].eq('SG')
reduce3 = sms['country'].eq('Singapore')

# reset_index() renumbers the rows from 0 and keeps the old labels in a new
# "index" column (hence one extra column in the shape below).
sms = sms.loc[reduce1 & (reduce2 | reduce3)].copy().reset_index()

In [None]:
# (rows, columns) after the word-count and country filters.
sms.shape

(29360, 7)

## 2. CREATING TRAINING AND VALIDATION SETS

Each SMS is treated as a response, and the SMSes immediately before it serve as its conversational context. We can choose the value of n, where n is the number of previous SMSes that serve as context.

After setting the number of SMSes to act as context, the data is saved as CSV files for training in the other notebook.

In [None]:
# Number of previous messages used as context for each response.
n = 7

# Sliding window over the messages: every row starts with the current
# response (position i) followed by its n predecessors, newest first, so each
# row has n + 1 entries. The first n messages have too little history and are
# skipped as responses.
contexted = [
    [sms['clean_text'][j] for j in range(i, i - n - 1, -1)]
    for i in range(n, len(sms['clean_text']))
]

In [None]:
# Column names: the response, its most recent context, then the n - 1 older
# context messages ("context/0" is the newest of those).
columns = ['response', 'context', *('context/' + str(k) for k in range(n - 1))]

df = pd.DataFrame.from_records(contexted, columns=columns)

In [None]:
# (rows, columns): one row per response, n + 1 text columns.
df.shape

(29353, 8)

In [None]:
# Preview the windows; consecutive rows are the same messages shifted by one column.
df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
0,hey pple or for nights excellent location wif ...,nights we nt staying at port step liao too ex,m walking in citylink now faster come down me ...,meet after lunch la,den only weekdays got special price haiz cant ...,i dunno until when lets go learn pilates,go until jurong point crazy available only in ...,bugis oso near wat
1,yun ah the ubi one say if wan call by tomorrow...,hey pple or for nights excellent location wif ...,nights we nt staying at port step liao too ex,m walking in citylink now faster come down me ...,meet after lunch la,den only weekdays got special price haiz cant ...,i dunno until when lets go learn pilates,go until jurong point crazy available only in ...
2,hey tmr maybe can meet you at yck,yun ah the ubi one say if wan call by tomorrow...,hey pple or for nights excellent location wif ...,nights we nt staying at port step liao too ex,m walking in citylink now faster come down me ...,meet after lunch la,den only weekdays got special price haiz cant ...,i dunno until when lets go learn pilates
3,oh i asked for fun haha take care,hey tmr maybe can meet you at yck,yun ah the ubi one say if wan call by tomorrow...,hey pple or for nights excellent location wif ...,nights we nt staying at port step liao too ex,m walking in citylink now faster come down me ...,meet after lunch la,den only weekdays got special price haiz cant ...
4,we are supposed to meet to discuss abt our tri...,oh i asked for fun haha take care,hey tmr maybe can meet you at yck,yun ah the ubi one say if wan call by tomorrow...,hey pple or for nights excellent location wif ...,nights we nt staying at port step liao too ex,m walking in citylink now faster come down me ...,meet after lunch la


In [None]:
# Split the data into training and validation sets; test_size sets the share
# of rows that goes to the validation set, and random_state fixes the split.
# NOTE(review): neighbouring rows of `df` share most of their messages (see
# the preview above). If the split shuffles rows, as train_test_split does by
# default, near-duplicate windows can land on both sides -- confirm this is
# acceptable for validation.

train_df, validate_df = train_test_split(df, random_state=42, test_size=0.2)

In [None]:
# Row/column counts of the two splits.
train_df.shape, validate_df.shape

((23482, 8), (5871, 8))

In [None]:
# Write both splits to CSV (without the index) for the training notebook.
# Comment these two lines out to skip re-creating the files.

train_df.to_csv('data/train_df.csv', index=False)
validate_df.to_csv('data/validate_df.csv', index=False)