# IMPORT LIBRARIES AND LOAD DATASET

In [None]:
import xmltodict
from html.parser import HTMLParser
import re
import pandas as pd
from io import StringIO
from csv import writer 

pd.set_option('max_colwidth', 1000)
class Data(object): pass

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import SGDClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans, AgglomerativeClustering

# This notebook expects the travel.stackexchange.com Posts.xml file to exist under ./travel_questions/Posts.xml
# It can be obtained at: https://archive.org/details/stackexchange

# Read the dump inside a context manager so the file handle is always closed.
# The encoding is given explicitly so the result does not depend on the
# platform default (assumes the Stack Exchange dump is UTF-8 — TODO confirm).
with open('./travel_questions/Posts.xml', 'r', encoding='utf-8') as posts_file:
    questions_xml_posts = posts_file.read()
# Keep only the content of the top-level <posts> element.
questions_data = xmltodict.parse(questions_xml_posts)['posts']

# FILTER THE DATASET

In [69]:
#https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content."""

    def __init__(self):
        # Let the base class initialise its own state (it calls reset()
        # itself); convert_charrefs=True makes character references such as
        # "&amp;" arrive already decoded in handle_data().
        super().__init__(convert_charrefs=True)
        self.fed = []  # text fragments, in document order

    def handle_data(self, d):
        """Record one run of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return every collected text fragment joined into one string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text content of *html* with all tags removed.

    Character references are decoded (``&amp;`` becomes ``&``).
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs enabled the parser holds back trailing text that
    # contains an unterminated "&" (e.g. "AT&T") in case more input follows.
    # close() tells it the input is complete so that text is not lost.
    s.close()
    return s.get_data()

def clean_text_body(text_body):
    """Normalise an HTML post body or title into plain lowercase text.

    Steps, in order: strip HTML tags, lowercase, remove bracketed spans that
    contain a dot, delete apostrophes, turn a fixed set of punctuation
    characters and newlines into spaces, then collapse runs of whitespace and
    trim both ends.

    Returns the cleaned string.
    """
    clean_content = strip_tags(text_body).lower()

    # Removes "[...]" spans containing a dot (e.g. "[example.com]").
    # NOTE(review): the pattern is greedy, so on a single line it removes
    # everything from the first "[" to the last "]".
    clean_content = re.sub(r'\[.*\..*]', '', clean_content)

    # One translate() pass replaces the former chain of str.replace() calls:
    # apostrophes are deleted, the listed punctuation and "\n" become spaces.
    # The backslash is written as "\\" because the original "\，" was an
    # invalid escape sequence that merely happened to keep the backslash.
    translation = {ord("'"): None}
    translation.update({ord(sym): ' ' for sym in '.?!),;"\\，=-:&\n'})
    clean_content = clean_content.translate(translation)

    # Raw string: "\s" is a regex class, not a Python string escape.
    return re.sub(r'\s+', ' ', clean_content).strip()

def check_if_should_be_skipped(clean_title, clean_question_body, clean_answer_body):
    """Decide whether a question/answer pair should be left out of the dataset.

    A pair is skipped when the question or the answer contains a blacklisted
    token (currently ``<code>``), or when the answer is shorter than 10 or
    longer than 500 characters.

    Args:
        clean_title: title text; accepted for symmetry but not inspected.
        clean_question_body: question text searched for blacklisted tokens.
        clean_answer_body: answer text searched for blacklisted tokens and
            checked against the length limits.

    Returns:
        True if the pair should be skipped, False otherwise.
    """
    blacklisted_tokens = ['<code>']
    for blacklisted_token in blacklisted_tokens:
        if blacklisted_token in clean_question_body or blacklisted_token in clean_answer_body:
            return True

    # The length check uses the argument that was passed in (previously it
    # read a module-level ``answer_body``) and runs once, outside the loop.
    return not 10 <= len(clean_answer_body) <= 500

# Index every post (questions and answers alike) by its id so accepted
# answers can be looked up directly later on.
all_posts = {}
for row in questions_data['row']:
    row_data = {
        "body":         row['@Body'],
        "post_type_id": row['@PostTypeId'],
        # Rows without a title get the "missing" placeholder; the original
        # code assigned this default to a local that was never used.
        "title":        row.get('@Title', "missing"),
    }

    if '@AcceptedAnswerId' in row:
        row_data['accepted_answer_id'] = row['@AcceptedAnswerId']

    all_posts[row['@Id']] = row_data

# CREATE QUESTION / ANSWER PAIRS
print("Creating question/answer pairs..")
question_answer_pairs = []
for post_id, post in all_posts.items():
    # Only questions (post type "1") that have an accepted answer are paired.
    if post['post_type_id'] != "1" or 'accepted_answer_id' not in post:
        continue

    accepted_answer_id = post['accepted_answer_id']
    accepted_answer = all_posts.get(accepted_answer_id)
    if accepted_answer is None:
        # The accepted answer is not present in the loaded posts; skip the
        # question instead of failing with a KeyError.
        continue

    question_body = post['body']
    title         = post.get('title', "missing")
    answer_body   = accepted_answer['body']

    clean_question_body = clean_text_body(question_body)
    clean_title_body    = clean_text_body(title)
    clean_answer_body   = clean_text_body(answer_body)

    # NOTE(review): the raw answer body (not clean_answer_body) is passed
    # here, so the '<code>' check and the length limits apply to the HTML
    # source — presumably intentional, confirm.
    should_skip = check_if_should_be_skipped(clean_title_body, clean_question_body, answer_body)
    if not should_skip:
        # Columns: id, title, answer, classification, classified_by_model
        question_answer_pairs.append([post_id, clean_title_body, clean_answer_body, "unknown", "no"])
        
# CREATE DATAFRAME
print("Creating dataframe..")
# Build the frame straight from the row lists. Going through an in-memory CSV
# and pd.read_csv lets the CSV parser turn text such as "", "nan" or "null"
# into NaN, and fails outright when there are no rows.
question_answer_df = pd.DataFrame(
    question_answer_pairs,
    columns=['id', 'title', 'answer', 'classification', 'classified_by_model'],
)
# Post ids arrive as strings; keep the integer dtype read_csv used to infer.
question_answer_df['id'] = question_answer_df['id'].astype(int)

print("Created a dataframe with %d rows" % question_answer_df.shape[0])
question_answer_df.head(3)

Creating question/answer pairs..
Creating dataframe..
Created a dataframe with 2645 rows


Unnamed: 0,id,title,answer,classification,classified_by_model
0,13,where can i find up to date information about roadblocks and strikes in peru and bolivia,g adventures provide a lot of south american tours they also monitor any safety updates in that region for their travellers heres a timeline of updates and safety info within the last few months http //www gadventures com/safety updates/,unknown,no
1,16,is there a good website to plan a trip via trains in europe,seat 61 is the absolute definitive guide for international rail travel it has all the information you need about routes prices and schedule it also has plenty of links to the places where you can price up and buy tickets and where to buy them if you cant buy them online http //seat61 com/ if you have more specific questions you can ask them here obviously,unknown,no
2,27,what are some good ways to find things to explore on site in an unfamiliar place,the best advice you can get is from the guy that work late night in your hotel ask for things that he/she does in their free hours why late night usually they have some concierge services that work only during the day and try to give you the standard advices i also got some very good deals asking taxi drivers but the better ones were from bus drivers so the advice is ask someone local try something like what would you do if you want to,unknown,no


# CREATE A TOPIC CLASSIFIER
##### To bootstrap labels for this test dataset, I tag every question whose title contains the keyword 'visa' as belonging to the 'visa' topic

In [71]:
# Seed labels: any title containing the literal substring "visa" is tagged.
# regex=False matches the keyword literally; na=False treats missing titles
# as "no match" so the boolean mask never contains NaN (which .loc rejects).
is_visa_title = question_answer_df['title'].str.contains('visa', regex=False, na=False)
question_answer_df.loc[is_visa_title, "classification"] = "visa_question"
question_answer_df[question_answer_df['classification'] == "visa_question"].head(5)

Unnamed: 0,id,title,answer,classification,classified_by_model
13,295,can i apply for a russian visa from the country georgia,embassy of switzerland in georgia russian federation interests section is issuing the russian visas in tbilisi but not the touristic ones unfortunately so you must apply for a visa in australia,visa_question,no
15,350,can i pay for my turkish visa on arrival at the airport in euros,the visas used to be 15€ or 20$ and paying in turkish liras was not an option there was an atm right next to the visa booth however as of the 10th of april this year visas are now applied and paid for online at https //www evisa gov tr/en/ eu citizens can get the visa at electronic booths in the airport but getting it online prior to arrival is cheaper,visa_question,no
28,573,getting a chinese tourist visa with an expired uk criminal conviction,asked a friend about this one who applied years ago he was denied even though his conviction was spent it might be based on the severity of his conviction but i cannot attest to that the best advice i was given was go to hong kong and apply for a chinese visa there according to sources (other travelers nothing written you are more likely to get one there if you dont want to go all the way to hong kong to try (which is understandable apply and see if he gets one,visa_question,no
31,701,how long does it take to process a french working holiday visa,the french consulate web site says the working holiday visa takes an average of 2 weeks but 1 week is generally ok we have an applicant of our ski jobs france program applying tomorrow so will let you know the latest processing time the quickest time this year has been 3 days but it does vary,visa_question,no
40,934,penalty for overstaying tourist visa in taiwan,penalties are levied via a fine the fine ranges from around 1000 ntd (if you overstay by an hour up to 10000 ntd source (page 5 http //iff immigration gov tw/public/data/11714474471 pdf,visa_question,no


##### Then we will create a model based on the tagged data

In [15]:
def create_classifier_model(question_answer_df, feature_column, op_penalty, op_ngram, op_alpha):
    """Train and evaluate a TF-IDF + SGD text classifier on one text column.

    Args:
        question_answer_df: dataframe with the text column and a
            'classification' column holding the labels.
        feature_column: "title" or "answer"; the column used as input text.
        op_penalty: ``penalty`` passed to SGDClassifier.
        op_ngram: ``ngram_range`` passed to TfidfVectorizer.
        op_alpha: ``alpha`` passed to SGDClassifier.

    Returns:
        A Data object carrying X, y_names, label_encoder, label_dict, y, the
        train/test splits, the fitted pipeline (``clf``) and the test-set
        predictions (``y_pred``).

    Raises:
        ValueError: if feature_column is neither "title" nor "answer".
    """
    if feature_column not in ("title", "answer"):
        raise ValueError("Unknown feature column: %s" % feature_column)

    model_data = Data()
    model_data.X = question_answer_df[feature_column].values
    model_data.y_names = question_answer_df['classification'].values

    # Map label names to integer ids and keep the reverse lookup around.
    encoder = preprocessing.LabelEncoder().fit(model_data.y_names)
    model_data.label_encoder = encoder
    model_data.label_dict = dict(enumerate(encoder.classes_))
    model_data.y = encoder.transform(model_data.y_names)

    # Hold out 20% of the rows for evaluation; fixed seed for a stable split.
    split = train_test_split(model_data.X, model_data.y, random_state=0, test_size=.2)
    model_data.X_train, model_data.X_test, model_data.y_train, model_data.y_test = split

    op_analyzer = 'word'
    vectorizer = TfidfVectorizer(analyzer=op_analyzer, strip_accents=None, ngram_range=op_ngram)
    classifier = SGDClassifier(alpha=op_alpha, penalty=op_penalty)
    model_data.clf = Pipeline([('vect', vectorizer), ('clf', classifier)])

    print('Fitting the model...')
    model_data.clf.fit(model_data.X_train, model_data.y_train)

    print('Predicting on the test set...')
    model_data.y_pred = model_data.clf.predict(model_data.X_test)

    print("Results:")
    report_labels = [name for name in encoder.classes_ if name != ' ']
    report = metrics.classification_report(
        model_data.y_test, model_data.y_pred, target_names=report_labels, digits=3)
    print(report)

    cm = metrics.confusion_matrix(model_data.y_test, model_data.y_pred)
    print("Confusion matrix:")
    print(cm)

    return model_data

print("Creating model based on questions tagged for the VISA topic")
# Title-based model: L1 penalty, word n-grams of length 1 to 5.
questions_model_data = create_classifier_model(
    question_answer_df,
    'title',
    op_penalty="l1",
    op_ngram=(1, 5),
    op_alpha=1e-4,
)


Creating model based on questions tagged for the VISA topic
Fitting the model...
Predicting on the test set...
Results:
               precision    recall  f1-score   support

      unknown      1.000     1.000     1.000       429
visa_question      1.000     1.000     1.000       100

     accuracy                          1.000       529
    macro avg      1.000     1.000     1.000       529
 weighted avg      1.000     1.000     1.000       529

Confusion matrix:
[[429   0]
 [  0 100]]


## USE INITIAL CLASSIFIER TO EXPAND OUR DATASET OF VISA QUESTIONS

In [28]:
def predict_on_dataframe_using_model(dataframe, column_feature, threshold, classified_by,
                                     update_dataframe, model_data=None):
    """Score every still-"unknown" row and collect those predicted as visa topic.

    Args:
        dataframe: frame with a 'classification' column, a
            'classified_by_model' column and the column named by
            column_feature. It is modified in place when update_dataframe
            is true.
        column_feature: name of the text column passed to the classifier.
        threshold: decision-function value a row must exceed to be tagged.
        classified_by: value written to 'classified_by_model' for tagged rows.
        update_dataframe: when false, nothing is written back (dry run).
        model_data: trained model object exposing ``clf``; defaults to the
            module-level ``questions_model_data`` for backward compatibility.

    Returns:
        List of the feature texts that were predicted as visa topic.
    """
    if model_data is None:
        model_data = questions_model_data

    possible_features_to_tag = []
    # Operate on the frame that was passed in rather than on the global one.
    for index in dataframe.index:
        if dataframe.at[index, 'classification'] != "unknown":
            continue
        feature = dataframe.at[index, column_feature]
        # predict the classification
        if predict_if_visa_topic(model_data, feature, threshold):
            print("Updating title (%s) in dataframe as visa topic" % feature)
            possible_features_to_tag.append(feature)
            if update_dataframe:
                dataframe.at[index, 'classification']      = "visa_question"
                dataframe.at[index, 'classified_by_model'] = classified_by

    return possible_features_to_tag

def predict_if_visa_topic(conversations, title, threshold):
    """Return True when the model's decision score for *title* exceeds *threshold*.

    Args:
        conversations: object whose ``clf`` attribute provides
            ``decision_function``.
        title: the single text to score.
        threshold: value the decision score must be strictly greater than.
    """
    decision_scores = conversations.clf.decision_function([title])
    # One document in, one score out; bool() collapses the comparison result
    # the same way an ``if`` statement would.
    return bool(decision_scores > threshold)
    
print("Will now try to find more titles that might relate to the Visa topic based on the question classifier..\n")
# Dry run: update_dataframe=False, so candidates are printed and returned
# but nothing is written back to the dataframe.
possible_features_to_tag = predict_on_dataframe_using_model(
    dataframe=question_answer_df,
    column_feature='title',
    threshold=-0.99,
    classified_by="question_based_model",
    update_dataframe=False,
)
question_answer_df[question_answer_df['classified_by_model'] == "question_based_model"]


Will now try to find more titles that might relate to the Visa topic based on the question classifier..

Updating title (dual citizen with a canadian passport) in dataframe as visa topic
Updating title (where do the singers sing on the macys thanksgiving day parade route) in dataframe as visa topic
Updating title (why does immigration check passports at some schengen only terminals) in dataframe as visa topic
Updating title (is there a website that will tell me the duration of a flight) in dataframe as visa topic
Updating title (does a uk/us dual citizen toddler need an esta to visit the us) in dataframe as visa topic
Updating title (country of residence for booking a train ticket for intra schengen travel) in dataframe as visa topic
Updating title (multiple entry into korea) in dataframe as visa topic
Updating title (syrian citizen with schengen residence permit traveling to romania) in dataframe as visa topic
Updating title (travel within the eu with only a romanian id card) in dataf

Unnamed: 0,id,title,answer,classification,classified_by_model


## NOW WE CAN TAKE A LOOK AT THE ANSWERS OF THE TAGGED QUESTIONS

In [29]:
# First five answers belonging to questions currently tagged as visa topic.
question_answer_df.loc[
    question_answer_df['classification'] == "visa_question",
    ['answer', 'classification'],
].head(5)

Unnamed: 0,answer,classification
13,embassy of switzerland in georgia russian federation interests section is issuing the russian visas in tbilisi but not the touristic ones unfortunately so you must apply for a visa in australia,visa_question
15,the visas used to be 15€ or 20$ and paying in turkish liras was not an option there was an atm right next to the visa booth however as of the 10th of april this year visas are now applied and paid for online at https //www evisa gov tr/en/ eu citizens can get the visa at electronic booths in the airport but getting it online prior to arrival is cheaper,visa_question
28,asked a friend about this one who applied years ago he was denied even though his conviction was spent it might be based on the severity of his conviction but i cannot attest to that the best advice i was given was go to hong kong and apply for a chinese visa there according to sources (other travelers nothing written you are more likely to get one there if you dont want to go all the way to hong kong to try (which is understandable apply and see if he gets one,visa_question
31,the french consulate web site says the working holiday visa takes an average of 2 weeks but 1 week is generally ok we have an applicant of our ski jobs france program applying tomorrow so will let you know the latest processing time the quickest time this year has been 3 days but it does vary,visa_question
40,penalties are levied via a fine the fine ranges from around 1000 ntd (if you overstay by an hour up to 10000 ntd source (page 5 http //iff immigration gov tw/public/data/11714474471 pdf,visa_question


## CREATE A CLASSIFIER BASED ON THE ANSWERS

In [44]:
print("Creating model based on answers tagged for the VISA topic")
# Answer-based model: L2 penalty, word n-grams of length 1 to 10.
answers_model_data = create_classifier_model(
    question_answer_df,
    'answer',
    op_penalty="l2",
    op_ngram=(1, 10),
    op_alpha=1e-5,
)

Creating model based on answers tagged for the VISA topic
Fitting the model...
Predicting on the test set...
Results:
               precision    recall  f1-score   support

      unknown      0.939     0.935     0.937       429
visa_question      0.725     0.740     0.733       100

     accuracy                          0.898       529
    macro avg      0.832     0.837     0.835       529
 weighted avg      0.899     0.898     0.898       529

Confusion matrix:
[[401  28]
 [ 26  74]]


## FIND NEW QUESTIONS PREVIOUSLY UNKNOWN WITH A PARTICULAR THRESHOLD

In [45]:
threshold = 0.3
print("Will now try to find more titles that might relate to the Visa topic based on the answer classifier:\n")
for index in question_answer_df.index:
    # Rows that already carry a label are left alone.
    if question_answer_df.at[index, 'classification'] != "unknown":
        continue
    title = question_answer_df.at[index, 'title']
    # NOTE(review): the model trained on answers is scored against the
    # title text here, not the answer text — confirm this is intended.
    predicted_as_visa_topic = predict_if_visa_topic(answers_model_data, title, threshold)
    if predicted_as_visa_topic is not True:
        continue
    question_answer_df.at[index, 'classification']      = "visa_question"
    question_answer_df.at[index, 'classified_by_model'] = "answers_based_model"

print("Now displaying the tagged questions found through the answers classification model")
question_answer_df[question_answer_df['classified_by_model'] == "answers_based_model"]

Will now try to find more titles that might relate to the Visa topic based on the answer classifier:

Now displaying the tagged questions found through the answers classification model


Unnamed: 0,id,title,answer,classification,classified_by_model
149,4020,how long does it take for a global entry application to be approved,although they say the process may take 4 6 weeks when i did it it only took a couple of weeks after submitting my application at that point i had to make an appointment to go down to the airport for an interview i was able to find an appointment date easily the process at the airport took about half an hour and then i was good to go right away,visa_question,answers_based_model
528,21030,can i visit uk with italian stay permit,the uk is not part of the schengen agreement so you would need a seperate visa the only situation you wouldnt is discussed inthis answer,visa_question,answers_based_model
856,43516,two passports entering schengen,if no other rules (like you are a citizen apply then you can choose whichever passport is more convenient for you if you have one schengen and one non schengen passport it would make little sense to use the non schengen passport in europe unless you really like lines and forms,visa_question,answers_based_model
963,49401,uk visitor entry clearance application refused impact on future travel,its common as you also missed attaching the required documents it happens i would suggest you to re apply with those documents missed chance of getting approved is more if it valid it wont affect your future applications,visa_question,answers_based_model
1285,68323,why does immigration check passports at some schengen only terminals,as of 2016 the manned passport checks at prague airport were replaced with boarding pass scanners as a matter of fact i didnt once have to show an id flying prg bcn today therefore the question is now moot,visa_question,answers_based_model
1520,81321,flight between non schengen countries with schengen connection after exhausting 90/180 limit,for the purpose of calculating the duration of stay in the schengen area you need only consider travel that passes through schengen border controls as you are in transit in cdg you should not have to pass through border control you will want to make sure of this though if you have already reached your maximum of 90 days an airport transit visa is not necessary and wont help in any case,visa_question,answers_based_model
1951,102658,multiple entry into korea,no its a single entry jeju is part of south korea so a visa valid for south korea is valid for jeju as well,visa_question,answers_based_model
1983,103853,syrian citizen with schengen residence permit traveling to romania,as stated in timatic the database used by airlines visa required except for passengers with a residence permit issued by netherlands for a maximum stay of 90 days so no he does not need a visa he just has to present his passport and residence permit card at the border,visa_question,answers_based_model
2029,105767,does the transit at budapest on a train from belgrade to zagreb count as staying in the schengen area,yes this counts as entering the schengen area if you will be in hungary at midnight it will even count as two days with trains youre generally processed into and out of each country when you actually cross the border there are some exceptions but the trains you are contemplating are not among them,visa_question,answers_based_model
2036,106049,can i change the travel dates on my schengen application form before the date of the interview,as mentioned in my comment this is a total non issue simply point it out as you hand over the documents in my case i was told that it was fine i didnt need to do anything,visa_question,answers_based_model


# SAMPLE CLUSTERS FOR ACCEPTANCE

In [67]:
def cluster_data(documents, clusters=3, features=1000):
    """Group text documents into clusters based on their TF-IDF vectors.

    Args:
        documents: sequence of strings to cluster.
        clusters: number of clusters requested from AgglomerativeClustering.
        features: ``max_features`` cap for the TF-IDF vocabulary.

    Returns:
        Dict mapping each cluster label to the list of documents assigned to
        it, in input order.
    """
    vectorizer = TfidfVectorizer(stop_words='english', analyzer='word', strip_accents='ascii',
                                 ngram_range=(1, 8), max_features=features)
    X = vectorizer.fit_transform(documents)
    cluster_model = AgglomerativeClustering(n_clusters=clusters)
    # The sparse TF-IDF matrix is converted to a dense array before clustering.
    labels = cluster_model.fit_predict(X.toarray())

    # Pair each label with its document by iteration order rather than by
    # positional indexing, so inputs that are not positionally indexable
    # (e.g. a Series with a custom index) work as well.
    clustered_titles = {}
    for label, document in zip(labels, documents):
        clustered_titles.setdefault(label, []).append(document)

    return clustered_titles

# Cluster only the titles that the answers-based model tagged.
tagged_by_answers_model = question_answer_df['classified_by_model'] == "answers_based_model"
titles_to_cluster = question_answer_df.loc[tagged_by_answers_model, 'title'].values
clustered_titles = cluster_data(titles_to_cluster)
for cluster in sorted(clustered_titles):
    print("Cluster %d:\n%s" % (cluster, clustered_titles[cluster]))


Cluster 0:
['how long does it take for a global entry application to be approved', 'two passports entering schengen', 'uk visitor entry clearance application refused impact on future travel', 'why does immigration check passports at some schengen only terminals', 'flight between non schengen countries with schengen connection after exhausting 90/180 limit', 'multiple entry into korea', 'syrian citizen with schengen residence permit traveling to romania', 'can i change the travel dates on my schengen application form before the date of the interview', 'will i get refund on my application if i cancel now', 'schengen change port of entry', 'what is the impact in travel history after visiting russian federation can i apply for schengen']
Cluster 1:
['does the transit at budapest on a train from belgrade to zagreb count as staying in the schengen area', 'staying 92 days in the schengen area', 'returning to us after illegally working in the schengen area', 'travelling into schengen area afte