In [1]:
import pandas as pd

import pickle

from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Pre-computed per-post scores. The first three columns are dropped, and rows
# without a ContentId are discarded because they cannot be joined to search hits.
PreContentScore = pd.read_csv("PreContentScore.csv", engine='python')
PreContentScore_clean = PreContentScore.iloc[:, 3:]
PreContentScore_clean = PreContentScore_clean[PreContentScore_clean['ContentId'].notna()]

# Search API results: keep only the columns used downstream and, again,
# only the rows that carry a ContentId.
queries_df = pd.read_csv("SearchApiResults.csv")[['Query', 'url', 'ContentId', 'fancy_title']]
queries_df = queries_df[queries_df['ContentId'].notna()]

In [3]:
# Attach the pre-computed scores to every search hit that shares a ContentId
# (default merge behaviour: hits without a matching score row are dropped).
all_query_df = queries_df.merge(PreContentScore_clean, on='ContentId')
all_query_df.head()

Unnamed: 0,Query,url,ContentId,fancy_title,UserID,Title,FormattedBody,TotalViews,Source,SourceScore,RecencyRate,AuthorScore,TopicID,PreContentScore
0,brakes during taxi,https://forum.cirruspilots.org/t/41040/5,6D601E0C-14D7-41DB-BAEF-3A6333537AB8,Cirrus SR22 Brake problems,3695,,"[quote user=""David Martin""]\r\nRobert that's w...",,Forum,2,0.000163,19536,9,0.035
1,brakes during taxi,https://forum.cirruspilots.org/t/65942/2,ADCD2721-0351-4DD7-8B07-AE423D3420F0,cirrus sr22 brake wear,10652,,[8)],,Forum,2,0.000164,19902,0,0.0
2,brakes during taxi,https://forum.cirruspilots.org/t/25175/7,2A423C4C-B3E0-4C9C-95D1-A0A86ACE1465,"Overheated, Failed Brakes",3156,,"Yes, the other guy had plenty of time to go ar...",,Forum,2,8e-05,2300,1,0.0
3,brakes during taxi,https://forum.cirruspilots.org/t/68466/1,AB750CAD-7103-48DC-BB4A-8B82B792B33E,Brake fire,14208,,"Got a fresh annual. On taxi, brakes clamped n...",,Forum,2,0.000176,14,6,0.0
4,brakes during taxi,https://forum.cirruspilots.org/t/26578/1,97F47B9B-F055-4F1D-9C56-8B5FF2A3D32C,Brake Failure and Fire,4281,,"Today, we had the right landing gear in our ai...",,Forum,2,8.2e-05,340,6,0.0


__Load the pre-trained LDA model__

In [4]:
# Load the pre-trained LDA model and the id2word mapping it is used with.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
# The `with` blocks close each file handle as soon as it has been read.
filename = 'lda_model.pickle'
with open(filename, 'rb') as infile:
    lda_model = pickle.load(infile)

filename = 'id2word.pickle'
with open(filename, 'rb') as infile:
    id2word = pickle.load(infile)

# English stop words, removed from queries before topic and keyword matching.
stop_words = stopwords.words('english')

__Re-rank search results__

In [5]:
def topic_match(search_query):
    '''
    Find the dominant topic of a search query with the pre-trained LDA model.

    The query is lower-cased, split on whitespace, stripped of stop words and
    turned into a bag-of-words with `id2word` before it is handed to
    `lda_model`.

    Returns the entry of the query's topic list whose second element is the
    largest (presumably a `(topic_id, probability)` pair -- confirm against
    the gensim version in use). Callers that need only the id must unpack it.
    '''
    tokens = [tok for tok in search_query.lower().split() if tok not in stop_words]
    bow = id2word.doc2bow(tokens)
    # The model is queried with a one-document corpus; index 0 is that document.
    topics_per_doc = lda_model.get_document_topics([bow])
    best_topic = max(topics_per_doc[0], key=lambda entry: entry[1])

    return best_topic

In [6]:
def keyword_from_query(query):
    '''
    Extract the keywords of a search query.

    The query is lower-cased before it is split on whitespace, for two reasons:
    stop-word filtering then treats "The" and "the" alike (as `topic_match`
    already does), and the returned keywords can be found in post text that
    has been lower-cased for matching.

    Returns the remaining words as a list, in their original order.
    '''
    return [word for word in query.lower().split() if word not in stop_words]

In [7]:
def hit_WordPairs(query_word, wordPairs):
    '''
    Find the pre-defined word pairs that occur in the search query.

    The query words are joined with single spaces and lower-cased; a word pair
    is a hit when it appears as a substring of that string. Word pairs are
    compared as given (they are not lower-cased here).

    Returns the hits as a list, in the order they appear in `wordPairs`.
    '''
    haystack = ' '.join(query_word).lower()
    return [pair for pair in wordPairs if pair in haystack]

In [8]:
def ratio_wordpair_match(post, matched_wordpairs=None):
    '''
    Score how densely the query's word pairs occur in a post.

    Parameters
    ----------
    post : the post body. Anything that is not a non-empty string (for
        example the NaN pandas uses for a missing cell) scores 0 instead of
        raising.
    matched_wordpairs : optional list of lower-case word pairs to look for.
        When omitted, it is computed from the module-level `query_word` and
        `wordPairs`, which keeps `Series.apply(ratio_wordpair_match)` working.

    Returns
    -------
    The total number of (case-insensitive, substring) occurrences of the word
    pairs in the post, divided by the post length in characters.
    '''
    if not isinstance(post, str) or not post:
        return 0

    if matched_wordpairs is None:
        matched_wordpairs = hit_WordPairs(query_word, wordPairs)

    post_lower = post.lower()
    # Add the integer counts first and divide once, so the result does not
    # depend on floating-point summation order.
    hits = 0
    for pair in matched_wordpairs:
        hits += post_lower.count(pair)

    return hits / len(post)

In [9]:
def ratio_word_match(post, keywords=None):
    '''
    Score how densely the query's keywords occur in a post.

    Parameters
    ----------
    post : the post body. Anything that is not a non-empty string (for
        example the NaN pandas uses for a missing cell) scores 0 instead of
        raising.
    keywords : optional iterable of query keywords. When omitted, the
        module-level `query_word` is used, which keeps
        `Series.apply(ratio_word_match)` working.

    Returns
    -------
    The total number of (case-insensitive, substring) occurrences of the
    distinct keywords in the post, divided by the post length in characters.
    '''
    if not isinstance(post, str) or not post:
        return 0

    if keywords is None:
        keywords = query_word

    post_lower = post.lower()
    # Lower-case before de-duplicating: the post is searched in lower case, and
    # "Brake" and "brake" must not be counted twice.
    distinct_words = {word.lower() for word in keywords}
    # Integer counts are summed before the single division, so the result does
    # not depend on set iteration order.
    hits = sum(post_lower.count(word) for word in distinct_words)

    return hits / len(post)

In [10]:
def Normalization(col, df):
    '''
    Min-max scale one column of a dataframe.

    Parameters
    ----------
    col : name of the column to scale.
    df : dataframe holding that column; it is not modified.

    Returns
    -------
    A new Series with the same index as `df[col]`. When the column has a
    range, its minimum maps to 0 and its maximum to 1. When every value is
    the same (including a single-row frame), the result is all zeros rather
    than the NaN that 0/0 would produce.
    '''
    values = df[col]
    lowest = values.min()
    span = values.max() - lowest

    if span == 0:
        # Constant column: there is nothing to scale, and dividing by the zero
        # range would turn every score into NaN.
        return (values - lowest).astype(float)

    return (values - lowest) / span

In [11]:
def assign_weights(weights, df):
    '''
    Score and rank rows with one set of attribute weights.

    `weights[0]`, `weights[1]` and `weights[2]` multiply RecencyRate,
    AuthorScore and TotalContentScore respectively. The weighted sum is stored
    in a TotalScore column on a copy of `df`; that copy is returned sorted by
    TotalScore, highest first. `df` itself is left untouched.
    '''
    w_recency, w_author, w_content = weights[0], weights[1], weights[2]

    ranked = df.copy()
    ranked['TotalScore'] = (
        df['RecencyRate'] * w_recency
        + df['AuthorScore'] * w_author
        + df['TotalContentScore'] * w_content
    )

    return ranked.sort_values("TotalScore", ascending=False)

In [12]:
def reset_index(df):
    '''
    Renumber the rows of a dataframe 0..n-1, discarding the old index.

    The dataframe is modified in place and the same object is returned.
    '''
    df.index = pd.RangeIndex(len(df))
    return df

In [13]:
# Re-rank the search results of every query under three weighting schemes.
# Each weight set is applied in the order [RecencyRate, AuthorScore, TotalContentScore].
# NOTE(review): the first set sums to 19/18 rather than 1 -- confirm that 2/9 is intended.
WEIGHT_SETS = [
    [1/2, 2/9, 1/3],
    [1/5, 3/10, 1/2],
    [1/3, 1/3, 1/3],
]

# The word-pair list does not depend on the query, so it is read once up front.
wordPairs = [pair.lower() for pair in pd.read_csv('WordPairs.csv')['Word Pairs']]

# One list of per-query ranked frames per weight set; concatenated once after the loop.
ranked_parts = [[] for _ in WEIGHT_SETS]

query_list = list(all_query_df['Query'].unique())
for query in query_list:
    # Take the subset of search results from the query. The explicit copy makes
    # the column assignments below independent of all_query_df.
    sub_dataset = all_query_df[all_query_df['Query'] == query].copy()

    # Assign topic id to search query. topic_match returns the whole
    # (topic id, weight) entry; only the id can be compared with TopicID.
    topic_id, _topic_weight = topic_match(query)

    # Calculate content score: posts on the query's topic get their
    # pre-computed score multiplied by 5.
    same_topic = sub_dataset['TopicID'] == topic_id
    sub_dataset['ContentScore'] = sub_dataset['PreContentScore']
    sub_dataset.loc[same_topic, 'ContentScore'] = sub_dataset.loc[same_topic, 'PreContentScore'] * 5

    # ratio_word_match / ratio_wordpair_match read query_word (and wordPairs)
    # from module scope, so it has to be rebound before they are applied.
    query_word = keyword_from_query(query)
    sub_dataset['word_Match'] = sub_dataset['FormattedBody'].apply(ratio_word_match)
    sub_dataset['wordpair_Match'] = sub_dataset['FormattedBody'].apply(ratio_wordpair_match)

    sub_dataset['Match'] = sub_dataset['word_Match'] + sub_dataset['wordpair_Match']

    # Normalize data. Whole-column replacement (not .loc[:, col]) lets integer
    # columns such as AuthorScore become float.
    for col in ['RecencyRate', 'AuthorScore', 'ContentScore', 'Match']:
        sub_dataset[col] = Normalization(col, sub_dataset)

    sub_dataset['TotalContentScore'] = 0.3 * sub_dataset['ContentScore'] + 0.7 * sub_dataset['Match']

    # Rank this query's results once per weight set.
    for parts, weights in zip(ranked_parts, WEIGHT_SETS):
        parts.append(assign_weights(weights, sub_dataset))

# Combine search results for all search queries and renumber the rows.
comb1, comb2, comb3 = (reset_index(pd.concat(parts)) for parts in ranked_parts)

In [14]:
# Export the ranked hits of each weighting scheme; the row index is written
# as the first CSV column.
export_cols = ['Query', 'url', 'ContentId', 'fancy_title']
for out_name, ranked in [
    ('count_comb1.csv', comb1),  # 1/2, 2/9, 1/3
    ('count_comb2.csv', comb2),  # 1/5, 3/10, 1/2
    ('count_comb3.csv', comb3),  # 1/3, 1/3, 1/3
]:
    ranked[export_cols].to_csv(out_name)

In [15]:
# Export the ranked hits of each weighting scheme; the row index is written
# as the first CSV column.
# NOTE(review): these are the same comb1..comb3 frames written to the
# count_comb*.csv files, so a fresh top-to-bottom run yields identical
# count_* and ratio_* files -- confirm which matching variant each name
# is meant to hold.
export_cols = ['Query', 'url', 'ContentId', 'fancy_title']
for out_name, ranked in [
    ('ratio_comb1.csv', comb1),  # 1/2, 2/9, 1/3
    ('ratio_comb2.csv', comb2),  # 1/5, 3/10, 1/2
    ('ratio_comb3.csv', comb3),  # 1/3, 1/3, 1/3
]:
    ranked[export_cols].to_csv(out_name)