In [58]:
import pandas as pd

import pickle

from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")

In [59]:
# Raw inputs: the pre-computed per-post scores and the search API results.
PreContentScore = pd.read_csv("PreContentScore.csv", engine='python')
queries_df = pd.read_csv("SearchApiResults.csv")

# Keep only the columns used downstream and drop rows that have no ContentId.
queries_df = (
    queries_df[['Query', 'url', 'ContentId', 'fancy_title']]
    .dropna(subset=['ContentId'])
)

# Skip the first three columns of the score file, then drop rows that have
# no ContentId.
PreContentScore_clean = (
    PreContentScore
    .iloc[:, 3:]
    .dropna(subset=['ContentId'])
)

In [60]:
# Attach the pre-computed scores to every search hit; rows whose ContentId
# appears in only one of the two frames are dropped (inner join).
all_query_df = queries_df.merge(PreContentScore_clean, how='inner', on='ContentId')
all_query_df.head()

Unnamed: 0,Query,url,ContentId,fancy_title,Date,UserID,Title,FormattedBody,TotalViews,Source,SourceScore,RecencyRate,AuthorScore,TopicID,PreContentScore
0,brakes during taxi,https://forum.cirruspilots.org/t/41040/5,6D601E0C-14D7-41DB-BAEF-3A6333537AB8,Cirrus SR22 Brake problems,2012-11-21,3695.0,,"[quote user=""David Martin""]\r\nRobert that's w...",,Forum,2.0,0.000161,19536.0,9.0,0.035
1,brakes during taxi,https://forum.cirruspilots.org/t/65942/2,ADCD2721-0351-4DD7-8B07-AE423D3420F0,cirrus sr22 brake wear,2012-12-11,10652.0,,[8)],,Forum,2.0,0.000162,19902.0,0.0,0.0
2,brakes during taxi,https://forum.cirruspilots.org/t/25175/7,2A423C4C-B3E0-4C9C-95D1-A0A86ACE1465,"Overheated, Failed Brakes",2005-04-20,3156.0,,"Yes, the other guy had plenty of time to go ar...",,Forum,2.0,7.9e-05,2300.0,1.0,0.0
3,brakes during taxi,https://forum.cirruspilots.org/t/68466/1,AB750CAD-7103-48DC-BB4A-8B82B792B33E,Brake fire,2013-06-14,14208.0,,"Got a fresh annual. On taxi, brakes clamped n...",,Forum,2.0,0.000174,14.0,6.0,0.0
4,brakes during taxi,https://forum.cirruspilots.org/t/26578/1,97F47B9B-F055-4F1D-9C56-8B5FF2A3D32C,Brake Failure and Fire,2005-08-12,4281.0,,"Today, we had the right landing gear in our ai...",,Forum,2.0,8.1e-05,340.0,6.0,0.0


__Load the pre-trained LDA model__

In [61]:
# Load the pre-trained LDA model and its dictionary.
# NOTE: pickle.load can execute arbitrary code while deserializing, so these
# files must come from a trusted source.
# The `with` blocks make sure each file handle is closed after loading.
filename = 'lda_model.pickle'
with open(filename, 'rb') as infile:
    lda_model = pickle.load(infile)

filename = 'id2word.pickle'
with open(filename, 'rb') as infile:
    id2word = pickle.load(infile)

# English stop words, used when tokenizing search queries.
stop_words = stopwords.words('english')

__Re-rank search results__

In [62]:
def topic_match(search_query):
    '''
    Find the dominant topic of a search query with the pre-trained LDA model.

    The query is lower-cased, split on whitespace and stripped of stop words,
    turned into a bag-of-words with `id2word`, and handed to `lda_model`.

    Returns the entry of the query's topic distribution whose second element
    is the largest (presumably a `(topic_id, probability)` pair -- confirm
    against the LDA library version in use).
    '''
    tokens = []
    for token in search_query.lower().split():
        if token not in stop_words:
            tokens.append(token)

    # The model is queried with a one-document corpus; take that document's result.
    bow_corpus = [id2word.doc2bow(tokens)]
    topic_distribution = lda_model.get_document_topics(bow_corpus)[0]

    def weight(entry):
        return entry[1]

    return max(topic_distribution, key=weight)

In [63]:
def keyword_from_query(query, stop_list=None):
    '''
    Extract keywords from a search query.

    The query is split on whitespace and every stop word is removed. The
    stop-word check is case-insensitive (so "The" is dropped just like "the"),
    but the surviving words keep their original casing.

    Parameters
    ----------
    query : str
        The raw search query.
    stop_list : collection of str, optional
        Lower-case stop words to remove. Defaults to the module-level
        `stop_words`.

    Returns
    -------
    list of str
        The query words that are not stop words, in their original order.
    '''
    if stop_list is None:
        stop_list = stop_words

    query_word = [word for word in query.split() if word.lower() not in stop_list]

    return query_word

In [64]:
def ratio_match(post, keywords=None):
    '''
    Score a post by how densely the query keywords occur in it.

    For every distinct keyword, the number of (case-sensitive, substring)
    occurrences in `post` is divided by the length of `post` in characters;
    the ratios are summed.

    Parameters
    ----------
    post : str
        Text of the post. Missing values (NaN/None), other non-string values
        and empty strings score 0 instead of raising.
    keywords : iterable of str, optional
        Keywords to look for. Defaults to the module-level `query_word`, which
        the ranking loop sets before applying this function.

    Returns
    -------
    float
        Sum of the per-keyword occurrence ratios (0 when there are no keywords).
    '''
    if keywords is None:
        keywords = query_word

    # Guard against missing bodies and against dividing by a zero length.
    if not isinstance(post, str) or not post:
        return 0.0

    length = len(post)
    return sum(post.count(word) / length for word in set(keywords))

In [65]:
def count_match(post, keywords=None):
    '''
    Count how often the query keywords occur in a post.

    For every distinct keyword, the number of (case-sensitive, substring)
    occurrences in `post` is added up.

    Parameters
    ----------
    post : str
        Text of the post. Missing values (NaN/None) and other non-string
        values count as 0 instead of raising.
    keywords : iterable of str, optional
        Keywords to look for. Defaults to the module-level `query_word`, which
        the ranking loop sets before applying this function.

    Returns
    -------
    int
        Total number of keyword occurrences.
    '''
    if keywords is None:
        keywords = query_word

    # A missing body has no .count(); treat it as containing no keywords.
    if not isinstance(post, str):
        return 0

    return sum(post.count(word) for word in set(keywords))

In [66]:
def Normalization(col, df):
    '''
    Min-max normalize one column of a dataframe to the range [0, 1].

    Parameters
    ----------
    col : str
        Name of the column to normalize.
    df : pandas.DataFrame
        Dataframe holding the column; it is not modified.

    Returns
    -------
    pandas.Series
        `(df[col] - min) / (max - min)`. When every value is the same (for
        example a query with a single search result) the range is zero; in
        that case zeros are returned instead of the NaNs a 0/0 division would
        produce.
    '''
    values = df[col]
    lowest = values.min()
    spread = values.max() - lowest
    shifted = values - lowest

    if spread == 0:
        # Constant column: `shifted` is already all zeros (missing values stay missing).
        return shifted.astype(float)

    return shifted / spread

In [100]:
def assign_weights(weights, df):
    '''
    Rank rows by a weighted sum of their three score columns.

    `weights[0]`, `weights[1]` and `weights[2]` multiply `RecencyRate`,
    `AuthorScore` and `TotalContentScore` respectively. The sum is stored in a
    new `TotalScore` column of a copy of `df` (the input is left untouched),
    and the copy is returned sorted by `TotalScore`, highest first.
    '''
    recency_part = df['RecencyRate'] * weights[0]
    author_part = df['AuthorScore'] * weights[1]
    content_part = df['TotalContentScore'] * weights[2]

    scored = df.assign(TotalScore=recency_part + author_part + content_part)

    return scored.sort_values("TotalScore", ascending=False)

In [101]:
def reset_index(df):
    '''
    Replace the index of `df` with a fresh 0..n-1 range, in place.

    The previous index is discarded rather than kept as a column. The same
    dataframe object is returned so the call can be chained.
    '''
    df.index = pd.RangeIndex(len(df))

    return df

In [111]:
# Re-rank the search results of every query under three sets of weights and
# collect the rankings into comb1 / comb2 / comb3.

# Multiplier applied to PreContentScore when a post's topic equals the query's topic.
TOPIC_MATCH_BOOST = 5
# Shares of the topic-based content score and the keyword match score in TotalContentScore.
CONTENT_SCORE_WEIGHT = 0.3
MATCH_WEIGHT = 0.7

# Weights for [RecencyRate, AuthorScore, TotalContentScore]; one ranking per set.
# NOTE(review): the first set sums to 19/18 rather than 1 -- confirm that 2/9 is intended.
WEIGHT_SETS = [
    [1/2, 2/9, 1/3],
    [1/5, 3/10, 1/2],
    [1/3, 1/3, 1/3],
]

# One list of per-query rankings for each weight set; concatenated once after the loop.
ranked_per_set = [[] for _ in WEIGHT_SETS]

# Missing queries cannot be tokenized or matched, so they are skipped.
query_list = list(all_query_df['Query'].dropna().unique())
for query in query_list:
    # Take the subset of search results for this query. Work on a copy so the
    # column assignments below never write into a slice of all_query_df.
    sub_dataset = all_query_df[all_query_df['Query'] == query].copy()
    sub_dataset['ContentScore'] = sub_dataset['PreContentScore']

    # topic_match returns the best entry of the query's topic distribution;
    # its first element is taken as the topic id (per the LDA library's
    # (topic_id, probability) pairs). Comparing the whole pair against the
    # TopicID column would never select the matching rows.
    topic_id = topic_match(query)[0]

    # Boost the content score of posts that share the query's topic.
    same_topic = sub_dataset['TopicID'] == topic_id
    sub_dataset.loc[same_topic, 'ContentScore'] = (
        sub_dataset.loc[same_topic, 'PreContentScore'] * TOPIC_MATCH_BOOST
    )

    # ratio_match reads the module-level `query_word`, so set it before applying.
    query_word = keyword_from_query(query)
    sub_dataset['Match'] = sub_dataset['FormattedBody'].apply(ratio_match)

    # Normalize data
    for col in ['RecencyRate', 'AuthorScore', 'ContentScore', 'Match']:
        sub_dataset[col] = Normalization(col, sub_dataset)

    sub_dataset['TotalContentScore'] = (
        CONTENT_SCORE_WEIGHT * sub_dataset['ContentScore']
        + MATCH_WEIGHT * sub_dataset['Match']
    )

    # Rank this query's results once per weight set.
    for bucket, weights in zip(ranked_per_set, WEIGHT_SETS):
        bucket.append(assign_weights(weights, sub_dataset))

# Combine the search results of all queries (a single concat per weight set)
# and renumber the rows. With no queries at all, fall back to an empty frame
# that still has the input columns.
comb1, comb2, comb3 = (
    reset_index(pd.concat(bucket)) if bucket else all_query_df.iloc[0:0].copy()
    for bucket in ranked_per_set
)

In [107]:
# Export the three re-ranked result sets (query, link, id and title only).
# NOTE(review): these files are named "count", but the ranking loop in this
# notebook fills `Match` via ratio_match; switch that call to count_match and
# re-run the loop before this cell if count-based rankings are intended.
export_columns = ['Query', 'url', 'ContentId', 'fancy_title']
for csv_name, ranked in [
    ('count_comb1.csv', comb1),  # 1/2, 2/9, 1/3
    ('count_comb2.csv', comb2),  # 1/5, 3/10, 1/2
    ('count_comb3.csv', comb3),  # 1/3, 1/3, 1/3
]:
    ranked[export_columns].to_csv(csv_name)

In [109]:
# Export the three ratio-based rankings (query, link, id and title only),
# one file per weight set.
export_columns = ['Query', 'url', 'ContentId', 'fancy_title']
for csv_name, ranked in [
    ('ratio_comb1.csv', comb1),  # 1/2, 2/9, 1/3
    ('ratio_comb2.csv', comb2),  # 1/5, 3/10, 1/2
    ('ratio_comb3.csv', comb3),  # 1/3, 1/3, 1/3
]:
    ranked[export_columns].to_csv(csv_name)