In [2]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,plot_confusion_matrix
from sklearn.metrics import accuracy_score,f1_score,classification_report,precision_score,recall_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

import spacy
import string
from string import digits
from bs4 import BeautifulSoup
from html import unescape
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
data = pd.read_csv('python_data .csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190234 entries, 0 to 190233
Data columns (total 19 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Unnamed: 0      190234 non-null  int64 
 1   Id              190234 non-null  int64 
 2   Title           190234 non-null  object
 3   QuestionBody    190234 non-null  object
 4   AnswerBody      190234 non-null  object
 5   QuestionTime    190234 non-null  object
 6   AnswerTime      190234 non-null  object
 7   AnswerTimeDiff  190234 non-null  int64 
 8   Tags            190234 non-null  object
 9   UserId          190234 non-null  int64 
 10  UserReputation  190234 non-null  int64 
 11  UserPageViews   190234 non-null  int64 
 12  UserUpVote      190234 non-null  int64 
 13  UserDownVotes   190234 non-null  int64 
 14  BadgeNum        190234 non-null  int64 
 15  Q_time_hr       190234 non-null  int64 
 16  Q_time_weekday  190234 non-null  int64 
 17  Q_Range         190234 non-nu

# 1 Text Cleaning 

In [4]:
# Data Cleaning
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

In [5]:
#en = spacy.load('en_core_web_sm')
#sw_spacy = en.Defaults.stop_words


stop_words = stopwords.words('english')



def remove_stopwords(text):
    stopwords_removed = ' '.join([word for word in text.split() if word not in stop_words])
    return stopwords_removed

In [6]:
# Setting for digit removing
remove_digits = str.maketrans('', '', digits)

In [7]:
# Setting for punctuation removing
remove_punkt = str.maketrans(string.punctuation,' '*len(string.punctuation))

In [8]:
lemmatizer = WordNetLemmatizer()
def get_lemmatized_text(text):
    lemmatized = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return lemmatized

In [9]:
def text_processing(df_text):
    #  lower case
    df_text = df_text.str.lower()
    
    #  Decode html
    df_text = df_text.apply(unescape)
    
    #  Remove html
    df_text = df_text.apply(lambda x: remove_html(x))
    
    #  Remove stopwords
    df_text = df_text.apply(lambda x: remove_stopwords(x))

    #  Remove digits
    df_text = df_text.apply(lambda x: x.translate(remove_digits))
    
    #  Remove punctuation
    df_text = df_text.apply(lambda x: x.translate(remove_punkt))
    
    #  Lemmatization
    df_text_processed = df_text.apply(lambda x: get_lemmatized_text(x))
    
    return df_text_processed

In [34]:
def trial_processing(text):
    #lower case
    text = text.lower()
    #remove html
    text = unescape(text)
    text = remove_html(text)
    #remove stopwords 
    text = remove_stopwords(text)
    #remove digits 
    text = text.translate(remove_digits)
    #remove punctuation 
    text = text.translate(remove_punkt)
    #Lemmatization
    text = get_lemmatized_text(text)
    return text

In [11]:
def clean_tags(tags):
    #clean tags
    tags = re.sub(r'[,<>.？:]', ' ', tags)
    return tags

In [12]:
#clean text 
data['clean_question'] = text_processing(data['QuestionBody'])
data['clean_title'] = text_processing(data['Title'])
data['clean_Tags'] = data['Tags'].apply(clean_tags)

In [146]:
#get trial question 
tag = data['Tags'][10]
title = data['Title'][10]
question = data['QuestionBody'][10]

# 2. Filtering with SimTitle and SimTag

# 2.1 Calculate the similarity from title 

In [117]:
#get the tf-idf vectorizer for title 
tf_title = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0,max_features =500)
tfidf_title = tf_title.fit_transform(data['clean_title'])

In [147]:
def sim_title(title):
    #cleaning the input 
    title = trial_processing(title)
    title_asSeries = pd.Series(title)
    #transform the string input into a matrix 
    title_matrix = tf_title.transform(title_asSeries)

    title_trial1 = title_matrix.tocsr().todense()
    #calculate the cosine similarity 
    cosine_similarities = cosine_similarity(title_trial1, tfidf_title)
    #get index 
    similarity_indices = cosine_similarities[0].argsort()[::-1]
    #ger id and cosine similarity 
    similar_items = [(cosine_similarities[0][i], data['Id'][i]) for i in similarity_indices]


    result_title = {}
    #put id and cosine similarity in the dict 
    for i in similar_items:
        result_title[i[1]] = i[0]
    
    
    return result_title

In [148]:
result_title = sim_title(title)

# 2.2 Calculate the similarity from tags 

In [150]:
#get the tf-idf vectorizer for tag
tag_vectorizer = TfidfVectorizer()
count_matrix = tag_vectorizer.fit_transform(data['clean_Tags'])

In [169]:
def sim_tags(tags):
    #clean tags 
    tags = clean_tags(tags)
    tags_asSeries = pd.Series(tags)
    #transform the string input into a matrix 
    tags_matrix = tag_vectorizer.transform(tags_asSeries)

    tag_trial1 = tags_matrix.tocsr().todense()
    #calculate the cosine similarity 
    cosine_similarities_tag = cosine_similarity(tag_trial1, count_matrix)
    #get index
    similarity_indices_tag = cosine_similarities_tag[0].argsort()[::-1]
    #ger id and cosine similarity 
    similar_items_tag = [(cosine_similarities_tag[0][i], data['Id'][i]) for i in similarity_indices_tag]

    #put id and cosine similarity in the dict 
    result_tag = {}

    for i in similar_items_tag:
        result_tag[i[1]] = i[0]
        
    return result_tag


#result_tag
result_tag = sim_tags(tag)


# 2.3 Filtering with threshold 

In [162]:
def filter_score(result_title, result_tags):
    #set alpha to be 0.9
    alpha = 0.9

    result_filter = {}
    #get title score and tag score
    for i in result_title.keys():
        title_score = result_title.get(i)
        tag_score = result_tag.get(i)
        #calculate the combined score 
        sim_score = alpha* title_score + (1-alpha)*tag_score
        #put it in the dict with id 
        result_filter[i] = sim_score
    results_ID = []
    #filter with threshold 0.2 
    for i in result_filter.keys():
        if result_filter[i] >= 0.2:
            results_ID.append(i)
    return results_ID     
        
    
result_ID = filter_score(result_title, result_tag)
len(result_ID)

2797

# 2.4  Get Clean Question Body for each ID after filtering

In [163]:
def get_clean_question (result_ID):
    result_text = {}

    for i in result_ID :
        #get filtered and cleaned question body by id 
        text = data.loc[data['Id'] == i]['clean_question'].values[0]
        #get index
        index = int(data.loc[data['Id'] == i].index.values)
        result_text[index] = text

    #transform to series 
    ser = pd.Series(data = result_text)
    return ser

clean_question = get_clean_question (result_ID)
    

# 3 Final Recommendation based on content of question body

In [164]:
#get question body values 
def item(id):
    return data.loc[data['Id'] == id]['QuestionBody'].values

In [214]:
def recommend (question, num):
    #fit a TF-IDF vectorizer 
    tf_question_body = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0,max_features =500)
    #transform to matrixs 
    tfidf_matrix = tf_question_body.fit_transform(clean_question)
    #clean input question
    Question_body_clean = trial_processing(question)
    
    QB_asSeries = pd.Series(Question_body_clean)
    #get transformed matrix 
    QB_matrix = tf_question_body.transform(QB_asSeries)

    QB_trial1 = QB_matrix.tocsr().todense()
    #calculate the similarity score 
    cosine_similarities = cosine_similarity(QB_trial1, tfidf_matrix)
    #return index of 50 question with higest score
    similarity_indices = cosine_similarities[0].argsort()[:-num-2:-1]
    #get highest score 
    sim_score = [cosine_similarities[0][i] for i in similarity_indices]
    #get index for raw dataset
    sim_index = []
    for i in similarity_indices:
        sim_index.append(clean_question.index.values[i])
    
    #get ID 
    sim_id = [(data['Id'][i]) for i in sim_index]
    
    result_id = sim_id[1:num+1]
    print("Input ID:")
    print(sim_id[0])
    print("\n")
    
    print("Recommend ID :")
    print(result_id)
    print("\n")
    
    print("Cosine Similarity Score:")
    print(sim_score[1:])
    print("\n")
    
    print("Recommending " + str(num) + " product similar to : \n" + Question_body_clean)
    print('\n')
    for i in result_id:
        print("Recommend: " + str(item(i)))
        print('\n')
 
    
    
    

In [215]:
recommend(question,5)

Input ID:
53992768


Recommend ID :
[59559519, 54066612, 59641251, 61289172, 58134427]


Cosine Similarity Score:
[0.5545847373713634, 0.5457575415558475, 0.5431425606030436, 0.5420573306987583, 0.5419112769870617]


Recommending 5 product similar to : 
data frame look like following df pd dataframe k one two k checking duplicate get boolean index df duplicated use filter df df duplicated show different result compare df drop duplicate additional row created result one


Recommend: ['<p>I have a following dataframe - </p>\n\n<pre class="lang-py prettyprint-override"><code>  print df\n\n  Name | Role   |\n  Mark | Admin  |\n  Mark | Admin. |\n\n  df = df.drop_duplicates()\n  print df\n\n  Name | Role  |\n  Mark | Admin |\n  Mark | Admin. |\n</code></pre>\n\n<p>I want to ignore any leading or preceding punctuations (full stop in this case) and drop duplicates.</p>\n\n<p>Expected output - </p>\n\n<pre class="lang-py prettyprint-override"><code>  df = df.drop_duplicates()\n  print df\n\n  