In [None]:
# Item-Based Collaborative Filtering
# Using Microsoft Academic Graph Dataset

In [None]:
# Part 1: Data Import, Cleaning and Feature Parsing

In [None]:
# Naive Approach:
# Import + filter data
import pandas as pd
# Load the line-delimited JSON sample and report its raw shape and columns.
model_df = pd.read_json('data/mag_papers_0/mag_subset20K.txt', lines=True)
print(model_df.shape)
print(model_df.columns)

# Keep English papers only, drop repeated titles (first occurrence wins) and
# discard the columns that are not used as features further down.
model_df = (
    model_df.loc[model_df['lang'] == 'en']
    .drop_duplicates(subset='title', keep='first')
    .drop(
        columns=[
            'doc_type', 'doi', 'id', 'issue', 'lang', 'n_citation',
            'page_end', 'page_start', 'publisher', 'references', 'url',
            'venue', 'volume',
        ]
    )
)
print(model_df.shape)

# Transforming feature arrays

# Collaborative filtering stage 1: build the item feature matrix.
# Vocabulary of every field-of-study label. A paper with no 'fos' entry is
# replaced by the string '0', so '0' itself ends up in the vocabulary.
unique_fos = sorted({
    fos_label
    for paper_fos in model_df['fos'].fillna('0')
    for fos_label in paper_fos
})

# Vocabulary of publication years, rendered as strings.
unique_year = sorted(model_df['year'].astype('str').unique())

def feature_array(x, var, unique_array):
    """Dummy-code a Series against a fixed vocabulary.

    Parameters
    ----------
    x : pandas.Series
        One value per item. A value that is a ``list`` is treated as a bag of
        labels; any other value is compared through its ``str()`` form.
    var : str
        Prefix for the generated column names (``var + '_' + label``).
    unique_array : sequence of str
        Vocabulary; one output column is produced per entry.

    Returns
    -------
    pandas.DataFrame
        One row per index label of ``x`` and one column per vocabulary entry,
        holding 1 where the entry matches and 0 otherwise. The frame is built
        with ``dtype='str'``.
    """
    encoded_rows = {}
    for row_label in x.index:
        cell = x[row_label]
        if type(cell) is list:
            # Bag of labels: flag every vocabulary entry contained in the list.
            flags = {
                var + '_' + token: int(token in cell)
                for token in unique_array
            }
        else:
            # Scalar: flag the single entry equal to the value's string form.
            cell_text = str(cell)
            flags = {
                var + '_' + token: int(token == cell_text)
                for token in unique_array
            }
        encoded_rows[row_label] = flags
    # from_dict lays items out as columns; transpose to get one row per item.
    return pd.DataFrame.from_dict(encoded_rows, dtype='str').T

# Dummy-code the publication year and the fields of study.
# feature_array(x, var, unique_array) needs the column-name prefix as its
# second argument; calling it with only two arguments raises a TypeError.
year_features = feature_array(model_df['year'], 'year', unique_year)
fos_features = feature_array(model_df['fos'], 'fos', unique_fos)
# Join both blocks on the paper index, then transpose so that every column is
# one paper and every row is one feature.
first_features = fos_features.join(year_features).T

from sys import getsizeof
print('Size of first feature array: ', getsizeof(first_features))

# Collaborative filtering stage 2: Search for similar items
from scipy.spatial.distance import cosine

def item_collab_filter(features_df):
    """Compute the pairwise cosine similarity between the columns of a frame.

    Parameters
    ----------
    features_df : pandas.DataFrame
        One column per item, one row per feature. Values must be numeric or
        strings that parse as numbers (they are coerced to float).

    Returns
    -------
    pandas.DataFrame
        Square frame indexed and labelled by ``features_df.columns`` whose
        ``[i, j]`` entry is ``1 - cosine(column_i, column_j)``.

    Raises
    ------
    ValueError
        If a feature value cannot be converted to float.
    """
    # Features may arrive string-typed ('0'/'1'); scipy's cosine needs numbers.
    numeric_features = features_df.astype(float)
    item_labels = numeric_features.columns
    item_similarities = pd.DataFrame(index=item_labels, columns=item_labels)
    for i in item_labels:
        for j in item_labels:
            # Single-step .loc[i, j] assignment: the chained form .loc[i][j]
            # may write into a temporary copy and leave the frame untouched.
            item_similarities.loc[i, j] = 1 - cosine(numeric_features[i], numeric_features[j])
    return item_similarities
first_items = item_collab_filter(first_features.loc[:, 0:1000])

# Heatmap for paper recommendations
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
sns.set()
ax = sns.heatmap(first_items.fillna(0), vmin=0, vmax=1, cmap = "YlGnBu", xticklabels=250, yticklabels=250)
ax.tick_params(labelsize=12)
# Though not much success after this, but we have many more features to work at

def paper_recommender(paper_ix, items_df):
    """Print a paper followed by the three papers most similar to it.

    Parameters
    ----------
    paper_ix : index label
        Label of the query paper; it must be a row/column label of
        ``items_df`` and a label of the global ``model_df`` index.
    items_df : pandas.DataFrame
        Square item-by-item similarity frame keyed by paper index labels.

    Returns
    -------
    None
        Output is printed only.
    """
    print('Based on the paper: \n Index = ', paper_ix)
    # items_df is keyed by index labels, so model_df has to be read by label
    # as well; model_df is filtered without an index reset, so positional
    # (.iloc) lookups would print a different paper.
    print(model_df.loc[paper_ix])
    scores = items_df.loc[paper_ix]
    # Remove the query paper explicitly instead of assuming it sorts first.
    scores = scores[scores.index != paper_ix]
    top_results = scores.sort_values(ascending=False).head(3)
    print('\nTop three results: ')
    for order, (i, score) in enumerate(top_results.items(), start=1):
        print(order, '. Paper index = ', i)
        print('Similarity Score: ', score)
        print(model_df.loc[i], '\n')

paper_recommender(2, first_items)

In [None]:
# Second approach: Take 2
# Fixed-width binning + dummy coding (part 1)
# Report the observed range and the quartiles of the publication year.
pub_years = model_df['year']
earliest_year, latest_year = pub_years.min(), pub_years.max()
print("Year Spread: ", earliest_year, " - ", latest_year)
print("Quantile Spread:\n", pub_years.quantile([0.25, 0.5, 0.75]))

# Histogram of the years, using one bin per year of the observed span.
fig, ax = plt.subplots()
pub_years.hist(bins=latest_year - earliest_year, ax=ax)
ax.set_xlabel('Year Count', fontsize=12)
ax.set_ylabel('Occurence', fontsize=12)
ax.tick_params(labelsize=12)
# Author's reading of the plot: the year is an excellent candidate for binning.

In [None]:
# Fixed-width binning + dummy coding (part 2)
# Bin the years into roughly 10-year-wide intervals (the original note reports
# a reduction of the year feature space from 156 to 19 on this data).
# max(1, ...) keeps pd.cut usable when the year span is under five years.
bins = max(1, int(round((model_df['year'].max() - model_df['year'].min())/10)))
temp_df = pd.DataFrame(index=model_df.index)
temp_df['yearBinned'] = pd.cut(model_df['year'].tolist(), bins, precision=0)
X_yrs = pd.get_dummies(temp_df['yearBinned'])
print(X_yrs.columns.categories)

# Plot the number of papers per year bin
fig, ax = plt.subplots()
X_yrs.sum().plot.bar(ax=ax)
ax.tick_params(labelsize=8)
ax.set_xlabel('Binned Years', fontsize=12)
ax.set_ylabel('Counts', fontsize=12)

# Now the fields-of-study features:
# take the DataFrame's underlying (dense) NumPy array.
X_fos = fos_features.values

# Compare the footprint of both representations.
# sys.getsizeof on an ndarray that does not own its buffer (as an array
# obtained from .values may not) counts only the array header, so the
# buffer size is read from .nbytes instead.
# NOTE(review): for an object-dtype array .nbytes covers the element pointers
# only, not the Python objects they refer to.
print('Our panda Series, in bytes: ', getsizeof(fos_features))
print('Our hashed numpy array, in bytes: ', X_fos.nbytes)

In [None]:
# Collaborative filtering stages 1+2: build item feature matrix, search for similar items
# Put the year-bin dummies to the right of the fields-of-study matrix.
second_features = np.append(X_fos, X_yrs, axis=1)
# Difference between the reported sizes of the first and second matrices.
print(
    "The power of feature engineering saves us, in bytes: ",
    getsizeof(first_features) - getsizeof(second_features),
)

from sklearn.metrics.pairwise import cosine_similarity

def piped_collab_filter(features_matrix, index, top_n):
    """Return the ``top_n`` rows of a feature matrix most similar to one row.

    Parameters
    ----------
    features_matrix : 2-D array-like
        One row per item, one column per feature.
    index : int
        Positional row number of the query item.
    top_n : int
        Number of neighbours to return.

    Returns
    -------
    list of (int, float)
        ``(row_position, cosine_similarity)`` pairs ordered from most to least
        similar; the query row itself is excluded.
    """
    # Rank on the similarity itself. Sorting ``1 - similarity`` (a distance)
    # in descending order would put the LEAST similar items first.
    item_similarities = cosine_similarity(features_matrix[index:index+1], features_matrix).flatten()
    related_indices = [i for i in item_similarities.argsort()[::-1] if i != index]
    return [(i, item_similarities[i]) for i in related_indices[:top_n]]

In [None]:
# Item-based collaborative filtering recommendations: Take 2
def _print_paper_details(paper):
    """Print title, fields of study, year, abstract and authors of one model_df row."""
    print('Title: ', paper['title'])
    print('FOS: ', paper['fos'])
    print('Year: ', paper['year'])
    print('Abstract: ', paper['abstract'])
    print('Authors: ', paper['authors'], '\n')


def paper_recommender(items_df, paper_ix, top_n):
    """Print a paper and its ``top_n`` nearest neighbours.

    Parameters
    ----------
    items_df : 2-D array-like
        Feature matrix whose rows are in the same order as the rows of the
        global ``model_df``; it is passed to ``piped_collab_filter``.
    paper_ix : index label
        Label of the query paper in ``model_df.index``.
    top_n : int
        Number of recommendations to print.

    Returns
    -------
    None
        Output is printed only. If ``paper_ix`` is not in ``model_df.index`` a
        hint listing some valid labels is printed instead.
    """
    # Guard clause: the hint belongs to the "unknown paper" case. It used to be
    # attached to the result counter and was printed after the last result.
    if paper_ix not in model_df.index:
        print('Whoops! Choose another paper. Try something from here: \n', model_df.index[100:200])
        return

    query_paper = model_df.loc[paper_ix]
    print('Based on the paper: ')
    print('Paper index = ', query_paper.name)
    _print_paper_details(query_paper)

    # The feature matrix is positional, so translate the label to a position.
    array_ix = model_df.index.get_loc(paper_ix)
    top_results = piped_collab_filter(items_df, array_ix, top_n)
    print('\nTop ', top_n, 'results: ')
    for order, (row_position, score) in enumerate(top_results, start=1):
        paper = model_df.iloc[row_position]
        print(order, '. Paper index = ', paper.name)
        print('Similarity score: ', score)
        _print_paper_details(paper)

paper_recommender(second_features, 2, 3)

In [None]:
# Academic Paper Recommender: Take 3
# We now also take the abstract (as tf-idf) and the authors into account, adding features to obtain a better similarity measure
# Stopwords + tf-idf

# Replace every missing value with the string 'None' before handing the text to sklearn.
filled_df = model_df.fillna('None')

from sklearn.feature_extraction.text import TfidfVectorizer
# tf-idf with sublinear term frequency; English stop words and terms that
# occur in more than half of the abstracts are ignored.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
X_abstract = vectorizer.fit_transform(filled_df['abstract'])
# Densify the tf-idf matrix and place it to the right of the earlier features.
third_features = np.append(second_features, X_abstract.toarray(), axis=1)

# For authors
# One-hot encoding using scikit-learn's DictVectorizer: build one
# {feature_name: 1} dict per paper, in row order.
authors_list = []

# A Series has no itertuples() (that is a DataFrame method), so iterate over
# the values of the column directly.
for paper_authors in filled_df['authors']:
    if type(paper_authors) is list and paper_authors:
        # Every value of the first author's record becomes a feature.
        # The value is 1 (presence), not the row index: DictVectorizer uses
        # numeric values as feature magnitudes.
        # NOTE(review): assumes each list element is a dict — confirm schema.
        y = dict.fromkeys(paper_authors[0].values(), 1)
    else:
        # Missing authors (filled with the string 'None'), empty lists and any
        # other value share a single placeholder feature.
        y = {'None': 1}
    authors_list.append(y)
print(authors_list[0:5])

from sklearn.feature_extraction import DictVectorizer
# Turn the per-paper author dicts into a dense matrix, one column per key.
D = authors_list
v = DictVectorizer(sparse=False)
X_authors = v.fit_transform(D)
# Author columns go to the right of the fos / year / abstract features.
fourth_features = np.append(third_features, X_authors, axis=1)

# Run the recommender on the widest feature matrix.
paper_recommender(fourth_features, 2, 3)
# Author's note: best result so far; more possibilities remain after this.