In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
from nltk.corpus import stopwords
import glob
from datetime import datetime
import statistics 

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict


# Cleaning Data and concatenate three datasets

In [139]:
data = []
for filename in glob.glob('*.csv'):
    data.append(filename)
data

['Parsed Magazine.csv', 'wiki_formal_total_clean.csv', 'COPA_Blogs.csv']

In [140]:
wiki = pd.read_csv(data[1])
# Only take Date, Title, Content and Author into consideration since magazine does not have likes
wiki = wiki[['Date', 'Title', 'Content', 'Author']]
wiki['Date'] = [d[:-3] for d in wiki['Date']]
wiki['Resource'] = 'Wiki'
wiki['Author'] = [a.strip() for a in wiki['Author']]
wiki.head()

Unnamed: 0,Date,Title,Content,Author,Resource
0,2017-05,Are there any service centers that routinely w...,Title says it all.It is often hard for me to g...,ctrease,Wiki
1,2018-09,Door Decals,Looking for right side door stripe decals.As i...,Mark Craig,Wiki
2,2017-07,Nose Wheel Pant and Strut Beating at Oshkosh 2017,"Yesterday, I made my first trek over to Oshkos...",Stuart Simek,Wiki
3,2016-08,Oil Door Support for Summer Time Cooling,"I am based in Colorado Springs, CO and often s...",Michael Samp,Wiki
4,2016-03,Prop Strike Advice Needed,Had a prop strike in my SR-20 this past weeken...,John Shaeffer,Wiki


In [141]:
magazine = pd.read_csv(data[0])
magazine_date_list = list(magazine['Magazine'].unique())
# Get the month of magazine from its name
magazine_date_dict = {magazine_date_list[0]: '2015-01', magazine_date_list[1]: '2019-01', magazine_date_list[2]: '2015-07', magazine_date_list[3]: '2019-04', magazine_date_list[4]: '2019-06', magazine_date_list[5]: '2018-01', magazine_date_list[6]: '2017-09', magazine_date_list[7]: '2016-09', magazine_date_list[8]: '2017-11', magazine_date_list[9]: '2015-05', magazine_date_list[10]: '2015-09', magazine_date_list[11]: '2006-09', magazine_date_list[12]: '2018-06', magazine_date_list[13]: '2019-07', magazine_date_list[14]: '2017-04', magazine_date_list[15]: '2016-04', magazine_date_list[16]: '2018-11', magazine_date_list[17]: '2015-09', magazine_date_list[18]: '2019-03', magazine_date_list[19]: '2006-11', magazine_date_list[20]: '2016-01', magazine_date_list[21]: '2019-05', magazine_date_list[22]: '2016-03', magazine_date_list[23]: '2016-05', magazine_date_list[24]: '2006-07', magazine_date_list[25]: '2017-06', magazine_date_list[26]: '2018-07', magazine_date_list[27]: '2018-09', magazine_date_list[28]: '2017-01', magazine_date_list[29]: '2012-11'}
magazine['Magazine'] = magazine['Magazine'].map(magazine_date_dict)
magazine.columns = ['Date', 'Title', 'Content', 'Author']
magazine['Resource'] = 'Magazine'
magazine.head()

Unnamed: 0,Date,Title,Content,Author,Resource
0,2015-01,President's Column,JANUARY FEBRUARY 20154CIRRUS PILOTAs this is b...,,Magazine
1,2015-01,COPA News,JANUARY FEBRUARY 20156CIRRUS PILOTWith this is...,,Magazine
2,2015-01,Regional News,JANUARY FEBRUARY 201512CIRRUS PILOTby GIL WILL...,GIL WILLIAMSON,Magazine
3,2015-01,Cirrus Perspective,JANUARY FEBRUARY 201518CIRRUS PILOTIts hard to...,,Magazine
4,2015-01,Member Spotlight,JANUARY FEBRUARY 201522CIRRUS PILOTCirrus Pilo...,KIM BLONIGEN,Magazine


In [142]:
blog = pd.read_csv(data[2], encoding = 'unicode_escape')
blog = blog[blog.columns[1:]]
blog = blog[['Date', 'Title', 'Body', 'Author']]
blog.columns = ['Date', 'Title', 'Content', 'Author']
blog['Date'] = [d.split(' ')[0].split('/')[2] + '-' + d.split(' ')[0].split('/')[0] for d in blog['Date']]
blog['Resource'] = 'Blog'
blog.head()

Unnamed: 0,Date,Title,Content,Author,Resource
0,2018-12,"CAPS Deployment Anomaly at Colorado Springs, C...","by RICK BEACH, COPA?? SAFETY CHAIR\nThis unusu...",Rick Beach,Blog
1,2016-1,CAPS: How Low Can You Go? Or How High Do You N...,Every Cirrus aircraft built has a parachute sy...,Rick Beach,Blog
2,2017-9,"""Reach the Unreachable"" safety column in COPA ...",(This article appears as the safety column in ...,Rick Beach,Blog
3,2018-10,??That Sounds Expensive?? -- It Happened to ...,This column will appear in the Nov/Dec 2018 is...,Rick Beach,Blog
4,2017-6,Guest Editor column to Special Safety Issue of...,A Note from the Guest Editor\nThis June 2017 i...,Rick Beach,Blog


In [143]:
pdList = [magazine, wiki, blog]
# Combine three datasets together
df = pd.concat(pdList)
df.reset_index(drop = True, inplace = True)
df.head()

Unnamed: 0,Date,Title,Content,Author,Resource
0,2015-01,President's Column,JANUARY FEBRUARY 20154CIRRUS PILOTAs this is b...,,Magazine
1,2015-01,COPA News,JANUARY FEBRUARY 20156CIRRUS PILOTWith this is...,,Magazine
2,2015-01,Regional News,JANUARY FEBRUARY 201512CIRRUS PILOTby GIL WILL...,GIL WILLIAMSON,Magazine
3,2015-01,Cirrus Perspective,JANUARY FEBRUARY 201518CIRRUS PILOTIts hard to...,,Magazine
4,2015-01,Member Spotlight,JANUARY FEBRUARY 201522CIRRUS PILOTCirrus Pilo...,KIM BLONIGEN,Magazine


# Find Metrics

In [144]:
# Create a model for topic classification based on the dataset we have 
# (It may cause some problems because the model will be applied to the dataset later to give each content a topic.)
# (The problem can be solved when we get the entire dataset since we can take part of the contents as corpus.)

# Get some stop words
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Convert contents to list
data = df.Content.values.tolist()

In [145]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [146]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [147]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [148]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [150]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           passes=10)

# pprint(lda_model.print_topics(num_words = 5))
# Get 20 related topics
doc_lda = lda_model[corpus]

In [165]:
# Put contents into model and assign each content to a topic
cleanlist = data
texts1 = [[word for word in doc.lower().split() if word not in stop_words] for doc in cleanlist]
corpus1 = [id2word.doc2bow(text) for text in texts1]
resultlist = lda_model.get_document_topics(corpus1)

In [154]:
# Build a dictionary containing the index of content and its topic classification
topic_dict = {}
for i, result in enumerate(resultlist):
    temp = 0
    topic = 0
    for j in range(len(result)):
        if result[j][1] > temp:
            temp = result[j][1]
            topic = j
    topic_dict[i] = topic

df['Topic Category'] = [v for k, v in topic_dict.items()]

In [156]:
# Calculate the number of articles written by every author
author_list = list(df['Author'].dropna().unique())
author_dict = {}
for a in author_list:
    author_dict[a] = len(df[df['Author'] == a])

In [203]:
# Assign score to the resource, recency and author of an article
resource_rank = {'Magazine': 3, 'Wiki': 2, 'Blog': 1}
df['Resource Score'] = df['Resource'].map(resource_rank)
df['Recency Score'] = [1 / round(abs(datetime.strptime('2019-12', '%Y-%m') - datetime.strptime(d, '%Y-%m')).days / 31) for d in df['Date']]
df['Author Score'] = df['Author'].map(author_dict)
df['Author Score'] = df['Author Score'].fillna(0)

In [204]:
# Normalize three scores
df['Resource Score'] = (df['Resource Score'] - min(df['Resource Score'])) / (max(df['Resource Score']) - min(df['Resource Score']))
df['Recency Score'] = (df['Recency Score'] - min(df['Recency Score'])) / (max(df['Recency Score']) - min(df['Recency Score']))
df['Author Score'] = (df['Author Score'] - min(df['Author Score'])) / (max(df['Author Score']) - min(df['Author Score']))

In [209]:
df.head()

Unnamed: 0,Date,Title,Content,Author,Resource,Topic Category,Resource Score,Recency Score,Author Score
0,2015-01,President's Column,JANUARY FEBRUARY 20154CIRRUS PILOTAs this is b...,,Magazine,1,1.0,0.044783,0.0
1,2015-01,COPA News,JANUARY FEBRUARY 20156CIRRUS PILOTWith this is...,,Magazine,13,1.0,0.044783,0.0
2,2015-01,Regional News,JANUARY FEBRUARY 201512CIRRUS PILOTby GIL WILL...,GIL WILLIAMSON,Magazine,2,1.0,0.044783,0.217391
3,2015-01,Cirrus Perspective,JANUARY FEBRUARY 201518CIRRUS PILOTIts hard to...,,Magazine,1,1.0,0.044783,0.0
4,2015-01,Member Spotlight,JANUARY FEBRUARY 201522CIRRUS PILOTCirrus Pilo...,KIM BLONIGEN,Magazine,1,1.0,0.044783,0.217391


# Input new data and return searching results

In [206]:
# Search for a term or a sentence
text = ['The engine does not work']
cleanlist = text
texts1 = [[word for word in doc.lower().split() if word not in stop_words] for doc in cleanlist]
corpus1 = [id2word.doc2bow(t) for t in texts1]
result = lda_model.get_document_topics(corpus1)

temp = 0
topic = 0
for i in range(len(result[0])):
    if result[0][i][1] > temp:
        temp = result[0][i][1]
        topic = i

In [207]:
# Filter the result by query's corresponding topic category
# (Or it can be used as a weighted feature as well.)
df_same_topic = df[df['Topic Category'] == topic]
df_same_topic['Total Score'] = df_same_topic['Resource Score'] * 1.5 + df_same_topic['Recency Score'] * 1.5 + df_same_topic['Author Score'] * 0.5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [208]:
# Display the searching result
df_same_topic.sort_values(by = 'Total Score', ascending = False)

Unnamed: 0,Date,Title,Content,Author,Resource,Topic Category,Resource Score,Recency Score,Author Score,Total Score
65,2019-06,Training,JULY AUGUST 201954 COPA PilotSummer is here an...,TRIP TAYLOR,Magazine,1,1.0,0.658009,0.217391,2.595709
252,2019-03,Regional News,MARCH 201912 COPA Pilotwhere artists paint dra...,GIL WILLIAMSON,Magazine,1,1.0,0.430014,0.217391,2.253717
254,2019-03,Member Spotlight,MARCH 201922 COPA PilotI know many of you have...,KIM BLONIGEN,Magazine,1,1.0,0.430014,0.217391,2.253717
20,2019-01,Safety Talk,JANUARY FEBRUARY 201940 COPA PilotThis unusual...,RICK BEACH,Magazine,1,1.0,0.347107,0.434783,2.238052
228,2018-11,Safety Talk,fl flThis column reects on an accident with my...,RICK BEACH,Magazine,1,1.0,0.289710,0.434783,2.151957
15,2019-01,COPA News,JANUARY FEBRUARY 20196 COPA PilotAttending soc...,,Magazine,1,1.0,0.347107,0.000000,2.020661
17,2019-01,Cirrus Perspective,JANUARY FEBRUARY 201918 COPA Pilotaircraft. I ...,,Magazine,1,1.0,0.347107,0.000000,2.020661
225,2018-11,Cirrus Perspective,Green Valley Ranch Resort and joined the fun. ...,,Magazine,1,1.0,0.289710,0.000000,1.934565
222,2018-11,President's Column,flMigration 16 just wrapped up as I write thi...,,Magazine,1,1.0,0.289710,0.000000,1.934565
362,2018-07,Regional News,JULY AUGUST 201812 COPA Pilotto ensure a safe ...,GIL WILLIAMSON,Magazine,1,1.0,0.215432,0.217391,1.931843
