# POC Model

__Import packages__

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import glob
from datetime import datetime
import statistics
import math

### Data Sources Integration

In [2]:
# Collect the names of all CSV files in the current working directory.
# NOTE(review): glob.glob returns matches in arbitrary order, while the cells
# below select files by position (data[0], data[1], data[2]) — check the list
# displayed here before running them.
data = glob.glob('*.csv')
data

['Parsed Magazine.csv', 'wiki_formal_total_clean.csv', 'COPA_Blogs.csv']

In [3]:
# Load the wiki export and reduce it to the columns used by the combined table.
# NOTE(review): data[1] depends on the order glob returned the files in —
# confirm that it is the wiki CSV.
wiki = pd.read_csv(data[1])
# .copy() turns the column selection into an independent frame, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
wiki = wiki[['Date', 'Title', 'Content', 'Author', 'Like']].copy()
# Vectorised strip: missing authors (NaN) stay NaN instead of failing with
# AttributeError the way a per-element a.strip() would.
wiki['Author'] = wiki['Author'].str.strip()
# No 'Comment' column is taken from the wiki file; fill it with 0 so the frame
# has the same columns as the other sources.
wiki['Comment'] = 0
wiki['Resource'] = 'Wiki'

In [4]:
# Load the parsed magazine articles.
# NOTE(review): data[0] depends on the order glob returned the files in —
# confirm that it is the magazine CSV.
magazine = pd.read_csv(data[0])
magazine_date_list = list(magazine['Magazine'].unique())

# Get the month of each magazine from its name.
# Since the COPA magazines are released monthly, an issue only carries a year
# and a month. In this case we just assume every magazine was released on the
# first day of that month.
# The dates below are matched to issues purely by position in
# magazine_date_list, i.e. by the order in which issue names first appear in
# the CSV.
# NOTE(review): positions 10 and 17 both map to '2015-09-01' — confirm that two
# distinct issues really share this month.
magazine_release_dates = [
    '2015-01-01', '2019-01-01', '2015-07-01', '2019-04-01', '2019-06-01',
    '2018-01-01', '2017-09-01', '2016-09-01', '2017-11-01', '2015-05-01',
    '2015-09-01', '2006-09-01', '2018-06-01', '2019-07-01', '2017-04-01',
    '2016-04-01', '2018-11-01', '2015-09-01', '2019-03-01', '2006-11-01',
    '2016-01-01', '2019-05-01', '2016-03-01', '2016-05-01', '2006-07-01',
    '2017-06-01', '2018-07-01', '2018-09-01', '2017-01-01', '2012-11-01',
]
# Indexing magazine_date_list (rather than zipping) keeps the IndexError when
# the file holds fewer issues than there are dates.
magazine_date_dict = {
    magazine_date_list[position]: release_date
    for position, release_date in enumerate(magazine_release_dates)
}

# Replace each issue name by its release date; names without an entry in the
# dict become NaN.
magazine['Magazine'] = magazine['Magazine'].map(magazine_date_dict)
# Positional rename: this requires the file to have exactly four columns.
magazine.columns = ['Date', 'Title', 'Content', 'Author']
# Like/Comment are filled with 0 so the frame has the same columns as the
# other sources.
for count_column in ('Like', 'Comment'):
    magazine[count_column] = 0
magazine['Resource'] = 'Magazine'

In [5]:
# Load the blog export.
# NOTE(review): data[2] depends on the order glob returned the files in —
# confirm that it is the blog CSV.
# NOTE(review): 'unicode_escape' also interprets backslash sequences in the
# text; confirm this is the intended encoding for the file.
blog = pd.read_csv(data[2], encoding='unicode_escape')
# .copy() makes the selection an independent frame so the assignments below do
# not raise pandas' SettingWithCopyWarning; 'Body' is renamed to 'Content' to
# match the other sources.
blog = blog[['Date', 'Title', 'Body', 'Author', 'Like', 'Comment']].copy()
blog = blog.rename(columns={'Body': 'Content'})
# Reduce each timestamp to the first day of its month as 'YYYY-MM-01'.
# The text before the first space is split on '/'; the first field is used as
# the month and the third as the year (presumably 'M/D/YYYY ...' — TODO
# confirm against the CSV). The month is zero-padded so the strings have the
# same 'YYYY-MM-DD' layout as the other sources.
blog_date_parts = blog['Date'].str.split(' ').str[0].str.split('/')
blog['Date'] = (
    blog_date_parts.str[2] + '-' + blog_date_parts.str[0].str.zfill(2) + '-01'
)
blog['Resource'] = 'Blog'

In [6]:
# Stack the three sources into one table with a fresh 0..n-1 row index.
pdList = [magazine, wiki, blog]
df = pd.concat(pdList, ignore_index=True)

In [7]:
# Score assigned to each content source: Wiki = 1, Blog = 2, Magazine = 3.
# NOTE(review): the previous comment here said Blog is the most important
# source and should have 3, but the table gives 3 to Magazine — confirm which
# ranking is intended.
raw_data = {
    'Resource': ['Wiki', 'Blog', 'Magazine'],
    'SourceScore': [1, 2, 3],
}
Source_tb = pd.DataFrame(raw_data, columns=['Resource', 'SourceScore'])
# Attach the source score to every row of the main table (inner join on
# 'Resource', the pandas default).
df = df.merge(Source_tb, left_on='Resource', right_on='Resource')

# Generate Attributes

__Generate Recency Score__

In [8]:
def Calculate_RecencyScore(date, today=None):
    '''
    Score how recent a post is; newer posts get a higher score.

    Recency Rate = log10(1 + 1 / (days between the post date and today + 1))

    date:  post date as a 'YYYY-MM-DD' string.
    today: optional datetime.date to measure against. Defaults to the current
           local date; pass a fixed date to make the score reproducible.

    Returns a float in (0, log10(2)]; a post dated today scores log10(2).
    Raises ValueError when `date` does not match '%Y-%m-%d'.
    '''
    if today is None:
        today = datetime.now().date()
    post_date = datetime.strptime(date, '%Y-%m-%d').date()
    # Clamp the age at 0 days: a post dated in the future would otherwise give
    # a zero denominator (ZeroDivisionError) or a non-positive log argument.
    age_in_days = max((today - post_date).days, 0)
    recency_rate = math.log10(1 + 1 / (age_in_days + 1))
    return recency_rate
    

# Score every row from its 'Date' string.
df['RecencyRate'] = df['Date'].map(Calculate_RecencyScore)

__Generate Author Score__

In [9]:
def Calculate_AuthorScore(author, frame=None):
    '''
    AuthorScore = sum of SourceScore over all posts written by `author`,
    i.e. for each content source, number of posts * that source's weight.

    author: value to match exactly against the 'Author' column.
    frame:  optional DataFrame with 'Author' and 'SourceScore' columns.
            Defaults to the notebook-level `df`.

    Returns the summed score; 0 when no row matches (this includes a missing
    author, since NaN never compares equal to anything).
    '''
    if frame is None:
        frame = df
    written_by_author = frame['Author'] == author
    author_score = frame.loc[written_by_author, 'SourceScore'].sum()
    return author_score

author_list = list(df['Author'].unique())
# Series.map takes the mapping function as its only required argument; its
# second positional parameter is `na_action` ('ignore' or None). author_list
# used to be passed there, which is not a valid value and is rejected with a
# ValueError by newer pandas versions.
df['AuthorScore'] = df['Author'].map(Calculate_AuthorScore)

__Normalization__

In [10]:
def Normalization(col, frame=None):
    '''
    Standardise a column to z-scores: (value - mean) / standard deviation.

    col:   name of the column to normalise.
    frame: optional DataFrame holding `col`. Defaults to the notebook-level
           `df`.

    Returns a new Series; the frame itself is not modified. The standard
    deviation is pandas' default sample estimate (ddof=1), so a constant or
    single-row column yields NaN.
    '''
    if frame is None:
        frame = df
    column = frame[col]
    normalized_col = (column - column.mean()) / column.std()
    return normalized_col

# Replace both score columns by their normalised values, one after the other.
for score_column in ('RecencyRate', 'AuthorScore'):
    df[score_column] = Normalization(score_column)

In [11]:
# Preview the first rows of the combined table with the generated score columns.
df.head()

Unnamed: 0,Date,Title,Content,Author,Like,Comment,Resource,SourceScore,RecencyRate,AuthorScore
0,2015-01-01,President's Column,JANUARY FEBRUARY 20154CIRRUS PILOTAs this is b...,,0,0,Magazine,3,-0.465959,-1.209951
1,2015-01-01,COPA News,JANUARY FEBRUARY 20156CIRRUS PILOTWith this is...,,0,0,Magazine,3,-0.465959,-1.209951
2,2015-01-01,Regional News,JANUARY FEBRUARY 201512CIRRUS PILOTby GIL WILL...,GIL WILLIAMSON,0,0,Magazine,3,-0.465959,0.21105
3,2015-01-01,Cirrus Perspective,JANUARY FEBRUARY 201518CIRRUS PILOTIts hard to...,,0,0,Magazine,3,-0.465959,-1.209951
4,2015-01-01,Member Spotlight,JANUARY FEBRUARY 201522CIRRUS PILOTCirrus Pilo...,KIM BLONIGEN,0,0,Magazine,3,-0.465959,0.21105
