## Importing Packages

In [None]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error

import nltk
from nltk.stem import WordNetLemmatizer 

import gensim
from gensim.parsing.preprocessing import STOPWORDS

# nltk.download()
# from google.colab import drive
# drive.mount('/content/drive')

## Getting The Data

In [None]:
transcripts = pd.read_csv("/content/drive/MyDrive/Big Data/mediumdata.csv")
transcripts.head()

Unnamed: 0,date,title,subtitle,claps,responses,author_url,story_url,reading_time (mins),number_sections,section_titles,number_paragraphs,paragraphs
0,06/01/2020,Interferon Responses Could Explain Susceptibil...,Impaired or delayed antiviral…,148,1,https://medium.com/@ngough-bioserendipity,https://medium.com/swlh/interferon-responses-c...,10,1,['Interferon Responses Could Explain Susceptib...,52,"['What are interferons?', 'Interferons are pro..."
1,06/01/2020,My Answers to the FAANG Product Management and...,Acing the Product Manager and Technical Progra...,520,5,https://medium.com/@drm,https://medium.com/swlh/my-answers-to-the-faan...,7,4,['My Answers to the FAANG Product Management a...,4,"['From time to time, I’ll run through a series..."
2,06/01/2020,Dynamically Import CSS,Lazy load CSS with the help of dynamic import(...,88,1,https://medium.com/@daviddalbusco,https://medium.com/swlh/dynamically-import-css...,6,6,"['Dynamically Import CSS', 'Introduction', 'Dy...",28,['We recently introduced several theming optio...
3,06/01/2020,6 ways to significantly speed up Pandas with a...,The second part of the…,416,0,https://medium.com/@30mb1,https://medium.com/swlh/6-ways-to-significantl...,8,5,['6 ways to significantly speed up Pandas with...,28,"['In previous article, we looked at some simpl..."
4,06/01/2020,Using Hydra to Spray User Passwords,How attackers bypass account lockout when brut...,163,0,https://medium.com/@vickieli,https://medium.com/swlh/using-hydra-to-spray-u...,4,4,"['Using Hydra to Spray User Passwords', 'What ...",27,['Have you heard of a password brute-force att...


In [None]:
transcripts = transcripts[['title', 'subtitle', 'paragraphs', 'reading_time (mins)', 'story_url']]
print(transcripts.shape)
transcripts.head()

(8322, 5)


Unnamed: 0,title,subtitle,paragraphs,reading_time (mins),story_url
0,Interferon Responses Could Explain Susceptibil...,Impaired or delayed antiviral…,"['What are interferons?', 'Interferons are pro...",10,https://medium.com/swlh/interferon-responses-c...
1,My Answers to the FAANG Product Management and...,Acing the Product Manager and Technical Progra...,"['From time to time, I’ll run through a series...",7,https://medium.com/swlh/my-answers-to-the-faan...
2,Dynamically Import CSS,Lazy load CSS with the help of dynamic import(...,['We recently introduced several theming optio...,6,https://medium.com/swlh/dynamically-import-css...
3,6 ways to significantly speed up Pandas with a...,The second part of the…,"['In previous article, we looked at some simpl...",8,https://medium.com/swlh/6-ways-to-significantl...
4,Using Hydra to Spray User Passwords,How attackers bypass account lockout when brut...,['Have you heard of a password brute-force att...,4,https://medium.com/swlh/using-hydra-to-spray-u...


### Counting Most Frequent Words

In [None]:
from collections import Counter
corpus = transcripts['paragraphs'].values.tolist()
my_counter = Counter()
for word in corpus:
    # my_counter.update(word)
    allWords = nltk.tokenize.word_tokenize(word)
    allWordDist = nltk.FreqDist(w.lower() for w in allWords)

    stopwords = nltk.corpus.stopwords.words('english')
    punctuations = string.punctuation
    allWordExceptStopDist = nltk.FreqDist(w.lower() for w in allWords if w not in stopwords and w not in punctuations)

# type(my_counter)
mostCommon= allWordExceptStopDist.most_common(50)
# sorted(my_counter.items(), key=lambda pair: pair[1], reverse=True)
print(mostCommon)

[('problem', 11), ('mentor', 10), ("'the", 9), ('help', 8), ('i', 7), ('one', 7), ('“', 6), ('”', 6), ('learned', 6), ('learn', 6), ('others', 6), ('you', 6), ('rule', 6), ('best', 6), ('mentors', 5), ('people', 5), ('it', 5), ('learning', 5), ('engineering', 5), ('answer', 5), ('the', 4), ("'this", 4), ('engineers', 4), ('never', 4), ('engineer', 4), ('find', 4), ('we', 4), ('work', 4), ('solve', 4), ('also', 4), ('knowledge', 4), ('may', 4), ('good', 4), ('experience', 4), ('something', 4), ('answers', 4), ('’', 4), ('questions', 4), ('person', 4), ('solution', 4), ('instead', 4), ('new', 4), ('speech', 3), ('us', 3), ('matter', 3), ('—', 3), ('listen', 3), ('need', 3), ('single', 3), ('problems', 3)]


## Cleaning The Data

In [None]:
def remove_emojis(text):
    emojis = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return emojis.sub(r'', text)

def clean_text(text):
    ''' Eliminates links, non alphanumerics, and punctuation. Returns lower case text. '''
    # Remove links
    text = re.sub('(?:(?:https?|ftp):\/\/)?[\w/\-?=%.]+\.[\w/\-?=%.]+','', text)

    # Remove non-alphanumerics
    text = re.sub('\w*\d\w*', ' ', text)

    # Remove punctuation and lowercase
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text.lower())

    # Remove newline characters
    text = text.replace('\n', ' ')
    
    return text

def remove_stopwords(text):
    clean_text = []
    for word in text.split(' '):
        if word not in stop_list and (len(word) > 2):
            clean_text.append(word)
    return ' '.join(clean_text)


lemmer = WordNetLemmatizer()
def lem_text(text):
    word_list = []
    for word in text.split(' '):
        word_list.append(lemmer.lemmatize(word))
    return ' '.join(word_list)

In [None]:
# Removing Emojis Of Text
transcripts.paragraphs = transcripts.paragraphs.apply(remove_emojis)
print('Removed Emojis')

# Basic Cleaning Of Text
transcripts.paragraphs = transcripts.paragraphs.apply(clean_text)
print('Basic Cleaning Of Text Done')

# Removing Stopwords
stop_list = STOPWORDS.union(set(['data', 'ai', 'coding', 'developer', 'function', 'object', 'array', 'science', 'programming', 'development', 
                                 'unit', 'based', 'want', 'know', 'learn', "don't", 'things', 'lot', "let's", 'model', 'learned', 'learn', 'learning', 
                                 'work', 'solve', 'also', 'knowledge', 'solution', 'question', 'answer', 'algorithm', 'git', 'file', 'accuracy', 
                                 'people', 'time', 'network', 'layer', 'application', 'repository', 'create', 'package', 'client', 'prediction',
                                 'test', 'classification', 'render', 'dataset', 'training', 'regression', 'directory', 'folder', 'database', 'user']))
transcripts.paragraphs = transcripts.paragraphs.apply(remove_stopwords)
print('Removed Stopwords')

# Lemmatizing The Words
transcripts.paragraphs = transcripts.paragraphs.apply(lem_text)
print('Lemmatized All The Words')

transcripts.head()

Removed Emojis
Basic Cleaning Of Text Done
Removed Stopwords
Lemmatized All The Words


Unnamed: 0,title,subtitle,paragraphs,reading_time (mins),story_url
0,Interferon Responses Could Explain Susceptibil...,Impaired or delayed antiviral…,interferon interferon protein cell response in...,10,https://medium.com/swlh/interferon-responses-c...
1,My Answers to the FAANG Product Management and...,Acing the Product Manager and Technical Progra...,i’ll run series faang interview gauge market v...,7,https://medium.com/swlh/my-answers-to-the-faan...
2,Dynamically Import CSS,Lazy load CSS with the help of dynamic import(...,recently introduced theming option showcase co...,6,https://medium.com/swlh/dynamically-import-css...
3,6 ways to significantly speed up Pandas with a...,The second part of the…,previous article looked simple way speed panda...,8,https://medium.com/swlh/6-ways-to-significantl...
4,Using Hydra to Spray User Passwords,How attackers bypass account lockout when brut...,heard password brute force attack brute force ...,4,https://medium.com/swlh/using-hydra-to-spray-u...


## Modelling

### Making TFIDF Vector For The Content of Article

In [None]:
vectorizer = TfidfVectorizer(stop_words = stop_list, ngram_range = (1, 1))
tfcorpus = vectorizer.fit_transform(transcripts.paragraphs)
tfcorpus.toarray().shape

  'stop_words.' % sorted(inconsistent))


(8322, 72438)

### Function To Display Topics

In [None]:
def display_topics(model, feature_names, no_top_words, no_top_topics, topic_names = None):
    count = 0
    for ix, topic in enumerate(model.components_):
        if count == no_top_topics:
            break
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", (ix + 1))
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i] for i in topic.argsort()[: -no_top_words - 1 : -1]]))

### Making a Model to See Favourable Topics

#### TruncatedSVD

In [None]:
svd = TruncatedSVD(10)
docs_svd = svd.fit_transform(tfcorpus)

display_topics(svd, vectorizer.get_feature_names(), 15, 10)

#### NMF

In [None]:
nmf = NMF(10)
docs_nmf = nmf.fit_transform(tfcorpus)

display_topics(nmf, vectorizer.get_feature_names(), 15, 10)


Topic  1
company, business, employee, job, startup, year, money, life, home, day, office, working, investor, black, founder

Topic  2
code, class, method, javascript, language, type, string, variable, value, python, return, element, use, list, let

Topic  3
value, variable, column, feature, machine, linear, number, input, output, set, probability, matrix, plot, node, tree

Topic  4
writing, writer, write, book, word, reader, day, story, idea, ve, read, life, like, feel, sentence

Topic  5
component, react, state, redux, hook, prop, app, reducer, action, angular, dom, button, element, javascript, html

Topic  6
social, brand, medium, content, facebook, platform, marketing, instagram, tiktok, video, audience, google, post, apple, app

Topic  7
product, customer, business, marketing, market, brand, company, service, design, sale, strategy, need, idea, feature, startup

Topic  8
team, project, manager, software, leader, design, problem, goal, meeting, engineer, member, need, task, organiz

Topics 
- Topic 1 - Business and Startup
- Topic 2 - Fundamental Coding
- Topic 3 - Data Science
- Topic 4 - Literature
- Topic 5 - Web Development
- Topic 6 - Social Media Branding
- Topic 7 - Marketing and Sales
- Topic 8 - Team Dynamics
- Topic 9 - Cloud Development
- Topic 10 - Machine Learning and Deep Learning

In [None]:
# Create topic sum for each article. Later remove all articles with sum 0.
sumOfTopics = pd.DataFrame(np.sum(docs_nmf, axis = 1))

# Turn our docs_nmf array into a data frame
sumEachTopic = pd.DataFrame(data = docs_nmf)
sumEachTopic.head()

# Making Final DataFrame with Topic Scores
final = pd.concat([transcripts, sumEachTopic, sumOfTopics], axis = 1)
final.columns = ['Titles', 'Subtitles', 'Text', 'Reading Time', 'Story URL', 
                 'Business and Startups', 'Fundamental Coding', 'Data Science & Statistics', 'Literature', 'Web Development', 
                 'Social Media & Branding', 'Marketing & Sales', 'Team Dynamics', 'Cloud Development', 'Machine Learning & Deep Learning',
                 'Total Topic Sum']

# Remove articles with topic sum = 0, then drop sum column
final = final[final['Total Topic Sum'] != 0]
final.drop(columns = 'Total Topic Sum', inplace = True)

final.head()

Unnamed: 0,Titles,Subtitles,Text,Reading Time,Story URL,Business and Startups,Fundamental Coding,Data Science & Statistics,Literature,Web Development,Social Media & Branding,Marketing & Sales,Team Dynamics,Cloud Development,Machine Learning & Deep Learning
0,Interferon Responses Could Explain Susceptibil...,Impaired or delayed antiviral…,interferon interferon protein cell response in...,10,https://medium.com/swlh/interferon-responses-c...,0.004894,0.007465,0.01036,0.0,0.0,0.003635,0.0,0.0,0.000435,0.0
1,My Answers to the FAANG Product Management and...,Acing the Product Manager and Technical Progra...,i’ll run series faang interview gauge market v...,7,https://medium.com/swlh/my-answers-to-the-faan...,0.012168,0.006582,0.002763,0.0,0.0,0.018922,0.019762,0.026765,0.0,0.0
2,Dynamically Import CSS,Lazy load CSS with the help of dynamic import(...,recently introduced theming option showcase co...,6,https://medium.com/swlh/dynamically-import-css...,0.0,0.033187,0.0,0.00585,0.093204,0.00032,0.0,0.0051,0.0192,0.006342
3,6 ways to significantly speed up Pandas with a...,The second part of the…,previous article looked simple way speed panda...,8,https://medium.com/swlh/6-ways-to-significantl...,0.002413,0.014669,0.038433,0.002447,0.0,0.0,0.0,0.005677,0.011292,0.000258
4,Using Hydra to Spray User Passwords,How attackers bypass account lockout when brut...,heard password brute force attack brute force ...,4,https://medium.com/swlh/using-hydra-to-spray-u...,0.001123,0.002613,0.002559,0.0,0.0,0.010631,0.0,0.0,0.033858,0.0


In [None]:
# final.to_csv('/content/drive/MyDrive/Big Data/final_data.csv')

## Recommendation System

In [None]:
final = pd.read_csv('/content/drive/MyDrive/Big Data/final_data.csv')
final.head()

Unnamed: 0.1,Unnamed: 0,Titles,Subtitles,Text,Reading Time,Story URL,Business and Startups,Fundamental Coding,Data Science & Statistics,Literature,Web Development,Social Media & Branding,Marketing & Sales,Team Dynamics,Cloud Development,Machine Learning & Deep Learning
0,0,Interferon Responses Could Explain Susceptibil...,Impaired or delayed antiviral…,interferon interferon protein cell response in...,10,https://medium.com/swlh/interferon-responses-c...,0.004894,0.007465,0.01036,0.0,0.0,0.003635,0.0,0.0,0.000435,0.0
1,1,My Answers to the FAANG Product Management and...,Acing the Product Manager and Technical Progra...,i’ll run series faang interview gauge market v...,7,https://medium.com/swlh/my-answers-to-the-faan...,0.012168,0.006582,0.002763,0.0,0.0,0.018922,0.019762,0.026765,0.0,0.0
2,2,Dynamically Import CSS,Lazy load CSS with the help of dynamic import(...,recently introduced theming option showcase co...,6,https://medium.com/swlh/dynamically-import-css...,0.0,0.033187,0.0,0.00585,0.093204,0.00032,0.0,0.0051,0.0192,0.006342
3,3,6 ways to significantly speed up Pandas with a...,The second part of the…,previous article looked simple way speed panda...,8,https://medium.com/swlh/6-ways-to-significantl...,0.002413,0.014669,0.038433,0.002447,0.0,0.0,0.0,0.005677,0.011292,0.000258
4,4,Using Hydra to Spray User Passwords,How attackers bypass account lockout when brut...,heard password brute force attack brute force ...,4,https://medium.com/swlh/using-hydra-to-spray-u...,0.001123,0.002613,0.002559,0.0,0.0,0.010631,0.0,0.0,0.033858,0.0


### Cosine Distance

In [None]:
topic_names = ['Business and Startups', 'Fundamental Coding', 'Data Science & Statistics', 'Literature', 'Web Development', 
               'Social Media & Branding', 'Marketing & Sales', 'Team Dynamics', 'Cloud Development', 'Machine Learning & Deep Learning']
topic_array = np.array(final[topic_names])
norms = np.linalg.norm(topic_array, axis = 1)

def compute_dists(top_vec, topic_array):
    ''' Returns cosine distances for top_vec compared to every article '''
    dots = np.matmul(topic_array, top_vec)
    input_norm = np.linalg.norm(top_vec)
    co_dists = dots / (input_norm * norms)
    return co_dists

def produce_rec(top_vec, topic_array, doc_topic_df, rand = 15):
    '''
    Produces a recommendation based on cosine distance.
    Rand variable controls level of randomness in output recommendation.
    '''
    # Add a bit of randomness to top_vec
    top_vec = top_vec + np.random.rand(10, ) / (np.linalg.norm(top_vec)) * rand
    co_dists = compute_dists(top_vec, topic_array)
    return final.loc[np.argmax(co_dists)]

In [None]:
top_vec = np.array([0, 0.8, 0, 0, 0, 0, 0, 0, 0, 0])

rec = produce_rec(top_vec, topic_array, final)
print(rec['Story URL'])
rec

https://medium.com/swlh/how-to-start-your-journey-as-an-ios-developer-a23b835b1fed?source=collection_archive---------35-----------------------


Unnamed: 0                                                                       8098
Titles                              How to Start Your Swift-iOS Journey as a Beginner
Subtitles                                                                           -
Text                                spoken io maybe told io pretty cool good salar...
Reading Time                                                                        4
Story URL                           https://medium.com/swlh/how-to-start-your-jour...
Business and Startups                                                       0.0167106
Fundamental Coding                                                          0.0231493
Data Science & Statistics                                                           0
Literature                                                                  0.0132692
Web Development                                                                     0
Social Media & Branding                               

### R2 Score

In [None]:
topic_names = ['Business and Startups', 'Fundamental Coding', 'Data Science & Statistics', 'Literature', 'Web Development', 
               'Social Media & Branding', 'Marketing & Sales', 'Team Dynamics', 'Cloud Development', 'Machine Learning & Deep Learning']
topic_array = np.array(final[topic_names])
# norms = np.linalg.norm(topic_array, axis = 1)
sc = MinMaxScaler(feature_range = (0, 1))
norm = sc.fit_transform(topic_array.T).T

top_vec = np.array([0, 0.8, 0, 0, 0, 0, 0, 0, 0, 0])

min = 1000
index = []
for i in range(len(final)):
  y = mean_squared_error(norm[i], top_vec)
  if y < min: 
    index.append(i)
    min = y

print(index)
print('The Closest Article is:')
print(final['Titles'][index[-3:]])
print(final['Story URL'][index[-3:]])
# final.loc[[index[-3:]]]

[0, 2, 7, 15, 25, 92, 137, 143, 144, 280, 455, 497]
The Closest Article is:
280    Block, Function, and Global Scope in JavaScript
455                    Java (Non-primitive data-types)
497                   Features C# has that Java should
Name: Titles, dtype: object
280    https://medium.com/swlh/block-function-and-glo...
455    https://medium.com/swlh/java-non-primitive-dat...
497    https://medium.com/swlh/features-c-has-that-ja...
Name: Story URL, dtype: object


In [None]:
index[-3:]

[4, 12, 75]