# Import Libraries

In [1]:
import inspect
import pandas as pd

from stack.Stack import stack_scrape

In [2]:
# Scrape settings; unpacked as keyword arguments into `stack.scrape_data(**params)`.
params = dict(
    base_url='https://stats.stackexchange.com/questions/tagged/',  # tag listing to crawl
    tag='python',            # tag whose questions are listed
    query_filter='Votes',    # listing order / filter
    max_pages=50,            # maximum number of pages to scrape
    pagesize=50,             # records per page
)

In [3]:
# Instantiate the scraper imported from the project-local `stack.Stack` module.
stack = stack_scrape()

In [4]:
# Print the docstring of `scrape_data` to document the keys used in `params`.
print(inspect.getdoc(stack.scrape_data))

Iterates over all selected pages, applying the extraction function to each page and gathering the extracted data in a JSON structure

Parameters
----------
base_url: URL path to all questions filtered by a tag
                        - stackexchange: https://stats.stackexchange.com/questions/tagged/
                        - stackoverflow: https://stackoverflow.com/questions/tagged/

tag: tag to be filtered (e.g.: 'python', 'r', 'javascript', ...)

query_filter: filter to perform a query ('Newest', 'Active', 'Bounties', 'Unanswered', 'Frequent', 'Votes')

max_pages: the maximum number of pages to be scraped

pagesize: the number of records per page (the maximum number is 50)


Returns
-------
a DataFrame with the 'Question', 'Number of Votes', 'question-related tags', 'number of responses' and 'number of views' data
 from the records of all selected pages


In [5]:
# Run the scrape with the settings defined in `params` (one keyword argument per key).
dfStackExchange = stack.scrape_data(**params)

In [6]:
# Preview up to the first 20 scraped rows.
dfStackExchange.head(20)

Unnamed: 0,question,id,votes,tags,answer,views
0,Python as a statistics workbench,1595,369,r spss stata python,26,133000
1,What is batch size in neural network?,153531,225,neural-networks python terminology keras,5,370000
2,What is an embedding layer in a neural network?,182775,104,machine-learning neural-networks python word-e...,2,81000
3,"What loss function for multi-class, multi-labe...",207794,85,neural-networks python loss-functions keras cr...,6,106000
4,What algorithm should I use to detect anomalie...,152644,75,machine-learning time-series python computatio...,9,84000
5,"How to split the dataset for cross validation,...",95797,73,machine-learning cross-validation python sciki...,1,69000
6,How do R and Python complement each other in d...,238726,56,r python software,9,13000
7,How does one interpret SVM feature weights?,39243,53,svm feature-selection python scikit-learn,5,94000
8,Machine Learning using Python,8817,53,machine-learning python,10,15000
9,Pandas / Statsmodel / Scikit-learn,47913,51,machine-learning python scikit-learn statsmode...,2,29000


# DataFrame of Tags information

In [60]:
def TagsArrayStack(df, topIncidence=25, topViews=15, tagsFilter=('python', 'r')):
    """Return the most-viewed tags among the most frequent tags in ``df``.

    Each whitespace-separated tag in ``df['tags']`` is counted once per
    occurrence and credited with the views of the question it appears on.
    The tags are narrowed to the ``topIncidence`` most frequent ones, then to
    the ``topViews`` of those with the highest views per occurrence, and
    finally the tags listed in ``tagsFilter`` are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'tags' (space-separated tag names) and
        'views' (numeric). Rows are read positionally, so any index works.
        Rows whose 'tags' value is not a string (e.g. NaN/None) are skipped.
    topIncidence : int, default 25
        How many of the most frequent tags to keep in the first cut.
    topViews : int, default 15
        How many tags, ranked by views per occurrence, to keep in the second cut.
    tagsFilter : iterable of str, default ('python', 'r')
        Tags removed from the final selection.

    Returns
    -------
    list of str
        Tag names ordered by descending views per occurrence. Because the
        filter is applied after the cuts, the list can be shorter than
        ``topViews``. An input without any tags yields an empty list.
    """
    incidence = {}
    viewsTotal = {}

    # Pair each row's tags with that same row's views. Looking the value up by a
    # running counter used as an index label would fail, or silently read
    # another row, on frames that do not carry a default 0..n-1 index.
    for tags, views in zip(df['tags'], df['views']):
        if not isinstance(tags, str):
            continue  # missing tags: nothing to count for this question
        for tag in tags.split():
            incidence[tag] = incidence.get(tag, 0) + 1
            viewsTotal[tag] = viewsTotal.get(tag, 0) + views

    if not incidence:
        return []

    # Both dicts receive their keys in the same loop, so their insertion
    # (first-appearance) orders line up row by row.
    DfTags = pd.DataFrame({
        'Tag': list(incidence),
        'Incidence': list(incidence.values()),
        'Views': list(viewsTotal.values()),
    })
    DfTags['ViewsPerIncidence'] = DfTags['Views'] / DfTags['Incidence']

    # First keep the most frequent tags, then rank those by average views.
    DfTags = DfTags.sort_values('Incidence', ascending=False).head(topIncidence)
    DfTags = DfTags.sort_values('ViewsPerIncidence', ascending=False).head(topViews)

    DfTags = DfTags[~DfTags['Tag'].isin(list(tagsFilter))]

    return DfTags['Tag'].to_list()

In [61]:
# Derive the list of selected tags from the scraped questions.
tagsArray = TagsArrayStack(dfStackExchange)

In [62]:
# Display the selected tags.
tagsArray

['keras',
 'neural-networks',
 'svm',
 'cross-validation',
 'scikit-learn',
 'deep-learning',
 'pandas',
 'clustering',
 'statsmodels',
 'pca',
 'random-forest',
 'logistic',
 'machine-learning']