# Import Libraries

In [1]:
import inspect
import pandas as pd

from stack.Stack import stack_scrape

In [2]:
# Scraper settings; unpacked later as keyword arguments of `scrape_data`.
params = dict(
    base_url='https://stats.stackexchange.com/questions/tagged/',
    tag='python',
    query_filter='Votes',
    max_pages=50,
    pagesize=50,
)

In [3]:
# Build the scraper and run it with the settings collected in `params`.
# NOTE(review): presumably this issues one request per page (up to max_pages),
# so re-running the cell is slow — confirm, and consider caching the result.
stack = stack_scrape()
scrape_data = stack.scrape_data(**params)

In [4]:
# Print the docstring of `scrape_data` as a reference for the parameters used above.
print(inspect.getdoc(stack.scrape_data))

Itera on all selected pages by rotating the function to extract the data from each page and gather it in a json

Paramaters
----------
base_url: url path to all question filter by a tag
                        - stackexchange: https://stats.stackexchange.com/questions/tagged/
                        - stackoverflow: https://stackoverflow.com/questions/tagged/
                        
tag: tag to be filtered (e.g.: 'python', 'r', 'javascript', ...)

query_filter: filter to perform a query ('Newest', 'Active', 'Bounties', 'Unanswered', 'Frequent', Votes')

max_pages: the maximum number of pages to be scraped

pagesize: the number of records per page (the maximum number is 50)


Returns
-------
a JSON with the 'Question', 'Number of Votes', 'question-related tags', 'number of responses' and 'number of views' data
 from the records of all selected pages


In [13]:
# Load the scraped records into a DataFrame, using the keys of the first
# record as the column order.
# Fail early with a readable message when the scrape returned nothing,
# instead of the bare IndexError that `scrape_data[0]` would raise.
if len(scrape_data) == 0:
    raise ValueError(
        "scrape_data is empty: the scraper returned no records, "
        "check the values in `params` and the network connection"
    )
dfStackExchange = pd.DataFrame(scrape_data, columns=list(scrape_data[0].keys()))

In [14]:
# Preview the first 20 scraped questions.
dfStackExchange.head(20)

Unnamed: 0,question,id,votes,tags,answer,views
0,Python as a statistics workbench,1595,369,r spss stata python,26,133000
1,What is batch size in neural network?,153531,225,neural-networks python terminology keras,5,370000
2,What is an embedding layer in a neural network?,182775,104,machine-learning neural-networks python word-e...,2,81000
3,"What loss function for multi-class, multi-labe...",207794,85,neural-networks python loss-functions keras cr...,6,106000
4,What algorithm should I use to detect anomalie...,152644,75,machine-learning time-series python computatio...,9,84000
5,"How to split the dataset for cross validation,...",95797,73,machine-learning cross-validation python sciki...,1,69000
6,How do R and Python complement each other in d...,238726,56,r python software,9,13000
7,How does one interpret SVM feature weights?,39243,53,svm feature-selection python scikit-learn,5,94000
8,Machine Learning using Python,8817,53,machine-learning python,10,15000
9,Pandas / Statsmodel / Scikit-learn,47913,51,machine-learning python scikit-learn statsmode...,2,29000


In [7]:
# Count how many questions carry each tag.
# The `tags` column holds whitespace-separated tag names, one string per question.
bagOfWords = {}
for tag_string in dfStackExchange['tags']:
    for tag in tag_string.split():
        previous_count = bagOfWords.get(tag, 0)
        bagOfWords[tag] = previous_count + 1

In [8]:
# Display the tag -> number-of-questions mapping.
bagOfWords

{'r': 249,
 'spss': 7,
 'stata': 4,
 'python': 2500,
 'neural-networks': 150,
 'terminology': 3,
 'keras': 73,
 'machine-learning': 568,
 'word-embeddings': 5,
 'loss-functions': 18,
 'cross-entropy': 3,
 'time-series': 225,
 'computational-statistics': 10,
 'anomaly-detection': 11,
 'cross-validation': 59,
 'scikit-learn': 428,
 'software': 12,
 'svm': 71,
 'feature-selection': 47,
 'statsmodels': 101,
 'pandas': 59,
 'deep-learning': 61,
 'survival': 28,
 'mortality': 1,
 'regression': 308,
 'logistic': 80,
 'cart': 18,
 'accuracy': 12,
 'precision-recall': 12,
 'curve-fitting': 24,
 'predictive-models': 33,
 'mape': 5,
 'pca': 59,
 'tensorflow': 52,
 'autoencoders': 11,
 'outliers': 27,
 'change-point': 13,
 'classification': 130,
 'conditional-random-field': 8,
 'libsvm': 5,
 'c++': 1,
 'dimensionality-reduction': 33,
 'discriminant-analysis': 8,
 'svd': 15,
 'clustering': 69,
 'k-means': 22,
 'circular-statistics': 6,
 'multicollinearity': 7,
 'standard-error': 9,
 'regression-coe

In [9]:
# One-column frame indexed by tag, holding the occurrence count as 'Quantity'.
tagQuery = pd.Series(bagOfWords).to_frame(name='Quantity')

In [10]:
# Tags left out of the ranking.
tagFilter = ['python', 'r']
# Keep the 25 most frequent remaining tags, highest count first.
querryArray = (
    tagQuery
    .drop(index=tagFilter, errors='ignore')
    .sort_values(by='Quantity', ascending=False)
    .head(25)
)

In [11]:
# Reduce the ranked frame to a plain list of tag names.
# The name is rebound from a DataFrame to a list, so guard the conversion:
# re-running this cell would otherwise fail with AttributeError because a
# list has no `.index` attribute to convert.
if not isinstance(querryArray, list):
    querryArray = querryArray.index.to_list()

In [12]:
# Display the selected tag names.
querryArray

['machine-learning',
 'scikit-learn',
 'regression',
 'time-series',
 'neural-networks',
 'classification',
 'statsmodels',
 'random-forest',
 'scipy',
 'logistic',
 'data-visualization',
 'distributions',
 'bayesian',
 'keras',
 'svm',
 'clustering',
 'correlation',
 'probability',
 'deep-learning',
 'forecasting',
 'pca',
 'cross-validation',
 'pandas',
 'numpy',
 'arima']

## Convert Dtypes to int

In [16]:
# Cast the numeric columns to int so they can be summed per tag.
for column in ('votes', 'answer', 'views'):
    dfStackExchange[column] = dfStackExchange[column].astype(int)

## Votes

In [23]:
# Total votes per selected tag: each question adds its vote count to every
# selected tag it carries.
# Tags and votes are read in lockstep with zip. The previous version fed a
# manual row counter into the label-based `.loc`, which is only correct while
# the frame keeps its default 0..n-1 index.
selectedTags = set(querryArray)  # set membership instead of a list scan per tag
bagOfWordsVotes = {}
for tagString, votes in zip(dfStackExchange['tags'], dfStackExchange['votes']):
    for tag in tagString.split():
        if tag in selectedTags:
            bagOfWordsVotes[tag] = bagOfWordsVotes.get(tag, 0) + votes

In [24]:
# Display the vote totals per selected tag.
bagOfWordsVotes

{'neural-networks': 741,
 'keras': 462,
 'machine-learning': 1745,
 'time-series': 686,
 'cross-validation': 235,
 'scikit-learn': 1481,
 'svm': 234,
 'statsmodels': 311,
 'pandas': 187,
 'deep-learning': 201,
 'regression': 707,
 'logistic': 229,
 'pca': 238,
 'classification': 326,
 'clustering': 195,
 'forecasting': 148,
 'correlation': 147,
 'bayesian': 218,
 'probability': 134,
 'scipy': 201,
 'random-forest': 281,
 'data-visualization': 238,
 'distributions': 179,
 'numpy': 126,
 'arima': 104}

## Answers

In [26]:
# Total answers per selected tag: each question adds its answer count to every
# selected tag it carries.
# Tags and answer counts are read in lockstep with zip. The previous version
# fed a manual row counter into the label-based `.loc`, which is only correct
# while the frame keeps its default 0..n-1 index.
selectedTags = set(querryArray)  # set membership instead of a list scan per tag
bagOfWordsAnswers = {}
for tagString, answers in zip(dfStackExchange['tags'], dfStackExchange['answer']):
    for tag in tagString.split():
        if tag in selectedTags:
            bagOfWordsAnswers[tag] = bagOfWordsAnswers.get(tag, 0) + answers

In [27]:
# Display the answer totals per selected tag.
bagOfWordsAnswers

{'neural-networks': 155,
 'keras': 82,
 'machine-learning': 627,
 'time-series': 234,
 'cross-validation': 65,
 'scikit-learn': 460,
 'svm': 86,
 'statsmodels': 118,
 'pandas': 64,
 'deep-learning': 70,
 'regression': 303,
 'logistic': 84,
 'pca': 69,
 'classification': 140,
 'clustering': 78,
 'forecasting': 60,
 'correlation': 53,
 'bayesian': 74,
 'probability': 61,
 'scipy': 99,
 'random-forest': 100,
 'data-visualization': 99,
 'distributions': 91,
 'numpy': 67,
 'arima': 53}

## Views

In [29]:
# Total views per selected tag: each question adds its view count to every
# selected tag it carries.
# Tags and view counts are read in lockstep with zip. The previous version fed
# a manual row counter into the label-based `.loc`, which is only correct while
# the frame keeps its default 0..n-1 index.
selectedTags = set(querryArray)  # set membership instead of a list scan per tag
bagOfWordsViews = {}
for tagString, views in zip(dfStackExchange['tags'], dfStackExchange['views']):
    for tag in tagString.split():
        if tag in selectedTags:
            bagOfWordsViews[tag] = bagOfWordsViews.get(tag, 0) + views

In [30]:
# Display the view totals per selected tag.
bagOfWordsViews

{'neural-networks': 821472,
 'keras': 601861,
 'machine-learning': 1473521,
 'time-series': 430960,
 'cross-validation': 241735,
 'scikit-learn': 1715091,
 'svm': 372995,
 'statsmodels': 358799,
 'pandas': 214464,
 'deep-learning': 231158,
 'regression': 572457,
 'logistic': 223052,
 'pca': 191383,
 'classification': 298242,
 'clustering': 249296,
 'forecasting': 95332,
 'correlation': 87567,
 'bayesian': 88631,
 'probability': 84842,
 'scipy': 235168,
 'random-forest': 282101,
 'data-visualization': 187405,
 'distributions': 116347,
 'numpy': 154981,
 'arima': 72649}