In [1]:
import pandas as pd
import json
import requests

In [2]:
def remove_duplicate_rows(df, column_name_to_filter):
    grouped = df.groupby(column_name_to_filter)
    index = [gp_keys[0] for gp_keys in grouped.groups.values()]
    return df.reindex(index)

## Formatting output from "Session Buddy"

In [40]:
with open('data.json') as f:
    string = ""
    for line in f:
        string += line
        
data = json.loads(string)
df = pd.DataFrame(data["tabs"])

In [43]:
output_df = df[['title','url']]
output_df.insert( 2,'user', "dataradar")
output_df.insert( 3,'tags',"")
output_df.insert( 4,'twitter',True)
output_df.insert( 5,'auto_format',False)

output_df = remove_duplicate_rows(output_df, 'url')

In [44]:
output_df.to_json("output.json", orient="records", force_ascii=False)

## Check Uniqueness in consolidated.json File

In [132]:
with open('consolidated.json') as f:
    string = ""
    for line in f:
        string += line
        
data = json.loads(string)
df = pd.DataFrame(data)

df = remove_duplicate_rows(df, 'url')

In [133]:
api_route = "https://www.feedcrunch.io/api/1.0/authenticated/get/article/exists/"

apikey_local = "91ce86bd-ae46-4250-b599-727c1970b609"
apikey_fc    = "4377190f-24a5-4ef7-a761-a877bf6218e3"

api_url = api_route + apikey_fc + "/"

def check_link_exists(url, user):
    r = requests.get(api_url+'?link='+url+'&posting_user='+user)
    try:
        return r.json()["exists"]
    except:
        return False

In [134]:
lines_2_delete = []

for index in range(df.shape[0]):
    
    url = df.iloc[index]['url']
    user = df.iloc[index]['user']
    
    if check_link_exists(url, user):
        lines_2_delete.append(index)
        print 'Duplicate Found: ' + url + " && user: " + user

In [135]:
df.drop(df.index[lines_2_delete], inplace=True)

df = df.sort_values(by=['url', 'title'], ascending=[True, True])

In [138]:
df.to_json("consolidated.json", orient="records", force_ascii=False)

## Describe Data

In [45]:
with open('consolidated.json') as f:
    string = ""
    for line in f:
        string += line
        
data = json.loads(string)
df = pd.DataFrame(data)

print "There are " + str(df.shape[0]) + " articles in dataset"
df = remove_duplicate_rows(df, 'url')
print "There are " + str(df.shape[0]) + " articles in dataset without duplicates"

There are 518 articles in dataset
There are 518 articles in dataset without duplicates


## Sorting Data

In [37]:
with open('consolidated.json') as f:
    string = ""
    for line in f:
        string += line
        
data = json.loads(string)
df = pd.DataFrame(data)

df = remove_duplicate_rows(df, 'url') 
df = df.sort_values(by=['url', 'title'], ascending=[True, True])

df.to_json("consolidated.json", orient="records", force_ascii=False)
df

Unnamed: 0,auto_format,title,twitter,url,user,tags
0,False,Getting Tensorflow to work with GPU NVidia GTX...,True,http://abhay.harpale.net/blog/machine-learning...,dataradar,
1,False,"Deep Learning - What, Why and Applications",True,http://aiehive.com/deep-learning-applications/,dataradar,
2,False,Normal and Negative Binomial Distributions,True,http://allendowney.blogspot.fr/2016/05/binomia...,dataradar,
3,False,Probability is hard - Part 3,True,http://allendowney.blogspot.fr/2016/05/probabi...,dataradar,
4,False,Probability is hard - Part 2,True,http://allendowney.blogspot.fr/2016/05/probabi...,dataradar,
5,False,Probability is hard - Part 1,True,http://allendowney.blogspot.fr/2016/05/probabi...,dataradar,
6,False,What is a distribution?,True,http://allendowney.blogspot.fr/2016/06/what-is...,dataradar,
7,False,Twitter sentiment analysis with R,True,http://analyzecore.com/2014/04/28/twitter-sent...,dataradar,
8,False,Twitter sentiment analysis based on affective ...,True,http://analyzecore.com/2014/05/11/twitter-sent...,dataradar,
9,False,Cohort analysis with R - Layer-cake graph - Pa...,True,http://analyzecore.com/2014/05/31/cohort-analy...,dataradar,


## Hashtag Automation

In [139]:
max_hashtags = 5

def hashtags_free_number(df, index):
    
    tags = df.iloc[index]['tags']
        
    str_list = tags.split(",")
    str_list = filter(None, str_list) # fastest
    
    return max_hashtags - len(str_list)

In [140]:
def check_substr(input_str, keyword):
    tmp_str = str.lower(input_str)
    tmp_str = str.replace(tmp_str, ' ', '')
    tmp_str = str.replace(tmp_str, '-', '')
    tmp_str = str.replace(tmp_str, '_', '')
    tmp_str = str.replace(tmp_str, '/', '')
    tmp_str = str.replace(tmp_str, '#', '')
    tmp_str = str.replace(tmp_str, '?', '')
    
    tmp_keyword = str.lower(keyword)
    tmp_keyword = str.replace(tmp_keyword, ' ', '')
    tmp_keyword = str.replace(tmp_keyword, '-', '')
    tmp_keyword = str.replace(tmp_keyword, '_', '')
    tmp_keyword = str.replace(tmp_keyword, '/', '')
    tmp_keyword = str.replace(tmp_keyword, '#', '')
    tmp_keyword = str.replace(tmp_keyword, '?', '')
    
    if tmp_keyword in tmp_str:
        return True
    else:
        return False

In [295]:
def check_pattern(df, index, pattern):
    
    if check_substr(df.iloc[index]['title'].encode("utf-8"), pattern):
        return True
    else:
        return False

In [296]:
def set_keyword(df, index, keyword_list, pattern = ""):
     
    if not check_pattern(df, index, pattern):
        return False
        
    tags = df.iloc[index]['tags']
        
    str_list = tags.split(",")
    str_list = filter(None, str_list) # fastest
    
    str_list = [s.encode('ascii') for s in str_list] # unicode to string
    
    for keyword in keyword_list:
        if len(str_list) >= max_hashtags:
            break
        if key not in str_list:
            str_list.append(keyword)
        
    df.set_value(index, "tags", ",".join(str_list))
    return max_hashtags - len(str_list)

In [297]:
def write_stats(df):
    data = []
    total_posts = df.shape[0]
    print "Posts Full = " + str(posts_full)+"/"+str(df.shape[0])

    for i in range (100):
        data.append(0)

    for index in range(df.shape[0]):
        tags = df.iloc[index]['tags']

        str_list = tags.split(",")
        str_list = filter(None, str_list) # fastest

        data[len(str_list)] += 1

    for i in range (100):
        if data[i] != 0:
            print "Number of rows with " + str(i) + " Hashtags : " + str(data[i]) + "/" + str(total_posts)

In [350]:
##################### LOADING JSON ##################################
with open('consolidated.json') as f:
    string = ""
    for line in f:
        string += line
        
data = json.loads(string)
df = pd.DataFrame(data)

df = remove_duplicate_rows(df, 'url') 
df = df.sort_values(by=['url', 'title'], ascending=[True, True])

###################### PARSING JSON #################################
posts_full = 0

for index in range(df.shape[0]):
    max_iteration = hashtags_free_number(df, index)
    
    if max_iteration > 0:
        
        for pattern, keywords_list in keywords.iteritems():  
            tmp_rslt = set_keyword(df, index, keywords_list, pattern)
            
            if tmp_rslt is not False:
                max_iteration = tmp_rslt
            
                if max_iteration <= 0:
                    break
        
        if max_iteration > 0 and df.iloc[index]['user'] == "dataradar":
            
            Last_Keywords = ['Data', 'DataScience','AI']            
 
            tmp_rslt = set_keyword(df, index, Last_Keywords, "")

            if tmp_rslt is not False:
                max_iteration = tmp_rslt
                        
        if max_iteration <= 0:
            posts_full += 1
        
    else:
        posts_full += 1

write_stats(df)

Posts Full = 239/518
Number of rows with 0 Hashtags : 62/518
Number of rows with 1 Hashtags : 28/518
Number of rows with 2 Hashtags : 14/518
Number of rows with 3 Hashtags : 127/518
Number of rows with 4 Hashtags : 48/518
Number of rows with 5 Hashtags : 239/518


In [352]:
df.to_json("consolidated.json", orient="records", force_ascii=False)
df

Unnamed: 0,auto_format,tags,title,twitter,url,user
0,False,"GPU,NVidia,DeepLearning,MachineLearning,Tensor...",Getting Tensorflow to work with GPU NVidia GTX...,True,http://abhay.harpale.net/blog/machine-learning...,dataradar
1,False,"DeepLearning,MachineLearning,NeuralNet,NeuralN...","Deep Learning - What, Why and Applications",True,http://aiehive.com/deep-learning-applications/,dataradar
2,False,"Statistic,Probability,Distribution,Data,DataSc...",Normal and Negative Binomial Distributions,True,http://allendowney.blogspot.fr/2016/05/binomia...,dataradar
3,False,"Probability,Statistic,Data,DataScience,AI",Probability is hard - Part 3,True,http://allendowney.blogspot.fr/2016/05/probabi...,dataradar
4,False,"Probability,Statistic,Data,DataScience,AI",Probability is hard - Part 2,True,http://allendowney.blogspot.fr/2016/05/probabi...,dataradar
5,False,"Probability,Statistic,Data,DataScience,AI",Probability is hard - Part 1,True,http://allendowney.blogspot.fr/2016/05/probabi...,dataradar
6,False,"Statistic,Probability,Distribution,Data,DataSc...",What is a distribution?,True,http://allendowney.blogspot.fr/2016/06/what-is...,dataradar
7,False,"SentimentAnalysis,NLP,MachineLearning,Data,Dat...",Twitter sentiment analysis with R,True,http://analyzecore.com/2014/04/28/twitter-sent...,dataradar
8,False,"SentimentAnalysis,NLP,MachineLearning,Data,Dat...",Twitter sentiment analysis based on affective ...,True,http://analyzecore.com/2014/05/11/twitter-sent...,dataradar
9,False,"PHP,WebApp,Data,DataScience,AI",Cohort analysis with R - Layer-cake graph - Pa...,True,http://analyzecore.com/2014/05/31/cohort-analy...,dataradar


In [349]:
keywords = {
    # DataRadar Oriented Tags 
    'Accuracy': ['Accuracy'],
    'Adversa': ['GAN', 'DeepLearning', 'MachineLearning', 'NeuralNet'],
    'Analytic': ['Analytic'],
    'Arxiv': ['Arxiv', 'Research'],
    'AutoEncoder': ['AE', 'AutoEncoder', 'DeepLearning', 'MachineLearning', 'NeuralNet'],
    'Bayes': ['Bayes', 'MachineLearning', 'Statistic'],
    'BigData': ['BigData'],
    'Boosting': ['Boosting', 'MachineLearning'],
    'CNN': ['CNN', 'ConvNet', 'DeepLearning', 'MachineLearning', 'NeuralNet'],
    'CUDA': ['CUDA', 'GPU', 'NVidia'],
    'Caffe': ['Caffe', 'DeepLearning', 'MachineLearning', 'NeuralNet', 'Library'],
    'Cassandra': ['Cassandra', 'Apache', 'BigData'],
    'Clustering': ['Clustering', 'MachineLearning'],
    'ComputerVision': ['ComputerVision', 'MachineLearning'],
    'ConvNet': ['CNN', 'ConvNet', 'DeepLearning', 'MachineLearning', 'NeuralNet'],
    'Convol': ['CNN', 'ConvNet', 'DeepLearning', 'MachineLearning', 'NeuralNet'],
    'CrossValidation': ['CrossValidation', "Statistic", 'MachineLearning'],
    'CuDNN': ['CuDNN', 'CUDA', 'GPU', 'NVidia'],
    'DNN': ['DNN', 'DeepLearning', 'MachineLearning', 'NeuralNet'],
    'DSSTNE': ['Amazon', 'DSSTNE', 'DeepLearning', 'MachineLearning', 'NeuralNet'],
    'DataAnalysis': ['DataAnalysis', 'MachineLearning', 'Statistic'],
    'DataLake': ['DataLake', 'BigData'],
    'DataStructure': ['DataStructure'],
    'DataViz': ['DataViz'],
    'Dataset': ['Dataset'],
    'DeepDream': ['DeepDream', 'Google', 'DeepLearning', 'MachineLearning', 'NeuralNet'],
    'DeepLearning': ['DeepLearning','MachineLearning', 'NeuralNet', 'NeuralNet', 'Library'],
    'Distribution': ['Statistic', 'Probability', 'Distribution'],
    'ElasticSearch': ['ElasticSearch', 'BigData'],
    'EnsembleLearning': ['EnsembleLearning', 'MachineLearning'],
    'EnsembleModel': ['EnsembleLearning', 'MachineLearning'],
    'FPGA': ['FPGA'],
    'FaceDetection': ['FaceDetection', 'ComputerVision'],
    'FaceRecognition': ['FaceRecognition', 'ComputerVision'],
    'FeatureExtraction': ['FeatureExtraction', 'MachineLearning'],
    'FeatureSelection': ['FeatureSelection', 'MachineLearning'],
    'Flink': ['Flink', 'Apache', 'BigData'],
    'GAN': ['GAN', 'DeepLearning', 'MachineLearning', 'NeuralNet'],
    'Generali': ['Generalisation', 'MachineLearning'],
    'GenerativeModel': ['GenerativeModel', 'MachineLearning'],
    'GettingStarted': ['GettingStarted'],
    'Gradient': ['Gradient', 'MachineLearning'],
    'GradientBoosting': ['GradientBoosting', 'MachineLearning'],
    'GradientDescent': ['GradientDescent', 'MachineLearning'],
    'Guide': ['Guide'],
    'Hadoop': ['Hadoop', 'Apache', 'BigData'],
    'HowTo': ['HowTo', 'GettingStarted'],
    'HyperParameter': ['HyperParameter', 'MachineLearning'],
    'ImageAnalysis': ['ImageAnalysis', 'ComputerVision', 'MachineLearning'],
    'ImageCaptioning': ['ImageCaptioning', 'ComputerVision', 'MachineLearning'],
    'ImageClassification': ['ImageClassification', 'ComputerVision', 'MachineLearning'],
    'ImageProcessing': ['ImageProcessing', 'ComputerVision', 'MachineLearning'],
    'ImageRecognition': ['ImageRecognition', 'ComputerVision', 'MachineLearning'],
    'ImageSegmentation': ['ImageSegmentation', 'ComputerVision', 'MachineLearning'],
    'ImageUnderstanding': ['ImageUnderstanding', 'ComputerVision', 'MachineLearning'],
    'KMeans': ['KMeans', 'MachineLearning', 'Clustering'],
    'KNN': ['KNN', 'MachineLearning', 'Classification'],
    'Kaggle': ['Competition', 'Kaggle', 'MachineLearning'],
    'Keras': ['DeepLearning', 'Keras', 'MachineLearning', 'NeuralNet', 'Library'],
    'LDA': ['LDA', 'MachineLearning', 'FeatureSelection'],
    'LSTM': ['DeepLearning', 'LSTM', 'MachineLearning', 'RNN', 'NeuralNet'],
    'Lasagne': ['DeepLearning', 'Lasagne', 'MachineLearning','NeuralNet','Library'],
    'MNIST': ['MNIST'],
    'MXNet': ['DeepLearning', 'MXNet', 'MachineLearning', 'NeuralNet', 'Library'],
    'MachineLearning': ['MachineLearning'],
    'MariaDB': ['Database', 'MariaDB'],
    'Markov': ['MarkovChain', 'MachineLearning', 'NeuralNet'],
    'MissingValue': ['MissingValue'],
    'Model': ['Model'],
    'MongoDB': ['Database', 'MongoDB'],
    'MonteCarlo': ['MonteCarlo', 'Stochastic'],
    'MultiClass': ['MultiClass', 'MachineLearning'],
    'MultiLabel': ['MultiClass', 'MachineLearning'],
    'MySQL': ['Database', 'MySQL'],
    'NVidia': ['NVidia'],
    'Neo4J': ['Neo4J', 'Database', 'GraphDB'],
    'NLP': ['NLP', 'DeepLearning', 'MachineLearning'],
    'NonLinear': ['NonLinear', 'MachineLearning'],
    'NoSQL': ["NoSQL", 'Database', 'BigData'],
    'Numpy': ['Numpy', 'Python', 'Library'],
    'ObjectDetection': ['ObjectDetection', "ComputerVision", "DeepLearning", 'MachineLearning'],
    'ObjectRecognition': ['ObjectRecognition', "ComputerVision", "DeepLearning", 'MachineLearning'],
    'ObjectSegmentation': ['ObjectSegmentation', "ComputerVision", "DeepLearning", 'MachineLearning'],
    'Pandas': ['Pandas', 'Python', 'Library', 'DataManipulation'],
    'Pipeline': ['DataPipeline', 'MachineLearning'],
    'PostgreSQL': ['Database', 'PostgreSQL'],
    'Proba': ['Probability', 'Statistic'],
    'Propaga': ['BackPropagation', 'DeepLearning', 'NeuralNet'],
    'Python': ['Python'],
    'RSS': ['RSS', 'Feed'],
    'Recurrent': ['DeepLearning', 'MachineLearning', 'RNN', 'NeuralNet'],
    'SVM': ['MachineLearning', 'SVM', 'Classification'],
    'Scikit': ['Python', 'Scikit', 'MachineLearning', 'Library'],
    'Scipy': ['Python', 'Scipy', 'MachineLearning', 'Library'],
    'SelectFeatu': ['FeatureSelection', 'MachineLearning'],
    'Sentiment': ['SentimentAnalysis', 'NLP', 'MachineLearning'],
    'Spark': ['Apache', 'Spark', 'BigData'],
    'Streaming': ['Streaming', 'MachineLearning'],
    'TensorFlow': ['DeepLearning', 'MachineLearning', 'TensorFlow', 'NeuralNet', 'Library'],
    'TextMining': ['TextMining', 'MachineLearning'],
    'TextProcess': ['TextProcessing', 'MachineLearning'],
    'Theano': ['DeepLearning', 'MachineLearning', 'Theano', 'Library'],
    'Tutorial': ['Tutorial', 'GettingStarted'],
    'Visuali': ['DataViz', 'MachineLearning'],
    'vision': ['ComputerVision', 'MachineLearning'],
     
    # Engineering Oriented Tags 
    '3D': ['3D'],
    '3DModels': ['3DModels'],
    'ABTest': ['ABTest'],
    'API': ['API'],
    'AWS': ['AWS', 'Amazon', 'Cloud'],
    'AWSLambda': ['AWSLambda', 'ServerLess', 'AWS', 'Amazon', 'Cloud'],
    'AddOn': ['AddOn'],
    'Agile': ['Agile'],
    'Angular': ['Angular', 'JS', 'WebApp'],
    'AutoScaling': ['AutoScaling'],
    'Automation': ['Automation'],
    'Backup': ['Backup'],
    'BeautifulSoup': ['BeautifulSoup', 'Python'],
    'Benchmark': ['Benchmark'],
    'Botnet': ['Botnet', 'ITSecurity', 'InfoSec'],
    'C++': ['C++'],
    'CPU': ['CPU'],
    'CSS': ['CSS', 'WebApp'],
    'Cache': ['Caching'],
    'Caching': ['Caching'],
    'ChatBot': ['ChatBot'],
    'Collab': ['Collaborative'],
    'Container': ['Container'],
    'Design': ['Design'],
    'Development': ['Development'],
    'Django': ['Django', 'Python', 'WebApp', 'ORM'],
    'Docker': ['Docker', 'Container', 'Virtualisation', 'DataCenter', 'VM'],
    'Facebook': ['Facebook'],
    'Framework': ['Framework'],
    'FundRaising': ['FundRaising'],
    'Funding': ['Funding'],
    'GCE': ['GCE', 'Google', 'Cloud'],
    'GCP': ['GCP', 'Google', 'Cloud'],
    'GPU': ['GPU'],
    'Git': ['Git', 'OpenSource'],
    'Github': ['Github', 'Git', 'OpenSource'],
    'Gitlab': ['Gitlab', 'Git', 'OpenSource'],
    'GrowthHacking': ['GrowthHacking'],
    'HTML': ['HTML', 'WebApp'],
    'Handbook': ['Handbook'],
    'Haskell': ['Haskell'],
    'IPython': ['IPython', 'Notebook', 'Python'],
    'Industry': ['Industry'],
    'Interactive': ['Interactive'],
    'InternetOfThi': ['IoT'],
    'Interview': ['Interview'],
    'IoT': ['IoT'],
    'JQuery': ['JQuery', 'JS', 'WebApp'],
    'JS': ['JS', 'WebApp'],
    'JSON': ['JSON'],
    'Java': ['Java'],
    'Javascript': ['JS', 'WebApp'],
    'Jupyter': ['Jupyter', 'Notebook', 'Python'],
    'LaTeX': ['LaTeX'],
    'Microsoft': ['Microsoft'],
    'Migration': ['Migration', 'Database', 'ORM'],
    'MOOC': ['MOOC', 'Learning'],
    'MVP': ["MVP", 'Startup'],
    'NodeJS': ['NodeJS', 'JS', 'WebApp'],
    'Notebook': ['Notebook', 'Notebook', 'Python'],
    'OpenGL': ['OpenGL'],
    'OpenCV': ["OpenCV", 'ComputerVision', 'C++', 'Python'],
    'PHP': ['PHP', 'WebApp'],
    'REST': ['REST'],
    'RSS': ['RSS', 'Feed'],
    'Scala': ['Scala'],
    'ServerLess': ['ServerLess'],
    'VC': ['VC', 'FundRaising', 'Investor'],
    'WebGL': ['WebGL']
}