In [33]:
import re
import warnings
import xml.etree.ElementTree as ET

import pyspark as ps
from pyspark.sql import *
from pyspark.sql import functions as F


In [2]:
try:
    # Start a local SparkContext with 4 worker threads
    # (pass 'local[*]' instead to use every available core).
    sc = ps.SparkContext('local[4]')
    print("Just created a SparkContext")
except ValueError:
    # A SparkContext already exists (e.g. when running inside the pyspark
    # shell): warn and keep using the one that is already in scope.
    warnings.warn("SparkContext already exists in this scope")

# Later cells call `spark.read` / `spark.sql`, so make sure a SparkSession is
# bound to `spark`. getOrCreate() reuses the active context/session if any.
spark = SparkSession.builder.getOrCreate()

import sys


In [3]:
# Bare expression: the notebook renders the SparkContext `sc` as this cell's output.
sc

### Load the data into two tables:

In [75]:
# Read the posts CSV and keep only the id, title and body columns,
# exposing the id column under the plain name `post_id`.
df = (
    spark.read
    .csv(
        'csvs/posts.csv',
        sep=",",            # field delimiter
        quote='"',          # quoting character
        header=True,        # take column names from the header row
        inferSchema=True,   # let Spark infer the column types
    )
    .selectExpr(
        "`postId:ID(Post)` as post_id",
        'title as title',
        'body as body',
    )
)


In [5]:
# Read the post/tag relationship CSV; column types are inferred by Spark.
tags = spark.read.csv(
    'csvs/tags_posts_rel.csv',
    sep=",",            # field delimiter
    quote='"',          # quoting character
    header=True,        # take column names from the header row
    inferSchema=True,
)

In [76]:
# Preview the first two rows of the posts DataFrame.
df.show(2)

+-------+--------------------+--------------------+
|post_id|               title|                body|
+-------+--------------------+--------------------+
|      4|While applying op...|<p>I want to use ...|
|      6|Percentage width ...|<p>I have an abso...|
+-------+--------------------+--------------------+
only showing top 2 rows



In [7]:
# Preview the first two rows of the tags DataFrame.
# NOTE(review): the saved output under this cell lists 10 rows, so it was
# produced by an earlier version of this call; re-run to refresh it.
tags.show(2)

+---------------+-------------------+
|:START_ID(Post)|       :END_ID(Tag)|
+---------------+-------------------+
|              4|                 c#|
|              4|           winforms|
|              4|    type-conversion|
|              4|            decimal|
|              4|            opacity|
|              6|               html|
|              6|                css|
|              6|               css3|
|              6|internet-explorer-7|
|              9|                 c#|
+---------------+-------------------+
only showing top 10 rows



In [8]:
# Number of rows in the posts DataFrame.
df.count()

39646923

In [9]:
# Print the column names and types of the posts DataFrame.
# NOTE(review): the saved output under this cell still lists the raw CSV
# columns (postId:ID(Post), score, views, comments), i.e. it predates the
# selectExpr trim in the load cell; re-run to refresh it.
df.printSchema()

root
 |-- postId:ID(Post): integer (nullable = true)
 |-- title: string (nullable = true)
 |-- body: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- views: integer (nullable = true)
 |-- comments: integer (nullable = true)



In [13]:
# Print the column names and types of the tags DataFrame.
tags.printSchema()

root
 |-- :START_ID(Post): integer (nullable = true)
 |-- :END_ID(Tag): string (nullable = true)



### First, take a look at the tags and see how many we are working with

In [62]:
# Rename the awkward CSV header names to plain identifiers.
# withColumnRenamed is a no-op when the source column is absent, so this cell
# can be re-run safely; a selectExpr on the old names would fail once `tags`
# has already been renamed.
tags = (
    tags
    .withColumnRenamed(":START_ID(Post)", "post_id")
    .withColumnRenamed(":END_ID(Tag)", "tag")
)
tags.show(2)
tags.printSchema()

+-------+--------+
|post_id|     tag|
+-------+--------+
|      4|      c#|
|      4|winforms|
+-------+--------+
only showing top 2 rows

root
 |-- post_id: integer (nullable = true)
 |-- tag: string (nullable = true)



Let's select only tags that have over 50k posts; that should give us plenty to work with.

In [63]:
# Expose `tags` to Spark SQL as the temporary view `post_tags`.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
tags.createOrReplaceTempView('post_tags')

In [64]:
# Tags that are attached to more than 50,000 posts.
tags_with_50k = spark.sql("""
                    SELECT tag
                    FROM post_tags
                    GROUP BY tag
                    HAVING COUNT(post_id) > 50000
                    """)
# Make the result queryable as the temporary view `top_tags`
# (createOrReplaceTempView replaces the deprecated registerTempTable).
tags_with_50k.createOrReplaceTempView('top_tags')


In [65]:
# Display up to 100 of the tags that passed the 50k-post threshold.
tags_with_50k.show(100)

+-------------------+
|                tag|
+-------------------+
|                 qt|
|             iphone|
|              xcode|
|              azure|
|            android|
|          algorithm|
|         postgresql|
|            angular|
|           winforms|
|         powershell|
|            node.js|
|                 c#|
|     multithreading|
|            cordova|
|                api|
|             vb.net|
|        google-maps|
|             spring|
|               html|
|            asp.net|
|               perl|
|               linq|
|    sql-server-2008|
|            mongodb|
|              excel|
|           database|
|              forms|
|               json|
|               ruby|
|             pandas|
|              mysql|
|   entity-framework|
|              macos|
|       web-services|
|               ajax|
|              html5|
|  twitter-bootstrap|
|              loops|
|             jquery|
|         javascript|
|            eclipse|
|                css|
|amazon-we

Now that we have all those tags, let's fish out the relevant ids and posts we will be working with. This will become our data to analyze.

In [78]:
# Distinct ids of every post that carries at least one tag from `top_tags`.
relevant_ids_query = """
                    SELECT DISTINCT post_tags.post_id as post_id
                    FROM post_tags
                    INNER JOIN top_tags
                    ON post_tags.tag = top_tags.tag
                    """
inner_join = spark.sql(relevant_ids_query)


In [79]:
# Preview two of the selected post ids.
inner_join.show(2)

+-------+
|post_id|
+-------+
|1048379|
|1460361|
+-------+
only showing top 2 rows



In [81]:
# Register both inputs as temporary views so they can be joined in SQL.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
df.createOrReplaceTempView('all_posts')
inner_join.createOrReplaceTempView('relevant_ids')

# Keep only the posts whose id appears in `relevant_ids`.
relevant_posts = spark.sql("""
                            SELECT all_posts.post_id, all_posts.title, all_posts.body
                            FROM all_posts
                            INNER JOIN relevant_ids
                            ON all_posts.post_id = relevant_ids.post_id
                            """)
relevant_posts.show(5)

+-------+--------------------+--------------------+
|post_id|               title|                body|
+-------+--------------------+--------------------+
|    833|Editing database ...|<p>I have designe...|
|   1829|How do I make a m...|<p>I've got a men...|
|   6658|     JUnit vs TestNG|<p>At work we are...|
|   7880|How do you open a...|<p>I want to open...|
|   9376|ILMerge Best Prac...|<p>Do you use ILM...|
+-------+--------------------+--------------------+
only showing top 5 rows



### CREATE a NEW TABLE

In [None]:
# DDL for the `posts` table: integer primary key `postid`, integer `posttype`,
# and text columns `body` and `tags`.
create_query = '''
CREATE TABLE posts (
    postid int PRIMARY KEY,
    posttype int,
    body text,
    tags text)
'''
# NOTE(review): `cur` is not defined anywhere in the visible notebook. It is
# presumably a psycopg2 cursor (a later cell calls `cur.mogrify`) - confirm it
# is created, together with its connection, before this cell runs.
cur.execute(create_query)

### INSERT DATA

In [None]:
# Path of the posts XML file, and an incremental parser over it; the
# (event, element) pairs are consumed later by the insert loop.
sample_file = 'data/Posts.xml'
data = ET.iterparse(sample_file)

In [None]:
# INSERT template for `posts`; the `{}` slot is filled with a VALUES tuple via
# str.format. The column list matches the CREATE TABLE definition, whose last
# column is `tags` (not `tag`).
insert_query = "INSERT INTO posts (postid, posttype, body, tags) VALUES {};"

In [None]:
# lis="google"
# stri= "whoa {}"
# stri.format(lis)

In [None]:
# Stream the parsed XML and insert one `posts` row per element that has an Id.
# NOTE(review): `tags` below rebinds the name that held the Spark tags
# DataFrame earlier in the notebook; later cells read these loop variables,
# so the names are kept as they are.
# NOTE(review): assumes `cur` is a DB-API cursor using the `%s` paramstyle
# (psycopg2, judging by the earlier use of `mogrify`); the owning connection
# is not visible here - commit on it after the loop.
for event, elem in data:
    id_data = elem.attrib.get('Id')
    if id_data is None:
        # No Id attribute (e.g. an enclosing container element): postid is the
        # PRIMARY KEY and cannot be NULL, so there is nothing to insert.
        continue
    posttypeid = elem.attrib.get('PostTypeId')
    # No trailing comma here: a trailing comma would turn `body` into a 1-tuple.
    body = elem.attrib.get('Body')
    tags = elem.attrib.get('Tags')
    if tags:
        # '<html><css>' -> 'html css'
        cleaned_tag = re.sub(r'<', '', tags)
        tags = cleaned_tag.split('>')[:-1]
        tags = ' '.join(tags)
    # execute() actually runs the statement (mogrify only renders it), and
    # passing the values separately lets the driver quote and escape them.
    cur.execute(
        "INSERT INTO posts (postid, posttype, body, tags) VALUES (%s, %s, %s, %s)",
        (id_data, posttypeid, body, tags),
    )
    # Release the element's contents so memory stays flat on a large file.
    elem.clear()


In [None]:
# Hand-written sample row; presumably (postid, posttype, body, tags), matching
# the INSERT column list used elsewhere in this notebook.
# NOTE(review): this rebinds `data`, which previously held the ET.iterparse iterator.
data = ('6', '1', '<p>I have an absolutely positioned <code>div</code> containing several children, one of which is a relatively positioned <code>div</code>. When I use a <strong>percentage-based width</strong> on the child <code>div</code>, it collapses to \'0\' width on <a href="http://en.wikipedia.org/wiki/Internet_Explorer_7" rel="noreferrer">Internet&nbsp;Explorer&nbsp;7</a>, but not on Firefox or Safari.</p>\n\n<p>If I use <strong>pixel width</strong>, it works. If the parent is relatively positioned, the percentage width on the child works.</p>\n\n<ol>\n<li>Is there something I\'m missing here?</li>\n<li>Is there an easy fix for this besides the <em>pixel-based width</em> on the\nchild?</li>\n<li>Is there an area of the CSS specification that covers this?</li>\n</ol>\n', '<html><css><css3><internet-explorer-7>')

In [None]:
# Sample row with the body and tags wrapped in PostgreSQL-style `$$` dollar
# quotes. The `$$` markers have to sit inside the Python string literals;
# written outside the quotes they are a SyntaxError.
# NOTE(review): with parameterised queries no dollar quoting is needed at all.
data = ('6', '1', '$$<p>I have an absolutely positioned <code>div</code> containing several children, one of which is a relatively positioned <code>div</code>. When I use a <strong>percentage-based width</strong> on the child <code>div</code>, it collapses to \'0\' width on <a href="http://en.wikipedia.org/wiki/Internet_Explorer_7" rel="noreferrer">Internet&nbsp;Explorer&nbsp;7</a>, but not on Firefox or Safari.</p>\n\n<p>If I use <strong>pixel width</strong>, it works. If the parent is relatively positioned, the percentage width on the child works.</p>\n\n<ol>\n<li>Is there something I\'m missing here?</li>\n<li>Is there an easy fix for this besides the <em>pixel-based width</em> on the\nchild?</li>\n<li>Is there an area of the CSS specification that covers this?</li>\n</ol>\n$$', '$$<html><css><css3><internet-explorer-7>$$')
data

In [None]:
# Parameterised INSERT for one post. The values travel separately from the SQL
# text so the driver quotes and escapes them; run it with
# `cur.execute(query, query_params)`.
# NOTE(review): assumes the `%s` paramstyle (psycopg2) - confirm against the driver in use.
query = "INSERT INTO posts (postid, posttype, body, tags) VALUES (%s, %s, %s, %s)"
query_params = (id_data, posttypeid, body, tags)

In [None]:


# Insert one hand-written sample row (postid 10) through a parameterised query.
# Passing the values separately avoids manual `$$` quoting, which stored the
# surrounding quote characters as part of the body and tags text.
# NOTE(review): assumes `cur` uses the `%s` paramstyle (psycopg2) - confirm.
sample_body = '<p>I have an absolutely positioned <code>div</code> containing several children, one of which is a relatively positioned <code>div</code>. When I use a <strong>percentage-based width</strong> on the child <code>div</code>, it collapses to \'0\' width on <a href="http://en.wikipedia.org/wiki/Internet_Explorer_7" rel="noreferrer">Internet&nbsp;Explorer&nbsp;7</a>, but not on Firefox or Safari.</p>\n\n<p>If I use <strong>pixel width</strong>, it works. If the parent is relatively positioned, the percentage width on the child works.</p>\n\n<ol>\n<li>Is there something I\'m missing here?</li>\n<li>Is there an easy fix for this besides the <em>pixel-based width</em> on the\nchild?</li>\n<li>Is there an area of the CSS specification that covers this?</li>\n</ol>\n'
sample_tags = '<html><css><css3><internet-explorer-7>'
cur.execute(
    "INSERT INTO posts (postid, posttype, body, tags) VALUES (%s, %s, %s, %s)",
    (10, 1, sample_body, sample_tags),
)

# Display the last `id_data` value left over from the XML loop.
id_data

In [None]:
# Count the rows currently stored in `posts` and show the fetched result.
row_count_sql = """SELECT COUNT (*) from posts"""
cur.execute(row_count_sql)
cur.fetchall()

In [None]:
# cur.execute("""INSERT INTO posts (postid, posttype, body, tags) VALUES ({}, {}, $$'{}'$$, $$'{}'$$)""".format(11,1,body, tags))
# inserted 5 extra characters at the beginning, 6 extra characters at the end

In [None]:
# Display the `body` value left over from the XML insert loop.
body