In [1]:
import pandas as pd
import re

In [2]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    return re.sub(cleanr, '', raw_html)

In [3]:
def removeCharacters(text):
    bannedChars = ["\n","\r","\t","!","?"]
    for char in bannedChars:
        while char in text:
            text = text.replace(char, " ")
    return text

In [4]:
q = pd.read_csv('Questions.csv', encoding='ISO-8859-1')
#Get only relevant columns
q = q[["Id", "Score", "Title", "Body"]]
#Remove html tags
q['Body'] = q['Body'].apply(cleanhtml)
#Join Title and body
q["Title"] = q["Title"].map(str) + " " + q["Body"]
#Delete redundant body
q = q[["Id", "Score", "Title"]]

In [5]:
#Rename Title
q = q.rename(index=str, columns={"Title": "Question"})
#Set index to id
q = q.set_index('Id')
q.head()

Unnamed: 0_level_0,Score,Question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
80,26,SQLStatement.execute() - multiple queries in o...
90,144,Good branching and merging tutorials for Torto...
120,21,ASP.NET Site Maps Has anyone got experience cr...
180,53,Function for creating color wheels This is som...
260,49,Adding scripting functionality to .NET applica...


In [6]:
#Remove bad questions
q = q[q["Score"] > -1]
#Remove invalid characters
q["Question"] = q["Question"].apply(removeCharacters)
q.head()

Unnamed: 0_level_0,Score,Question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
80,26,SQLStatement.execute() - multiple queries in o...
90,144,Good branching and merging tutorials for Torto...
120,21,ASP.NET Site Maps Has anyone got experience cr...
180,53,Function for creating color wheels This is som...
260,49,Adding scripting functionality to .NET applica...


# Answers

In [7]:
a = pd.read_csv('Answers.csv', encoding='ISO-8859-1')
#Keep only relevant columns
a = a[["ParentId","Score","Body"]]

In [8]:
#remove html
a['Body'] = a['Body'].apply(cleanhtml)
#remove invalid characters
a['Body'] = a['Body'].apply(removeCharacters)
a.head()

Unnamed: 0,ParentId,Score,Body
0,90,13,Version Control with Subversion A very good...
1,80,12,"I wound up using this. It is a kind of a hack,..."
2,180,1,I've read somewhere the human eye can't distin...
3,260,4,"Yes, I thought about that, but I soon figured ..."
4,260,28,Oleg Shilo's C# Script solution (at The Code P...


In [9]:
def getFunctions(text):
    words = text.split(" ")
    functions = ""
    for word in words:
        if "(" in word and ")" in word:
            functions += word +","
    return functions[0:len(functions)-2]

In [10]:
def getURLs(text):
    urls = re.findall("(?P<url>https?://[^\s]+)", text)
    urlText = ""
    for url in urls:
        while True:
            if not url[len(url)-1].isdigit() and not url[len(url)-1].isalpha():
                url = url[0:len(url)-2]
            else:
                break
        urlText += url+","
    return urlText[0:len(urlText)-2]

In [11]:
a['URL'] = a['Body'].apply(getURLs)
a['Function'] = a['Body'].apply(getFunctions)
a.head()

Unnamed: 0,ParentId,Score,Body,URL,Function
0,90,13,Version Control with Subversion A very good...,,
1,80,12,"I wound up using this. It is a kind of a hack,...",,"stream.readUTFBytes(stream.bytesAvailable);,st..."
2,180,1,I've read somewhere the human eye can't distin...,,"Random();//assumes,(len(colors),rand.next(256)..."
3,260,4,"Yes, I thought about that, but I soon figured ...",,(DSL
4,260,28,Oleg Shilo's C# Script solution (at The Code P...,http://www.codeplex.com/Nu,


# Delete bad answers, delete answers without questions and visa versa

In [12]:
a = a[a["Score"] > -1]

#check all questions have an answer
q = q[q.index.isin(a["ParentId"])]
q.head()

Unnamed: 0_level_0,Score,Question
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
80,26,SQLStatement.execute() - multiple queries in o...
90,144,Good branching and merging tutorials for Torto...
120,21,ASP.NET Site Maps Has anyone got experience cr...
180,53,Function for creating color wheels This is som...
260,49,Adding scripting functionality to .NET applica...


In [13]:
#check answer has a question
a = a[a["ParentId"].isin(q.index)]
a.head()

Unnamed: 0,ParentId,Score,Body,URL,Function
0,90,13,Version Control with Subversion A very good...,,
1,80,12,"I wound up using this. It is a kind of a hack,...",,"stream.readUTFBytes(stream.bytesAvailable);,st..."
2,180,1,I've read somewhere the human eye can't distin...,,"Random();//assumes,(len(colors),rand.next(256)..."
3,260,4,"Yes, I thought about that, but I soon figured ...",,(DSL
4,260,28,Oleg Shilo's C# Script solution (at The Code P...,http://www.codeplex.com/Nu,


In [14]:
a.to_csv('cleaned_answers.csv')
q.to_csv('cleaned_questions.csv')

KeyboardInterrupt: 