# Exploratory Analysis
### Random code exploring the structure of the Stack Exchange answer Data

In [40]:
import pandas as pd
import numpy as np
import math
import re, string

## Check to see what % of questions have tag in content

### Define couning functions

In [2]:
def count_occur(tags, content):
    return sum([1 if tag in content else 0 for tag in tags])

def calc_tags_occur(df, name):
    df["tags_l"] = df["tags"].apply(lambda tags: tags.split(" "))
    df["numTags"] =  df["tags_l"].apply(lambda tags_list: len(tags_list))
    df["numTagsFound"] =  df.apply(lambda row: count_occur(row["tags_l"], row["content"]), axis=1)
    tags, occur = (sum(df["numTags"]), sum(df["numTagsFound"]))
    print("{0} has occurence percentage: {1}".format(name, occur/tags))
    return (tags, occur)

### Read and agg all datas

In [3]:
cooking = pd.read_csv("../dat/cooking.csv")
diy = pd.read_csv("../dat/diy.csv")
travel = pd.read_csv("../dat/travel.csv")
biology = pd.read_csv("../dat/biology.csv")
robotics = pd.read_csv("../dat/robotics.csv")
crypto = pd.read_csv("../dat/crypto.csv")
dfs = [(cooking, "cooking"), (diy, "diy"), (travel, "travel"), (biology, "biology"),
       (robotics, "robotics"), (crypto, "crypto")]

### Run counting funcs on all dfs

In [4]:
tags_occurs_all = [calc_tags_occur(df, name) for df, name in dfs]

cooking has occurence percentage: 0.4960047268020933
diy has occurence percentage: 0.44397842006460453
travel has occurence percentage: 0.1246977071662534
biology has occurence percentage: 0.16861360137643755
robotics has occurence percentage: 0.3179447852760736
crypto has occurence percentage: 0.20922932035787162


### Calculate Overall Percentage

In [5]:
tags, occur = [sum(x) for x in zip(*tags_occurs_all)]
print("Overall occurence percentage is {0}".format(occur/tags))

Overall occurence percentage is 0.28879620499427017


## Do the same for lemmatized/split words

In [45]:
from nltk.stem.porter import PorterStemmer
def calc_tags_occur_lemma(df, name):
    porter_stemmer = PorterStemmer()
    # split tags on space
    df["tags_l"] = df["tags"].apply(lambda tags: tags.split(" "))
    
    # some tags are compounds w/ "-" connectors, split those
    df["tags_l"] = df["tags_l"].apply(lambda tags: [tag.split("-") for tag in tags])
    
    # lemmatize each word in each tag, join into space delimited string
    df["tags_l"] = df["tags_l"].apply(lambda tag_list: lemmatize_tags(tag_list))
    
    # calculate the number of tags for each row
    df["numTags"] =  df["tags_l"].apply(lambda tags_list: len(tags_list))
    
    # clean out some ugly html 
    df["cleaned_content"] = df["content"].apply(lambda text: clean_content(text))
    
    # lemmatize each paragraph
    df["lemma_content"] = df["cleaned_content"].apply(lambda text: lemmatize_content(text))
    
    # calculate the # of occurances 
    df["numTagsFound"] =  df.apply(lambda row: count_occur(row["tags_l"], row["lemma_content"]), axis=1)
    
    # determine ratio
    tags, occur = (sum(df["numTags"]), sum(df["numTagsFound"]))
    print("{0} has occurence percentage: {1}".format(name, occur/tags))
    return (tags, occur)

def lemmatize_content(content):
    # remove puncuation
    no_punc = "".join([char for char in content if char not in string.punctuation])
    return " ".join([porter_stemmer.stem(w) for w in no_punc.split(" ")])

def lemmatize_tags(tag_list):
    lemma_list = [porter_stemmer.stem(tword) for tag in tag_list for tword in tag]
    return " ".join(lemma_list)

def clean_content(content):
    cleaned = remove_p_tags(content)
    cleaned = parse_a_tags(content)
    return cleaned

def remove_p_tags(content):
    return re.sub(r'\<\/?p\>', '', content).strip()

def parse_a_tags(content):
    return re.sub(r"\<a[^\>]*\>([^\<\>]+)\<\/a\>", r"\1", content)

In [46]:
tags_occurs_all = [calc_tags_occur_lemma(df, name) for df, name in dfs]

cooking has occurence percentage: 0.9935405311376551
diy has occurence percentage: 0.9971228557132202
travel has occurence percentage: 0.9904021579555011
biology has occurence percentage: 0.9899536656129563
robotics has occurence percentage: 0.9950026837439153
crypto has occurence percentage: 0.993843866287989


In [47]:
tags, occur = [sum(x) for x in zip(*tags_occurs_all)]
print("Overall occurence percentage is {0}".format(occur/tags))

Overall occurence percentage is 0.9928754076242737


## So lemmatizing and breaking the tags apart _really_ helps