In [1]:
import pandas as pd
import numpy as np
import os
import re

from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Read the two CSV files and stack them into a single frame.
valid = pd.read_csv('valid.csv')
train = pd.read_csv('train.csv')

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each file keeps its own row labels, so the result carries duplicate index
# labels, which makes label-based lookups and index alignment ambiguous.
data = pd.concat([valid, train], ignore_index=True)
# Parse the timestamp column so the .dt accessor can be used on it later.
data["CreationDate"] = pd.to_datetime(data["CreationDate"])
data.head()

In [3]:
# Captures the text between one pair of angle brackets, e.g. "<a><b>" -> a, b.
pattern = r"<([^<>]*)>"


def to_lst(tag):
    """Return every angle-bracketed name found in ``tag``, in order of appearance.

    Parameters
    ----------
    tag : str
        A string such as ``"<python><pandas>"``.

    Returns
    -------
    list of str
        The captured names without the brackets; empty if nothing matches.
    """
    return [match.group(1) for match in re.finditer(pattern, tag)]
# Replace each raw tag string with the list of names extracted by to_lst.
# Note: the column is overwritten in place, so running this cell a second time
# passes lists (not strings) to re.findall and fails.
data["Tags"] = data["Tags"].map(to_lst)
data

In [15]:
# Title is shared by the figure and by the name of the exported HTML file.
posts_title = "Number of Posts from December 2015 - March 2020"

# Histogram of the creation timestamps of all posts.
fig = px.histogram(
    data["CreationDate"],
    labels={"value": "Date", "count": "Count"},
    title=posts_title,
)
# Export a standalone HTML copy of the figure, then show it as the cell output.
fig.write_html(f"{posts_title}.html", include_plotlyjs='cdn')
fig

In [5]:
def clean_df(df, stop_words=None):
    """Add a ``Body_processed`` column holding a cleaned, lower-cased ``Body``.

    The frame is modified in place and nothing is returned. For every value
    of ``df['Body']`` the following steps are applied, in order:

    1. Drop everything between ``<code>`` and ``</code>``.
    2. Strip HTML tags.
    3. Turn ``/`` into a space so "read/write" becomes two words.
    4. Remove every character that is neither a word character nor whitespace.
    5. Lower-case the text.
    6. Split on whitespace, drop stop words, and re-join with single spaces
       (this also collapses newlines and runs of spaces).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``Body``.
    stop_words : iterable of str, optional
        Lower-case words to remove. Defaults to NLTK's English stop-word list.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    # Raw-string patterns, compiled once instead of once per row.
    code_contents = re.compile(r'(?s)(?<=<code>)(.*?)(?=</code>)')
    html_tag = re.compile(r'<.*?>')
    non_word = re.compile(r'[^\w\s]')

    def _clean(text):
        text = code_contents.sub('', text)
        text = html_tag.sub('', text)
        # Must run before punctuation stripping: once '/' has been deleted as
        # punctuation there is nothing left to replace and the words it
        # separated are glued together.
        text = text.replace('/', ' ')
        text = non_word.sub('', text)
        text = text.lower()
        # split() breaks on any whitespace, so no separate newline or
        # repeated-space normalisation is needed.
        return ' '.join(word for word in text.split() if word not in stop_words)

    df['Body_processed'] = df['Body'].map(_clean)
# clean_df works in place: it adds the 'Body_processed' column to `data`.
clean_df(data)
data.head()

In [6]:
# Flatten the per-post tag lists into one flat list of tag names.
# Series.sum() on lists concatenates them pairwise, copying the growing result
# at every step (quadratic time), and yields 0 rather than [] for an empty
# Series; a nested comprehension is linear and always returns a list.
tags = [tag for post_tags in data["Tags"] for tag in post_tags]

In [7]:
# Occurrences of each distinct tag, most frequent first.
# (The trailing numbers are the values the original author noted down.)
tags_vc = pd.Series(tags).value_counts()  # 10703

# How many tags appear more than 1, 2 and 3 times. These results are computed
# but not displayed, because only the last expression of a cell is shown.
(tags_vc > 1).sum()  # 5774
(tags_vc > 2).sum()  # 4155
(tags_vc > 3).sum()  # 3283

# Tags used more than 1000 times; these drive every chart below.
toptags = tags_vc[tags_vc > 1000]  # 23
toptags

In [8]:
# Title is shared by the figure and by the name of the exported HTML file.
top_tags_title = "Count of the Most Common Tags"

# One bar per tag in `toptags`, drawn in a single colour from the Safe palette.
fig = px.bar(
    toptags,
    labels={"index": "Tag", "value": "Count"},
    title=top_tags_title,
    color_discrete_sequence=[px.colors.qualitative.Safe[4]],
)
# Export a standalone HTML copy of the figure, then show it as the cell output.
fig.write_html(f"{top_tags_title}.html", include_plotlyjs='cdn')
fig

In [9]:
# Two-column working frame: the month each post was created in and its tags.
datetag = data[["Tags"]].copy()
# Insert at position 0 so the column order is Date, Tags.
datetag.insert(0, "Date", data["CreationDate"].dt.to_period('M'))
datetag

In [10]:
# Add one 0/1 column per top tag: 1 when the post's tag list contains that tag.
for tag_name in toptags.index:
    has_tag = datetag["Tags"].map(lambda post_tags: tag_name in post_tags)
    datetag[tag_name] = has_tag.astype(int)

datetag.head()

In [11]:
# Monthly totals of the per-tag indicator columns.
# Only those columns are aggregated: summing the whole frame would also "sum"
# the list-valued Tags column, i.e. concatenate every post's tag list per
# month, which is slow and leaves a large, unused column in the result.
dtgb = datetag.groupby("Date")[list(toptags.index)].sum().reset_index()
dtgb

In [12]:
# String copy of the monthly Date column, used as the x axis of the bar chart.
dtgb = dtgb.assign(sDate=dtgb["Date"].astype(str))

In [13]:
# Transposed view of the monthly totals: one row per column of dtgb.
tdtgb = dtgb.transpose()
tdtgb

In [14]:
# Title is shared by the figure and by the name of the exported HTML file.
monthly_title = "Bar Graph Visualizing the Distribution of the Most Common Tags per Month"

# One bar per month, with one series per top tag taken from dtgb's columns.
fig = px.bar(
    dtgb,
    x="sDate",
    y=toptags.index.tolist(),
    labels={"sDate": "Date", "value": "Count"},
    title=monthly_title,
)
# Export a standalone HTML copy of the figure, then show it as the cell output.
fig.write_html(f"{monthly_title}.html", include_plotlyjs='cdn')
fig