In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from collections import defaultdict
from tqdm import tqdm
from lifelines import KaplanMeierFitter
from matplotlib import cm

In [None]:
"""Initialization the root path"""

# Root directory prefix (note the trailing slash) that the input CSV file
# names are appended to when they are read.
path = './data/'

In [None]:
"""Reading the dataset snapshot"""

# Build the snapshot's file name from the shared root prefix and load it.
filename = path + "Posts_June2019.csv"
fulldata = pd.read_csv(filename)
# Trailing expression: shown as the cell output, i.e. (rows, columns).
fulldata.shape

In [None]:
"""Reading the cleaned data we used in our analysis"""

# `filename` is re-bound here to point at the cleaned data file before loading.
filename = path + "data.csv"
data = pd.read_csv(filename)
# Trailing expression: shown as the cell output, i.e. (rows, columns).
data.shape

In [None]:
"""Filtering out the closed questions' post ids"""

# Rows whose Class label is exactly 'closed'; their PostId values are kept
# as a plain Python list.
is_closed = data['Class'] == 'closed'
ques_index = data.loc[is_closed, 'PostId'].to_list()
# Trailing expression: number of selected post ids.
len(ques_index)

In [None]:
"""Filtering out all the answers for the closed questions filtered above"""

# Keep only rows with PostTypeId == 2 whose ParentId is one of the post ids
# collected in `ques_index`.
has_answer_type = fulldata['PostTypeId'] == 2
parent_is_selected = fulldata['ParentId'].isin(ques_index)
answer_data = fulldata[has_answer_type & parent_is_selected]
# Trailing expression: shown as the cell output, i.e. (rows, columns).
answer_data.shape

In [None]:
"""Finding the first answer's creation date of the closed questions"""

# Earliest answer CreationDate per ParentId, computed in a single grouped pass
# instead of a Python loop that materialises a Series for every row via
# `.iloc` (very slow on large frames). The stored column values are compared
# as they are -- nothing is parsed to datetimes here -- just like the per-row
# `min()` this replaces.
first_answer_dates = answer_data.groupby('ParentId')['CreationDate'].min()

# Keep the defaultdict type: looking up a post id that has no answer still
# yields '' so code that maps ids through `clques_date` sees the same
# "no answer" sentinel as before.
clques_date = defaultdict(lambda: '', first_answer_dates.to_dict())

In [None]:
"""Adding column for first reply's creation date"""

# Work on the 'closed' rows only. reset_index() is called without drop=True,
# so the previous index is retained as an extra column.
closed = data.loc[data['Class'] == 'closed'].reset_index()

# Look up each post's first-reply date; the '' sentinel produced for posts
# without an answer is turned into a proper missing value.
closed['FirstReplyDate'] = closed['PostId'].map(clques_date).replace('', np.nan)

# Posts without a creation date cannot be used further on.
closed = closed.dropna(subset=['CreationDate'])

# Exclude rows whose first reply sorts before the post's own creation date.
# Rows with a missing FirstReplyDate compare as False and are therefore kept.
# NOTE(review): the two columns are compared as stored (no datetime parsing
# in this cell) -- confirm both use the same sortable date format.
reply_precedes_post = closed['FirstReplyDate'] < closed['CreationDate']
c1 = closed[~reply_precedes_post]

In [None]:
"""Removing some inconsistencies and questions which were reopened"""

# Post ids listed in this file are excluded below.
reopened = pd.read_csv(f"{path}reopened_posts.csv")

# A row is kept when it has no first reply at all, or when its first reply
# sorts before the closing date ...
never_answered = c1['FirstReplyDate'].isna()
answered_before_close = c1['FirstReplyDate'] < c1['ClosedDate']
# ... and when its PostId does not appear among the reopened posts.
was_reopened = c1['PostId'].isin(reopened['PostId'])

keep_rows = (never_answered | answered_before_close) & ~was_reopened
# Finally drop whatever is left without a closing date.
c2 = c1[keep_rows].dropna(subset=['ClosedDate'])
# Trailing expression: shown as the cell output, i.e. (rows, columns).
c2.shape

In [None]:
"""Making the final dataframe of events and timestamps"""

# Event indicator E: 1 when a first reply date is present, 0 when it is
# missing. `notna()` tests missingness by value; an identity check such as
# `x is np.nan` only matches the single `np.nan` object and would mark a row
# as an event if its missing value were stored as None or as any other
# float NaN instance.
replies_te = pd.DataFrame()
replies_te['E'] = c2['FirstReplyDate'].notna().astype('int64')

In [None]:
"""Adding a column for the time duration between getting first reply or getting closed"""

# Duration T in hours, measured from the post's CreationDate to its first
# reply or, when no reply date is present, to its ClosedDate.
reply_durations = []
one_hour = pd.Timedelta('1 hour')
# `total` lets tqdm show real progress; a bare itertuples() has no length.
for row in tqdm(c2.itertuples(), total=len(c2)):
    # pd.isna recognises any missing value (None, any float NaN), whereas an
    # `is np.nan` identity test only matches the np.nan singleton and would
    # otherwise send a missing date into pd.to_datetime, producing NaT.
    if pd.isna(row.FirstReplyDate):
        end_date = row.ClosedDate
    else:
        end_date = row.FirstReplyDate
    elapsed = pd.to_datetime(end_date) - pd.to_datetime(row.CreationDate)
    reply_durations.append(elapsed / one_hour)
replies_te['T'] = reply_durations

In [None]:
"""Adding a columns containing the tags and close reason for each post"""

def _parse_tag_set(raw_tags):
    """Return the set of tag names contained in a string such as '<a><b>'.

    Angle brackets are treated as separators, so '<python><pandas>' gives
    {'python', 'pandas'}. Anything that is not a string (for example a
    missing value read from the CSV) gives an empty set instead of raising.
    """
    if not isinstance(raw_tags, str):
        return set()
    return set(raw_tags.replace('<', ' ').replace('>', ' ').split())


replies_te['Reason'] = c2['Comment']
replies_te['Tags'] = c2['Tags']

# Iterate over the column directly rather than building a row Series with
# `.iloc` for every post; the list is assigned positionally as before.
all_tags = [_parse_tag_set(raw_tags) for raw_tags in tqdm(replies_te['Tags'])]
replies_te['TagSet'] = all_tags

In [None]:
"""Initializing the tags belonging to different categories"""

# Tag names that place a question in each category. A question can match
# several categories at once; membership is tested against these lists.
databases = [
    'mysql', 'postgresql', 'sqlite', 'mongodb', 'sql-server',
    'redis', 'mariadb', 'firebase', 'elasticsearch', 'oracle',
]
cloud = [
    'amazon-web-services', 'google-cloud-platform', 'azure', 'heroku',
    'digital-ocean', 'ibm-cloud', 'ibm-watson', 'oracle-cloud-infrastructure',
]
webframeworks = [
    'reactjs', 'jquery', 'express', 'angularjs', 'vue.js',
    'asp.net-core', 'flask', 'asp.net', 'django', 'spring',
]
programming = [
    'javascript', 'html', 'css', 'python', 'sql', 'java',
    'node.js', 'typescript', 'c#', 'bash', 'shell', 'c++',
]
others = [
    '.net', 'numpy', '.net-core', 'pandas', 'tensorflow', 'react-native',
    'flutter', 'keras', 'qt', 'torch', 'pytorch',
]

In [None]:
"""Categorizing function to identify different categories to which the question belongs"""

def check_tags(tags):
    """Flag which tag categories a question's tag set touches.

    Parameters
    ----------
    tags : set
        Tag names of one question.

    Returns
    -------
    list of int
        Five 0/1 flags in the fixed order databases, cloud, webframeworks,
        programming, others; a flag is 1 when `tags` shares at least one
        entry with that category's module-level tag list.
    """
    category_groups = (databases, cloud, webframeworks, programming, others)
    flags = []
    for group in category_groups:
        # "not disjoint" is the same as "the intersection is non-empty".
        flags.append(0 if tags.isdisjoint(group) else 1)
    return flags

In [None]:
"""Assigning the identified categories to the questions"""

# Output columns, in the same order as the flags returned by check_tags.
category_columns = ['databases', 'cloud', 'webframeworks', 'programming', 'others']

# Iterate over the TagSet column directly instead of building a row Series
# with `.iloc` for every post.
category_flags = [check_tags(tag_set) for tag_set in tqdm(replies_te['TagSet'])]

# reshape(-1, 5) keeps the array two-dimensional even when there are no rows,
# so the column slicing below does not raise IndexError on an empty frame.
all_categ = np.array(category_flags, dtype=int).reshape(-1, len(category_columns))

for position, column in enumerate(category_columns):
    replies_te[column] = all_categ[:, position]

In [None]:
"""Fitting Kaplan Meier on a given category"""

def plot_kmf(category, ax, color, title):
    """Fit the shared Kaplan-Meier fitter on one subgroup and plot log(S(t)).

    Relies on two module-level names: `replies_te` (frame with the columns
    'T', 'E', 'Reason' and one 0/1 indicator column per category) and `kmf`
    (the fitter instance, which is re-fitted on every call).

    Parameters
    ----------
    category : str or other
        A string is taken as the name of an indicator column of
        `replies_te`; rows where that column equals 1 form the subgroup.
        Any non-string value is compared against the 'Reason' column.
    ax : object with a `plot` method
        Axes the curve is drawn on.
    color
        Forwarded to `ax.plot` as `color`.
    title
        Forwarded to `ax.plot` as `label` (the legend entry).

    Returns
    -------
    None
        The curve is drawn on `ax`; the category, the median survival time
        and the confidence-interval row(s) at that time are printed.
    """
    # isinstance also accepts str subclasses (e.g. numpy.str_), which an
    # exact `type(...) ==` comparison would wrongly route to the 'Reason'
    # branch.
    if isinstance(category, str):
        in_group = replies_te[category] == 1
    else:
        in_group = replies_te['Reason'] == category
    temp = replies_te.loc[in_group, ['T', 'E']]

    kmf.fit(temp['T'], temp['E'], label=category)
    # The curve is shown on a log scale: log of the fitted survival function.
    ax.plot(np.log(kmf.survival_function_), label=title, color=color, linewidth=2.5)

    conf_interval = kmf.confidence_interval_survival_function_
    median_st = kmf.median_survival_time_
    print(category)
    print("Median: ", median_st)
    # Only rows whose index equals the median time are shown; this prints an
    # empty frame when the median is not one of the index values.
    print(conf_interval[conf_interval.index == median_st])
    print()

In [None]:
"""Plotting the graph for all categories"""

# A single fitter instance; plot_kmf re-fits it for every curve.
kmf = KaplanMeierFitter()
# Note the binding order: the first axes returned is `axr`, the second `axl`.
fig, (axr, axl) = plt.subplots(ncols=2, figsize=(12, 4))
colors = list(reversed([cm.tab10(i) for i in range(10)]))
titles = ['Databases','Cloud','Web Frameworks', 'Programming languages','Others']

# Technology categories are drawn on `axl`, one curve per indicator column,
# using colors[0..4] and the matching display titles.
technology_columns = ['databases', 'cloud', 'webframeworks', 'programming', 'others']
for slot, (column, curve_title) in enumerate(zip(technology_columns, titles)):
    plot_kmf(column, axl, colors[slot], curve_title)

# Closing-reason codes are drawn on `axr` with colors[5..8]; the code itself
# doubles as the legend label.
for slot, reason_code in enumerate((102, 103, 104, 105), start=5):
    plot_kmf(reason_code, axr, colors[slot], reason_code)

# Both panels get identical axis and legend styling. `axr` is handled last so
# that `leg` ends up bound to its legend.
for axis, legend_title in ((axl, 'Technologies'), (axr, 'Closing Reason')):
    axis.set_xlabel("First answer time (hours)", fontsize=12)
    axis.tick_params('both', width=1, which='major', labelsize=10)
    axis.set_ylabel("Survival Function, log(S(t))", fontsize=12)
    leg = axis.legend(
        bbox_to_anchor=(1, 1),
        loc='upper right',
        ncol=1,
        prop={'size': 12},
        title=legend_title,
    )
    # Private matplotlib attribute, used here to left-align the legend box.
    leg._legend_box.align = "left"
    leg.get_title().set_fontsize(12)

In [None]:
"""Saving the plotted survival plot"""

# Write the figure held in `fig` to a PDF file in the current working
# directory.
fig.savefig(
    "./survival_analysis.pdf",
    bbox_inches='tight',
)