In [1]:
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from pandas.plotting import scatter_matrix
import re
from dataexplorer import make_post_dataframe, tag_cleaner, get_schema, read_xml, infer_schema
from datetime import date, datetime
%matplotlib inline




In [2]:
# Location of the posts dump, plus the column-name -> SQL-type schema that
# get_schema reads from the query-results CSV.
sample_file = 'startups.stackexchange.com/Posts.xml'
schema = get_schema("QueryResults.csv")
print(schema)



column_name
Id                            int
PostTypeId                tinyint
AcceptedAnswerId              int
ParentId                      int
CreationDate             datetime
DeletionDate             datetime
Score                         int
ViewCount                     int
Body                     nvarchar
OwnerUserId                   int
OwnerDisplayName         nvarchar
LastEditorUserId              int
LastEditorDisplayName    nvarchar
LastEditDate             datetime
LastActivityDate         datetime
Title                    nvarchar
Tags                     nvarchar
AnswerCount                   int
CommentCount                  int
FavoriteCount                 int
ClosedDate               datetime
CommunityOwnedDate       datetime
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [3]:
# Read the posts XML and hand it to make_post_dataframe together with the
# schema; 'Tags' is passed as the column to clean.
data = read_xml(sample_file)
df = make_post_dataframe(data, schema, col_to_clean='Tags')


In [81]:
# Run the frame through infer_schema (dataexplorer helper) and preview the
# first two rows, transposed so every column is visible.
# NOTE(review): this rebinds `df`, so re-running the cell feeds the already
# processed frame back in -- confirm infer_schema is safe to apply twice.
df = infer_schema(df, schema)
df.head(2).transpose()

Unnamed: 0,0,1
Id,1,1
PostTypeId,1,1
AcceptedAnswerId,300,300
ParentId,,
CreationDate,2014-07-30 00:00:00,2014-07-30 00:00:00
DeletionDate,NaT,NaT
Score,21,21
ViewCount,1458,1458
Body,"<p>After registering my small LLC, I signed up...","<p>After registering my small LLC, I signed up..."
OwnerUserId,9,9


In [5]:
# Let's clean up CreationDate: keep only the calendar date of each timestamp.
df['CreationDate'] = pd.DatetimeIndex(df['CreationDate']).date


In [50]:
# Number of distinct post Ids whose AcceptedAnswerId is null.
# This counts every post type with a null AcceptedAnswerId, not only questions.
len(df.loc[df['AcceptedAnswerId'].isnull(), 'Id'].unique())


7256

In [51]:
# Number of distinct post Ids (a post may occupy several rows of df).
len(pd.unique(df['Id']))

8518

In [100]:
# Rows with PostTypeId == 2, restricted to the columns listed below.
answers = df.loc[
    df['PostTypeId'] == 2,
    ['Id', 'OwnerUserId', 'Score', 'ParentId', 'CreationDate'],
]
answers.head()

Unnamed: 0,Id,OwnerUserId,Score,ParentId,CreationDate
8,5.0,20.0,1.0,2.0,2014-07-30
13,8.0,9.0,5.0,2.0,2014-07-30
14,9.0,27.0,10.0,1.0,2014-07-30
15,10.0,12.0,8.0,2.0,2014-07-30
16,11.0,28.0,1.0,2.0,2014-07-30


In [113]:
# All Tags values stored for the post with Id 1.
df.loc[df['Id'] == 1, 'Tags'].values

array(['llc', 'new-hampshire', 'united-states'], dtype=object)

In [165]:
def get_user_answer_df(df, accepted_answers):
    """Build a long table with one row per (answer, tag of its parent question).

    Parameters
    ----------
    df : pandas.DataFrame
        Posts table containing at least the columns ``Id``, ``PostTypeId``,
        ``ParentId``, ``OwnerUserId``, ``Score``, ``CreationDate`` and
        ``Tags``. A post may occupy several rows (one per ``Tags`` value);
        scalar fields are taken from the first row of each ``Id``.
    accepted_answers : container
        Any object supporting ``in``; holds the ids of accepted answers.

    Returns
    -------
    pandas.DataFrame
        Columns ``UserId``, ``PostId``, ``Score``, ``Date_posted``,
        ``AcceptedAnswer?``, ``ParentId``, ``ParentScore`` and ``Tag``.
        Every answer (``PostTypeId == 2``) contributes one row per row of its
        parent question. An answer whose parent is not present in ``df``
        contributes a single row with ``ParentScore`` and ``Tag`` set to NaN
        (previously this case raised ``IndexError``).
    """
    new_columns = ['UserId', 'PostId', 'Score', 'Date_posted', 'AcceptedAnswer?', 'ParentId', 'ParentScore', 'Tag']

    # Build the per-post lookups once instead of re-scanning the whole frame
    # with a boolean mask for every single answer.
    first_rows = df.drop_duplicates(subset='Id')
    score_by_post = dict(zip(first_rows['Id'].values, first_rows['Score'].values))
    tags_by_post = {}
    for post_id, tag in zip(df['Id'].values, df['Tags'].values):
        tags_by_post.setdefault(post_id, []).append(tag)

    answer_ids = df.loc[df['PostTypeId'] == 2, 'Id'].unique()
    answers = first_rows[first_rows['Id'].isin(answer_ids)]

    all_data = []
    answer_fields = zip(
        answers['OwnerUserId'].values,
        answers['Id'].values,
        answers['Score'].values,
        answers['CreationDate'].values,
        answers['ParentId'].values,
    )
    for user_id, answer_id, score, date_posted, parent_id in answer_fields:
        row_data = {
            'UserId': user_id,
            'PostId': answer_id,
            'Score': score,
            'Date_posted': date_posted,
            'AcceptedAnswer?': answer_id in accepted_answers,
            'ParentId': parent_id,
        }

        parent_tags = tags_by_post.get(parent_id)
        if parent_tags is None:
            # Parent question is not in df: keep the answer, mark the
            # question-derived fields as missing.
            all_data.append({**row_data, 'ParentScore': np.nan, 'Tag': np.nan})
            continue

        parent_score = score_by_post[parent_id]
        for tag in parent_tags:
            all_data.append({**row_data, 'ParentScore': parent_score, 'Tag': tag})

    return pd.DataFrame(data=all_data, columns=new_columns)
    

In [166]:
# Derive the accepted-answer ids in this cell so it does not rely on a
# variable that is only assigned further down the notebook (which breaks a
# fresh Restart & Run All with a NameError).
accepted_answer_ids = set(df['AcceptedAnswerId'].dropna().unique())
test = get_user_answer_df(df, accepted_answer_ids)

In [167]:
# Keep only the rows that carry a tag.
test[test['Tag'].notnull()]

Unnamed: 0,UserId,PostId,Score,Date_posted,AcceptedAnswer?,ParentId,ParentScore,Tag
0,20.0,5.0,1.0,2014-07-30,False,2.0,42.0,tech-company
1,20.0,5.0,1.0,2014-07-30,False,2.0,42.0,mobile-apps
2,20.0,5.0,1.0,2014-07-30,False,2.0,42.0,equity
3,9.0,8.0,5.0,2014-07-30,False,2.0,42.0,tech-company
4,9.0,8.0,5.0,2014-07-30,False,2.0,42.0,mobile-apps
5,9.0,8.0,5.0,2014-07-30,False,2.0,42.0,equity
6,27.0,9.0,10.0,2014-07-30,False,1.0,21.0,llc
7,27.0,9.0,10.0,2014-07-30,False,1.0,21.0,new-hampshire
8,27.0,9.0,10.0,2014-07-30,False,1.0,21.0,united-states
9,12.0,10.0,8.0,2014-07-30,False,2.0,42.0,tech-company


In [92]:
#getting the posts labeled as AcceptedAnswers
answer_ids = pd.DataFrame(df['AcceptedAnswerId'][df['AcceptedAnswerId'].isnull()==False].unique(), 
                          columns=['Accepted'])
print(len(answer_ids))
answer_ids.head()

accepted_answer_ids = set(df['AcceptedAnswerId'][df['AcceptedAnswerId'].isnull()==False].unique())


1262


In [132]:
# Right join on Id == Accepted: every accepted id is kept and paired with
# the post rows that share that Id.
df_accept_answers = df.merge(
    answer_ids,
    how='right',
    left_on='Id',
    right_on='Accepted',
)
df_accept_answers['ParentId']


0          19.0
1          24.0
2           7.0
3           2.0
4          17.0
5          14.0
6          45.0
7          52.0
8          66.0
9          64.0
10         69.0
11         72.0
12         80.0
13         78.0
14         89.0
15        118.0
16        124.0
17        104.0
18         70.0
19         87.0
20        121.0
21        134.0
22        140.0
23        145.0
24        141.0
25        157.0
26         13.0
27         91.0
28        165.0
29        169.0
         ...   
1232    13415.0
1233    13417.0
1234    13428.0
1235    13450.0
1236    13453.0
1237    13454.0
1238    13419.0
1239    13469.0
1240    13475.0
1241    13527.0
1242    13526.0
1243    13545.0
1244    13547.0
1245    13540.0
1246    13558.0
1247    13567.0
1248    13570.0
1249    13608.0
1250    11909.0
1251    13249.0
1252    13634.0
1253    13635.0
1254    13629.0
1255    13678.0
1256    13686.0
1257    13685.0
1258    13692.0
1259    13684.0
1260    13645.0
1261    13712.0
Name: ParentId, Length: 

In [49]:
# #dataframe of all the answers
# answers = df[df['PostTypeId']==2]
# answers[['CreationDate','ParentId','OwnerUserId']].head()

In [None]:
# answer_ids = pd.DataFrame(df['AcceptedAnswerId'].unique())
# answer_ids
# df['Id']
# df1=pd.merge(df, answer_ids, left_on='Id', right_on=0, how = 'inner')
# # df['Id'].value_counts()["300"]
# df1['Id'].value_counts()[300]

In [46]:
# Preview (transposed) the first rows whose AcceptedAnswerId is null.
df.loc[df['AcceptedAnswerId'].isnull()].head().transpose()

Unnamed: 0,6,7,8,9,13
Id,3,4,5,6,8
PostTypeId,5,4,2,1,2
AcceptedAnswerId,,,,,
ParentId,,,2,,2
CreationDate,2014-07-30 00:00:00,2014-07-30 00:00:00,2014-07-30 00:00:00,2014-07-30 00:00:00,2014-07-30 00:00:00
DeletionDate,NaT,NaT,NaT,NaT,NaT
Score,0,0,1,5,5
ViewCount,,,,134,
Body,<p>Mobile applications can be classed as </p>\...,For questions specifically pertaining to runni...,<p>It depends on what kind of a Mobile Applica...,"<p>In short, for my very first business plan w...","<p>If you only need a designer for one app, it..."
OwnerUserId,3045,3045,20,16,9


In [8]:
# answer_ids = pd.DataFrame(df['AcceptedAnswerId'].unique())
# answer_ids
# df['Id']
# df1=pd.merge(df, answer_ids, left_on='Id', right_on=0, how = 'inner')
# # df['Id'].value_counts()["300"]
# df1['Id'].value_counts()[300]

In [9]:
# values =df['Id'].value_counts()
# # values
# values["4024"]

In [10]:
# df.describe()
# number_columns=['ViewCount', 'AnswerCount', 'FavoriteCount', 'Score']
# date_columns = ['CreationDate', 'LastActivityDate']

In [11]:
# df.groupby(['Tags'])[number_columns].sum()

In [12]:
# counts =df.groupby(['Tags'])['Id'].count()
# counts.sort_values(ascending=False)

In [13]:
# df_tags.Count.describe()

In [14]:
# schema = pd.read_csv("QueryResults.csv")
# schema['column_name'][0] = 'Id'
# schema[['column_name', 'data_type']]
# schema = pd.Series(data = schema['data_type'].values, index=schema['column_name'])
# b = schema.index
# a = schema.index.drop('Tags')
# b

In [15]:
# schema.values

In [16]:
# post_files = 'startups.stackexchange.com/Posts.xml'
# posts_parsed = ET.parse(post_files)
# nodes = posts_parsed.getroot()
# labels = schema.values
# row_values = []

# df_posts = pd.DataFrame(columns=row_values)

# for node in nodes:
#     raw_tags = node.attrib.get('Tags')
#     if raw_tags:
#         cleaned_tags = re.sub(r'<','', raw_tags)
#         tags = cleaned_tags.split('>')[:-1]
#     else:
#         tags = []
#     values =[]
#     for column in labels:
#         if column == 'Tags':
#             pass
#         column_value = node.attrib.get(column)
#         values.append(column_value)
            
            
                
            
            
#         column_value = node.attrib.get(column)
#         values.append(column_value)
# #     print("\n Appending: \n {} \n".format(values))
#     df_posts = df_posts.append(pd.Series(values, index = labels),ignore_index=True)
    
# df_posts.head()


    

In [17]:
# df_posts
# df_posts.describe().T

In [18]:
# df_posts['Tags'].fillna(value=np.nan, inplace=True)

In [19]:
# untagged_posts = df_posts['Tags'][df_posts['Tags'].isnull()]
# len(untagged_posts)/len(df_posts['Tags'])

In [20]:
# df_posts['ViewCount'] = df_posts['ViewCount'].astype(float)
# df_posts['AnswerCount'] = df_posts['AnswerCount'].astype(float)
# df_posts['FavoriteCount'] = df_posts['FavoriteCount'].astype(float)
# df_posts['Score'] = df_posts['Score'].astype(float)
# df_posts['CreationDate'] = pd.to_datetime(df_posts['CreationDate'])
# df_posts['LastActivityDate'] = pd.to_datetime(df_posts['LastActivityDate'])



In [21]:
# initial_columns = ['ViewCount', 'AnswerCount', 'FavoriteCount', 'Score', 'CreationDate', 'LastActivityDate', 'Tags']

In [22]:
# EDA_shit = df_posts[initial_columns]

In [23]:
# EDA_shit.describe().T

In [24]:
# EDA_untagged = EDA_shit[EDA_shit['Tags'].isnull()]
# EDA_tagged = EDA_shit[EDA_shit['Tags'].isnull()==False]

In [25]:
# EDA_untagged.drop('Tags',axis=1).describe()

In [26]:
# EDA_tagged.describe()

In [27]:
# scatter_matrix(EDA_tagged, alpha = 0.6, figsize = (15, 15), diagonal ='kde')
# None

In [28]:
# EDA_tagged.head()