In [None]:
# https://www.kaggle.com/c/predict-closed-questions-on-stack-overflow/forums/t/3083/sharing-my-solution-ranked-10


import os

def split(filehandler, delimiter=',', row_limit=1500000,
    output_name_template='output_Final%s.csv', output_path='.', keep_headers=True):
    """
    Split a CSV stream into numbered output files.

    Rows are read from `filehandler` and written to consecutive files named
    `output_name_template % 1`, `output_name_template % 2`, ... inside
    `output_path`. The first output file is always created, even when the
    input is empty.

    Arguments:
        `filehandler`: An open, iterable text source (e.g. a file object) holding the CSV data.
        `delimiter`: Field delimiter used for both reading and writing.
        `row_limit`: Maximum number of data rows per output file. 1,500,000 by default.
        `output_name_template`: A %s-style template for the numbered output files.
        `output_path`: Where to stick the output files.
        `keep_headers`: When true, the first input row is treated as a header
            and repeated at the top of every output file.
    Returns:
        None. The function's only effect is the files it writes.
    Example usage:

        >> from toolbox import csv_splitter;
        >> csv_splitter.split(open('/home/ben/input.csv', 'r'));

    """
    import csv

    reader = csv.reader(filehandler, delimiter=delimiter)

    def open_piece(piece_number):
        # Open one numbered output file and wrap it in a csv writer.
        piece_path = os.path.join(output_path, output_name_template % piece_number)
        piece_file = open(piece_path, 'w')
        return piece_file, csv.writer(piece_file, delimiter=delimiter)

    current_piece = 1
    current_out_file, current_out_writer = open_piece(current_piece)
    try:
        headers = None
        if keep_headers:
            # next(reader, None) works on Python 2 and 3 and tolerates an
            # empty input, where reader.next() would raise.
            headers = next(reader, None)
            if headers is not None:
                current_out_writer.writerow(headers)
        for i, row in enumerate(reader):
            # `i` counts data rows from zero, so piece k holds rows
            # [(k - 1) * row_limit, k * row_limit).
            if i >= row_limit * current_piece:
                # Finish the previous piece before starting the next one so
                # its contents are flushed and the handle is released.
                current_out_file.close()
                current_piece += 1
                current_out_file, current_out_writer = open_piece(current_piece)
                if headers is not None:
                    current_out_writer.writerow(headers)
            current_out_writer.writerow(row)
    finally:
        current_out_file.close()

In [None]:
# Close the input file once the split finishes instead of leaking the handle.
with open('./train.csv', 'r') as trainFile:
    split(trainFile)

In [None]:
import csv
import numpy as np

def parseCSVData(file,data,maxCount):
    """
    Read question rows from a CSV file and append them to `data` as dicts.

    The first row of the file is treated as a header and skipped. Every other
    row becomes a dict keyed by the fixed column names below; PostId,
    ReputationAtPostCreation and OwnerUndeletedAnswerCountAtPostTime are
    converted to int, all other values stay strings.

    Arguments:
        file: Path of the CSV file to read.
        data: List that receives one dict per parsed row (mutated in place).
        maxCount: Maximum number of CSV rows to read, header row included,
            so `maxCount` rows yield at most `maxCount - 1` dicts.
    Returns:
        None.
    Raises:
        ValueError: if one of the integer columns does not hold an integer.
        IndexError: if a row has more columns than there are field names.
    """
    import sys

    fieldnames = ("PostId","PostCreationDate","OwnerUserId","OwnerCreationDate","ReputationAtPostCreation","OwnerUndeletedAnswerCountAtPostTime","Title","BodyMarkdown","Tag1","Tag2","Tag3","Tag4","Tag5","PostClosedDate","OpenStatus")
    # Column positions holding integers: PostId, ReputationAtPostCreation,
    # OwnerUndeletedAnswerCountAtPostTime.
    intColumns = (0, 4, 5)

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(file, 'rb')
    else:
        csvfile = open(file, 'r', newline='')

    with csvfile:
        for rowCount, row in enumerate(csv.reader(csvfile, delimiter=',')):
            if rowCount == maxCount:
                break
            if rowCount == 0:
                # Header row: counted towards maxCount but not stored.
                continue
            element = {}
            for index, value in enumerate(row):
                if index in intColumns:
                    value = int(value)
                element[fieldnames[index]] = value
            data.append(element)
    return


In [None]:
leData1 = []
# parseCSVData counts the header row towards its limit, so 100001 rows
# yield at most 100000 questions. Named maxRows so the builtin max() is
# not shadowed for the rest of the notebook.
maxRows = 100001
parseCSVData('output_Final1.csv',leData1,maxRows)

In [None]:
len(leData1)

In [None]:
from collections import defaultdict

def dataCount(ownerCount,tagCount,openStatusCount,dateClosed,data):
    """
    Tally owners, tags and open statuses over a list of question dicts.

    Arguments:
        ownerCount: Counter mapping, incremented per question under its 'OwnerUserId'.
        tagCount: Counter mapping, incremented once for every non-empty tag value.
        openStatusCount: Counter mapping, incremented per question under its 'OpenStatus'.
        dateClosed: Starting value of a counter bumped for every question with
            a non-empty 'PostClosedDate'. NOTE: the updated value is not
            returned, so when a plain int is passed (as this notebook does)
            the caller's variable is left unchanged.
        data: Iterable of question dicts.
    Returns:
        A 5-tuple with, for Tag1..Tag5 in order, how many questions left that
        tag slot empty.

    The three counter mappings must tolerate `+= 1` on a missing key
    (e.g. `defaultdict(int)`).
    """
    tagColumns = ('Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5')
    emptySlots = [0] * len(tagColumns)

    for question in data:
        ownerCount[question['OwnerUserId']] += 1
        openStatusCount[question['OpenStatus']] += 1

        for slot, column in enumerate(tagColumns):
            tag = question[column]
            if tag == '':
                emptySlots[slot] += 1
            else:
                tagCount[tag] += 1

        if question['PostClosedDate'] != '':
            dateClosed += 1

    return tuple(emptySlots)

In [None]:
ownerCount = defaultdict(int)
tagCount = defaultdict(int)
openStatusCount = defaultdict(int)

tagNotUsed = dataCount(ownerCount,tagCount,openStatusCount,0,leData1)

# dataCount cannot hand its closed-date tally back through an int argument,
# so count the questions that carry a PostClosedDate here.
dateClosed = sum(1 for question in leData1 if question['PostClosedDate'] != '')

In [None]:
# Empty tag slots summed over all five tag columns.
totalTagNotUsed = sum(tagNotUsed)

In [None]:
# (empty tag slots, filled tag slots): every question has five tag slots.
totalTagNotUsed, len(leData1)*5 - totalTagNotUsed

In [None]:
openStatusCount,dateClosed

In [None]:
# Every status label other than 'open'.
closedQuestions = [status for status in openStatusCount if status != 'open']
# Despite its name, this adds up the questions whose status is NOT 'open'.
totalNotClosed = sum(openStatusCount[status] for status in closedQuestions)
totalNotClosed,dateClosed

In [None]:
totalNotClosed = 0
# Full question dicts whose status is anything other than 'open'.
closedQuestions = [question for question in leData1 if question['OpenStatus'] != 'open']

In [None]:
# Over the closed questions: how many questions share each reputation value,
# plus the highest reputation seen and the question it belongs to.
maxInt, maxDict = 0, {}
maxDictByUser = defaultdict(int)
for question in closedQuestions:
    reputation = int(question['ReputationAtPostCreation'])
    maxDictByUser[question['ReputationAtPostCreation']] += 1
    if reputation > maxInt:
        maxInt, maxDict = reputation, question

maxInt

In [None]:
# Questions whose author had a reputation below 200 at post time.
countOfReputationsLessThan200 = sum(
    count for reputation, count in maxDictByUser.items() if reputation < 200
)

In [None]:
countOfReputationsLessThan200

### For most of the closed questions the user's reputation is below 200; the maximum is 123251.

In [None]:
# Same statistics as for the closed questions, now over every loaded question.
maxInt, maxDict = 0, {}
maxDictByUser = defaultdict(int)
for question in leData1:
    reputation = int(question['ReputationAtPostCreation'])
    maxDictByUser[question['ReputationAtPostCreation']] += 1
    if reputation > maxInt:
        maxInt, maxDict = reputation, question

maxInt

In [None]:
# Questions whose author had a reputation below 200 at post time.
countOfReputationsLessThan200 = sum(
    count for reputation, count in maxDictByUser.items() if reputation < 200
)
countOfReputationsLessThan200

### In the overall dataset, too, most users have a reputation below 200 (922954/1400000).

## Question: can a question's tags influence whether it gets closed? Do some users tend to post more bad questions than others in specific topics?

In [None]:
import operator

# (tag, count) pairs ordered by count, least used first.
sorted_x = list(tagCount.items())
sorted_x.sort(key=operator.itemgetter(1))

In [None]:
# Rebuild the most-used-first order from tagCount instead of reversing in
# place, so re-running this cell cannot flip the list back to ascending.
sorted_x = sorted(tagCount.items(), key=operator.itemgetter(1))[::-1]
len(sorted_x)

In [None]:
10.0*37272/100

### Check whether most tag uses fall on the top 10% of tags

In [None]:
# Top 10% of the (tag, count) pairs; the cut-off follows the number of
# distinct tags instead of a hard-coded index.
tempTags = sorted_x[0:len(sorted_x) // 10]
# Total number of uses of those tags.
tagCountReviews = sum(count for tag, count in tempTags)
# Show a small sample rather than printing every pair.
tempTags[:10]

In [None]:
tagCountReviews

In [None]:
4254002-tagCountReviews

In [None]:
# tag -> use count for the tags selected in tempTags.
top10PercentTags = dict(tempTags)

### The 10% most popular tags account for almost 90% of all tag uses.

### Let's check whether the most popular tags are used on the closed questions.


In [None]:
def averageTagsSpaceUsed(data):
    """
    Sum, over all questions, the fraction of the five tag slots that are filled.

    Each question contributes (number of non-empty Tag1..Tag5 values) / 5.
    Despite the name, the value returned is the sum of those fractions, not
    their mean; divide by the number of questions to get the average.

    Arguments:
        data: Iterable of question dicts with keys 'Tag1' .. 'Tag5'.
    Returns:
        The summed fractions (the int 0 when `data` is empty).
    """
    tagColumns = ('Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5')
    tagSpace = 0
    for question in data:
        filledSlots = sum(1 for column in tagColumns if question[column] != '')
        tagSpace += 1.0*filledSlots/5

    return (tagSpace)

In [None]:
averageTagsSpaceUsed(leData1)/(len(leData1))

In [None]:


closedOwnerCount = defaultdict(int)
closedTagCount = defaultdict(int)
closedOpenStatusCount = defaultdict(int)

tagNotUsed = dataCount(closedOwnerCount,closedTagCount,closedOpenStatusCount,0,closedQuestions)

# dataCount cannot hand its closed-date tally back through an int argument,
# so count the closed questions that carry a PostClosedDate here.
dateClosed = sum(1 for question in closedQuestions if question['PostClosedDate'] != '')

In [None]:
# Empty tag slots summed over all five tag columns of the closed questions.
totalClosedTagNotUsed = sum(tagNotUsed)

totalClosedTagNotUsed

In [None]:
len(closedQuestions)*5

In [None]:
141500 - 69631

In [None]:
# averageTagsSpaceUsed returns a sum; divide by the number of questions so
# this figure is comparable with the one computed for the full dataset.
averageTagsSpaceUsed(closedQuestions)/(len(closedQuestions))

In [None]:
# Distinct tags seen on closed questions that are also among the top tags.
inTopTags = sum(1 for tag in closedTagCount if tag in top10PercentTags)

In [None]:
inTopTags, len(top10PercentTags)

In [None]:
len(closedTagCount)

In [None]:
# Tag vocabulary in the order of sorted_x, and each tag's position in it.
words = [tag for tag, count in sorted_x]

wordId = {word: position for position, word in enumerate(words)}
wordSet = set(words)
wordId

## After observing how important the weights towards the tags were, we can assume that the tags might be a good and important feature to use

### Logistic regression using the tags, reputation and answer count

In [None]:
# #Question 3: Polynomian with ABV ^4
# def feature4(datum):
#   feat = [1]
#   feat.append(datum['beer/ABV'])
#   feat.append(datum['beer/ABV']**(2))
#   feat.append(datum['beer/ABV']**(3))
#   feat.append(datum['beer/ABV']**(4))
#   return feat
#  = ("PostId","PostCreationDate","OwnerUserId","OwnerCreationDate","ReputationAtPostCreation","OwnerUndeletedAnswerCountAtPostTime","Title","BodyMarkdown","Tag1","Tag2","Tag3","Tag4","Tag5","PostClosedDate","OpenStatus")
# def feature(datum):
#   feat = [0]*len(words)
#   r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
#   for w in r.split():
#     if w in words:
#       feat[wordId[w]] += 1
#   feat.append(1) #offset
#   return feat

def feature(datum):
    """
    Build the feature vector for one question.

    The vector has one 0/1 entry per tag in the module-level `words` list
    (positions given by `wordId`), set to 1 for every tag present on the
    question, followed by the question's 'ReputationAtPostCreation' and
    'OwnerUndeletedAnswerCountAtPostTime' values.

    Empty tag slots are skipped, and so are tags that are missing from
    `wordId`, so questions carrying tags outside the vocabulary can still be
    encoded instead of raising KeyError.

    Arguments:
        datum: Question dict with keys 'Tag1' .. 'Tag5',
            'ReputationAtPostCreation' and 'OwnerUndeletedAnswerCountAtPostTime'.
    Returns:
        A list of length len(words) + 2.
    """
    feat = [0]*len(words)

    for column in ('Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5'):
        tag = datum[column]
        if tag == '':
            continue
        position = wordId.get(tag)
        if position is not None:
            feat[position] = 1

    feat.append(datum['ReputationAtPostCreation'])
    feat.append(datum['OwnerUndeletedAnswerCountAtPostTime'])

    return feat

In [None]:
def result(datum):
    """Return the label for one question: 0 when its 'OpenStatus' is 'open', otherwise 1."""
    return 0 if datum['OpenStatus'] == 'open' else 1

In [None]:
feat = feature(leData1[1])
len(feat)

In [None]:
wordId['html'],feat[12]

In [None]:
X = [feature(d) for d in leData1]

In [None]:
y = [result(d) for d in leData1]

In [None]:
len(leData1), len(X), len(y)

In [None]:
# Drop the parsed rows now that X and y exist (presumably to free memory).
# An empty list keeps leData1 the same kind of object it was before.
leData1 = []
len(leData1)

In [None]:
from sklearn import linear_model

In [None]:
clf = linear_model.LogisticRegression()

In [None]:
clf.fit(X,y)

In [None]:
predictions = clf.predict(X)

In [None]:
len(leData1)


In [None]:
# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print(predictions)

In [None]:
# Number of predictions that match the true label at the same position.
countRight = sum(
    1 for index, predicted in enumerate(predictions) if y[index] == predicted
)
countRight

In [None]:
len(predictions)