In [6]:
import os

def split(filehandler, delimiter=',', row_limit=1500000, 
    output_name_template='output_Final%s.csv', output_path='.', keep_headers=True):
    """
    Splits a CSV file into multiple pieces.
    
    A quick bastardization of the Python CSV library.
    Arguments:
        `row_limit`: The number of rows you want in each output file. 10,000 by default.
        `output_name_template`: A %s-style template for the numbered output files.
        `output_path`: Where to stick the output files.
        `keep_headers`: Whether or not to print the headers in each output file.
    Example usage:
    
        >> from toolbox import csv_splitter;
        >> csv_splitter.split(open('/home/ben/input.csv', 'r'));
    
    """
    import csv
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_out_path = os.path.join(
         output_path,
         output_name_template  % current_piece
    )
    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
    current_limit = row_limit
    if keep_headers:
        headers = reader.next()
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = os.path.join(
               output_path,
               output_name_template  % current_piece
            )
            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)

In [None]:
split(open('./train.csv', 'r'))

In [2]:
import csv
import numpy as np

def parseCSVData(file):
    data = []
    rowIndex = 0
    with open(file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        for row in spamreader:
            if rowIndex !=0:
                element = {}
                fieldnames = ("PostId","PostCreationDate","OwnerUserId","OwnerCreationDate","ReputationAtPostCreation","OwnerUndeletedAnswerCountAtPostTime","Title","BodyMarkdown","Tag1","Tag2","Tag3","Tag4","Tag5","PostClosedDate","OpenStatus")
                index = 0
                for i in row:
                    element[fieldnames[index]] = i
                    if index == 0 or index == 5 or index == 4:   
                        element[fieldnames[index]] = int(i)
                    index +=1


                data.append(element)
            else:
                rowIndex = 1
    return data


In [3]:
leData1 = parseCSVData('output_Final1.csv')

In [4]:
len(leData1)

1500000

In [5]:
from collections import defaultdict

def dataCount(ownerCount,tagCount,openStatusCount,dateClosed,data):
    tag1NotUsed = 0
    tag2NotUsed = 0
    tag3NotUsed = 0
    tag4NotUsed = 0
    tag5NotUsed = 0
    for i in data:
        ownerCount[i['OwnerUserId']]+=1
        openStatusCount[i['OpenStatus']]+=1
        if i['Tag1'] != '':
            tagCount[i['Tag1']]+=1
        else:
            tag1NotUsed+=1
            
        if i['Tag2'] != '':
            tagCount[i['Tag2']]+=1
        else:
            tag2NotUsed+=1
            
        if i['Tag3'] != '':
            tagCount[i['Tag3']]+=1
        else:
            tag3NotUsed+=1
            
        if i['Tag4'] != '':
            tagCount[i['Tag4']]+=1
        else:
            tag4NotUsed+=1
        if i['Tag5'] != '':
            tagCount[i['Tag5']]+=1
        else:
            tag5NotUsed+=1

        if i['PostClosedDate']!='':
            dateClosed+=1
        
    dateClosed = dateClosed
    return (tag1NotUsed,tag2NotUsed,tag3NotUsed,tag4NotUsed,tag5NotUsed)

In [6]:
ownerCount = defaultdict(int)
tagCount = defaultdict(int)
openStatusCount = defaultdict(int)
dateClosed = 0


tagNotUsed = dataCount(ownerCount,tagCount,openStatusCount,dateClosed,leData1)

In [117]:
totalTagNotUsed = 0
for i in tagNotUsed:
    totalTagNotUsed+=i

In [119]:
totalTagNotUsed, (1500000)*5 - totalTagNotUsed

(3245998, 4254002)

In [68]:
openStatusCount,dateClosed

(defaultdict(int,
             {'not a real question': 10449,
              'not constructive': 8891,
              'off topic': 6885,
              'open': 1471700,
              'too localized': 2075}),
 0)

In [69]:
totalNotClosed  = 0
closedQuestions = []
for x in openStatusCount:
    if x!= 'open':
        closedQuestions.append(x)
        totalNotClosed += openStatusCount[x]
totalNotClosed,dateClosed

(28300, 0)

In [70]:
totalNotClosed  = 0
closedQuestions = []
for x in leData1:
    if x['OpenStatus']!= 'open':
        closedQuestions.append(x)

In [88]:
maxInt = 0
maxDict = {}
maxDictByUser = defaultdict(int)
for x in closedQuestions:
    i = int(x['ReputationAtPostCreation'])
    maxDictByUser[x['ReputationAtPostCreation']] += 1
    if i>maxInt:
        maxInt = i
        maxDict = x
        
maxInt

123251

In [90]:
countOfReputationsLessThan200 = 0
for x in maxDictByUser:
    if x < 200:
        countOfReputationsLessThan200+=maxDictByUser[x]

In [91]:
countOfReputationsLessThan200

20055

### For most of the closed questions the reputations of the users are below 200 and the max is 123251. 

In [92]:
maxInt = 0
maxDict = {}
maxDictByUser = defaultdict(int)
for x in leData1:
    i = int(x['ReputationAtPostCreation'])
    maxDictByUser[x['ReputationAtPostCreation']] += 1
    if i>maxInt:
        maxInt = i
        maxDict = x
        
maxInt

308032

In [93]:
countOfReputationsLessThan200 = 0
for x in maxDictByUser:
    if x < 200:
        countOfReputationsLessThan200+=maxDictByUser[x]
countOfReputationsLessThan200

922954

### From the overall dataset also most of the users have reputations below 200, 922954/1400000

## Question, can the type of questions by the tag influence if the question is closed or not. Do some users tend to show bad questions than others in specific topics?

In [7]:
tagCount
import operator
sorted_x = sorted(tagCount.items(), key=operator.itemgetter(1))

In [8]:
sorted_x.reverse()
len(sorted_x)

37272

In [130]:
10.0*37272/100

3727.2

### Check if Most of reviews exist on the top 20%

In [131]:
tagCountReviews = 0
tempTags = sorted_x[0:3727]
tempTags
for i in tempTags:
    print i
    tagCountReviews += i[1]

('c#', 152024)
('java', 111581)
('php', 101636)
('javascript', 92341)
('jquery', 82452)
('android', 72665)
('iphone', 64996)
('asp.net', 63703)
('.net', 60992)
('c++', 59454)
('python', 51475)
('mysql', 43203)
('html', 42217)
('sql', 38110)
('objective-c', 36509)
('ruby-on-rails', 35471)
('css', 33271)
('c', 27459)
('wpf', 23494)
('ruby', 22704)
('ios', 21279)
('xml', 19821)
('sql-server', 19271)
('asp.net-mvc', 18513)
('ajax', 17514)
('database', 17306)
('windows', 16516)
('regex', 16190)
('django', 15405)
('linux', 14201)
('xcode', 14059)
('vb.net', 13655)
('arrays', 12615)
('eclipse', 12216)
('wcf', 11552)
('multithreading', 11465)
('silverlight', 11051)
('ruby-on-rails-3', 11040)
('winforms', 11003)
('visual-studio', 10910)
('string', 10694)
('performance', 10682)
('linq', 10462)
('facebook', 10450)
('cocoa', 10376)
('json', 10095)
('visual-studio-2010', 9771)
('flash', 9523)
('web-services', 9482)
('perl', 9474)
('flex', 9413)
('actionscript-3', 9372)
('osx', 9338)
('image', 8795)

In [132]:
tagCountReviews

3840536

In [133]:
4254002-tagCountReviews

413466

In [134]:
top10PercentTags = {}
for tag in tempTags:
    top10PercentTags[tag[0]] = tag[1]

### The 10% most popular tags are used on almost 90% of the reviews.

### Lets check if the most popular tags are used on the closed Questions. 


In [149]:
def averageTagsSpaceUsed(data):
    tagSpace = 0
    for x in data:
        thisQuestions = 0
        if x['Tag1'] != '':
            thisQuestions +=1
        if x['Tag2'] != '':
            thisQuestions +=1
        if x['Tag3'] != '':
            thisQuestions +=1
            
        if x['Tag4'] != '':
            thisQuestions +=1
        if x['Tag5'] != '':
            thisQuestions +=1
        
        tagSpace += 1.0*thisQuestions/5
    
    return (tagSpace)

In [151]:
averageTagsSpaceUsed(leData1)/(len(leData1))

0.5672002666670607

In [137]:


closedOwnerCount = defaultdict(int)
closedTagCount = defaultdict(int)
closedOpenStatusCount = defaultdict(int)
dateClosed = 0


tagNotUsed = dataCount(closedOwnerCount,closedTagCount,closedOpenStatusCount,dateClosed,closedQuestions)

In [142]:
totalClosedTagNotUsed = 0
for i in tagNotUsed:
    totalClosedTagNotUsed+=i
    
totalClosedTagNotUsed

69631

In [143]:
len(closedQuestions)*5

141500

In [144]:
141500 - 69631

71869

In [147]:
averageTagsSpaceUsed(closedQuestions)

0.5079081272084743

In [153]:
inTopTags  = 0
for i in closedTagCount:
    if i in top10PercentTags:
        inTopTags+=1

In [155]:
inTopTags, len(top10PercentTags)

(3133, 3727)

In [156]:
len(closedTagCount)

8168

In [9]:
sorted_x
words = [x[0] for x in sorted_x]

wordId = dict(zip(words, range(len(words))))
wordSet = set(words)
wordId

{'mdbg': 22779,
 'raining': 33014,
 'screen-resolution': 3434,
 'userscripts': 4033,
 'netcdf': 22777,
 'ssms-2005': 22776,
 'hanging': 8318,
 'ssms-2008': 20840,
 'database-installation': 25207,
 'localized': 16080,
 'windows-ce': 934,
 'errortext': 37271,
 'coreanimation': 9423,
 'diplomatic-terminology': 37270,
 'storing-data': 15087,
 'xunit': 4848,
 'sqlmanagmentstudio': 22775,
 'embedded-code': 16851,
 'dataflowtask': 15086,
 'boost.mpi': 37269,
 'numeric-analysis': 37268,
 'browser-compatibility': 1010,
 'mdb2': 8523,
 'war-files': 19253,
 'fractal': 15897,
 'logical-exclusive-or': 37267,
 'time-trigger': 37266,
 'rabbitmq': 1729,
 'riatest': 37265,
 'old-software': 28682,
 'eofexception': 13104,
 'e-notices': 25206,
 'uisearchbardisplaycontrol': 20832,
 'google-caja': 28680,
 'dynamics-crm': 1334,
 'feasibility': 17910,
 'rpsec': 16850,
 'repoze.who': 16849,
 'post-grad': 26923,
 'equestedith': 37264,
 'errors': 12578,
 'tiered': 35902,
 'usenet': 22774,
 'sequencial': 28683,
 

## After observing how important were th weights towards the tags, we can assume that the tag might be a good and important use 

### Linear Regressor using the tags, title and Reputation and postCount

In [10]:
# #Question 3: Polynomian with ABV ^4
# def feature4(datum):
#   feat = [1]
#   feat.append(datum['beer/ABV'])
#   feat.append(datum['beer/ABV']**(2))
#   feat.append(datum['beer/ABV']**(3))
#   feat.append(datum['beer/ABV']**(4))
#   return feat
#  = ("PostId","PostCreationDate","OwnerUserId","OwnerCreationDate","ReputationAtPostCreation","OwnerUndeletedAnswerCountAtPostTime","Title","BodyMarkdown","Tag1","Tag2","Tag3","Tag4","Tag5","PostClosedDate","OpenStatus")
# def feature(datum):
#   feat = [0]*len(words)
#   r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
#   for w in r.split():
#     if w in words:
#       feat[wordId[w]] += 1
#   feat.append(1) #offset
#   return feat

def feature(datum):
    feat  = [0]*len(words)
#     feat.append(datum['ReputationAtPostCreation'])
#     feat.append(datum['OwnerUndeletedAnswerCountAtPostTime'])
#     Adding the tags as features
    if datum['Tag1']!='':
        feat[wordId[datum['Tag1']]] = 1
    if datum['Tag2']!='':
        feat[wordId[datum['Tag2']]] = 1
    if datum['Tag3']!='':
        feat[wordId[datum['Tag3']]] = 1
    if datum['Tag4']!='':
        feat[wordId[datum['Tag4']]] = 1
    if datum['Tag5']!='':
        feat[wordId[datum['Tag5']]] = 1
    
    feat.append(datum['ReputationAtPostCreation'])
    feat.append(datum['OwnerUndeletedAnswerCountAtPostTime'])
    
    return feat

In [11]:
def result(datum):
    if datum['OpenStatus']!='open':
        return 1
    
    return 0

In [12]:
feat = feature(leData1[1])
len(feat)

37274

In [14]:
wordId['html'],feat[12]

(12, 1)

In [None]:
X = [feature(d) for d in leData1]