In [6]:
import os

def split(filehandler, delimiter=',', row_limit=1500000, 
    output_name_template='output_Final%s.csv', output_path='.', keep_headers=True):
    """
    Splits a CSV file into multiple pieces.
    
    A quick bastardization of the Python CSV library.
    Arguments:
        `row_limit`: The number of rows you want in each output file. 10,000 by default.
        `output_name_template`: A %s-style template for the numbered output files.
        `output_path`: Where to stick the output files.
        `keep_headers`: Whether or not to print the headers in each output file.
    Example usage:
    
        >> from toolbox import csv_splitter;
        >> csv_splitter.split(open('/home/ben/input.csv', 'r'));
    
    """
    import csv
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_out_path = os.path.join(
         output_path,
         output_name_template  % current_piece
    )
    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
    current_limit = row_limit
    if keep_headers:
        headers = reader.next()
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = os.path.join(
               output_path,
               output_name_template  % current_piece
            )
            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)

In [None]:
split(open('./train.csv', 'r'))

In [1]:
import csv

def parseCSVData(file):
    data = []
    rowIndex = 0
    with open(file, 'rb') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        for row in spamreader:
            if rowIndex !=0:
                element = {}
                fieldnames = ("PostId","PostCreationDate","OwnerUserId","OwnerCreationDate","ReputationAtPostCreation","OwnerUndeletedAnswerCountAtPostTime","Title","BodyMarkdown","Tag1","Tag2","Tag3","Tag4","Tag5","PostClosedDate","OpenStatus")
                index = 0
                for i in row:
                    element[fieldnames[index]] = i
                    if index == 0 or index == 5:   
                        element[fieldnames[index]] = int(i)
                    index +=1


                data.append(element)
            else:
                rowIndex = 1
    return data


In [25]:
leData1 = parseCSVData('output_Final3.csv')

In [26]:
len(leData1)

664856

In [27]:
from collections import defaultdict

def dataCount(ownerCount,tagCount,openStatusCount,dateClosed):
    for i in leData1:
        ownerCount[i['OwnerUserId']]+=1
        openStatusCount[i['OpenStatus']]+=1
        if i['Tag1'] != '':
            tagCount[i['Tag1']]+=1
        if i['Tag2'] != '':
            tagCount[i['Tag2']]+=1
        if i['Tag3'] != '':
            tagCount[i['Tag3']]+=1
        if i['Tag4'] != '':
            tagCount[i['Tag4']]+=1
        if i['Tag5'] != '':
            tagCount[i['Tag5']]+=1

        if i['PostClosedDate']!='':
            dateClosed+=1
        
    dateClosed = dateClosed
    

In [28]:
ownerCount = defaultdict(int)
tagCount = defaultdict(int)
openStatusCount = defaultdict(int)
dateClosed = 0

dataCount(ownerCount,tagCount,openStatusCount,dateClosed)

In [29]:
openStatusCount,dateClosed

(defaultdict(int,
             {'not a real question': 10369,
              'not constructive': 4165,
              'off topic': 5215,
              'open': 642309,
              'too localized': 2798}),
 0)

In [30]:
totalNotClosed  = 0
for x in openStatusCount:
    if x!= 'open':
        totalNotClosed += openStatusCount[x]
totalNotClosed,dateClosed

(22547, 0)

38329,22547